《GPU高性能编程CUDA实战》学习笔记(七)


第七章 纹理内存

Texture Memory, 与常量内存一样,纹理内存是另一种只读内存,在特定的访问模式中,纹理内存同样能提升性能并减少内存流量。

7.1 本章目标

  • 了解纹理内存的性能特性
  • 了解如何在CUDA C中使用一维纹理内存
  • 了解如何在CUDA C中使用二维纹理内存

7.2 纹理内存简介

虽然NVIDIA为OpenGL和DirectX等的渲染流水线都设计了纹理单元,但纹理内存具备的一些属性使其在计算中变得非常有用。
与常量内存类似的是,纹理内存同样缓存在芯片上,因此在某些情况中,它能减少对内存的请求并提供更高效率的内存带宽。纹理内存是专门为那些在内存访问模式中存在大量空间局部性(Spatial Locality)的图形应用程序而设计的。在某个计算应用程序中,这意味着一个线程读取的位置可能与邻近线程读取的位置“非常接近”。

7.3 热传导模拟

物理模拟问题,这类问题通常在计算精度计算复杂性上存在着某种权衡。

7.3.1 简单的传热模型

构造一个简单的二维热传导模拟。首先假设有一个矩形房间,将其分成一个格网。在格网中随机散布一些“热源”,它们有着不同的固定温度。
我们可以计算格网中每个单元的温度随时间的变化情况。为了简单,假设热源单元本身的温度将保持不变。在时间递进的每个步骤中,我们假设热量在某个单元及其邻接单元之间“流动”。如果某个单元的邻接单元的温度更高,那么热量将从邻接单元传导到该单元。相反地,如果某个单元的邻接单元的温度更高,那么它将变冷。

在热传导模型中,单元新温度计算方法:将单元邻接单元的温差相加起来,然后加上原有温度,公式如下,
常量k: 模拟过程中热量的流动速率。k值越大,表示系统会更快的达到稳定温度,否则越慢。我们只考虑4个邻接单元(上, 下, 左, 右)并且等式中的k和Told都是常数,因此等式7.1展开后如等式7.2所示。
我们来看看GPU是如何实现等式7.2的更新。

7.3.2 温度更新的计算

首先给出更新流程的基本介绍:1) 给定一个包含初始输入温度的格网,将其作为热源的单元温度值复制到格网相应的单元中。这将覆盖这些单元之前计算的温度,因此也就确保了“加热单元将保持恒温”这个条件。这个复制操作是在copy_const_kernel()中执行的。2)给定一个输入温度格网,根据等式7.2中的更新公式计算输出温度格网。这个更新操作是在blend_kernel()中执行的。3)将输入温度格网和输出温度格网交换,为下一个步骤的计算做好准备。当模拟下一个时间步时,在步骤2中计算得到得到输出温度格网将成为步骤1中的输入温度格网。
我们假设已经获得了一个格网,格网中大多数单元的温度值都是0,但有些单元包含了非0的温度值,这些单元就是拥有固定温度的热源。在模拟过程中,缓冲区中的这些常量值不会发生变化,并且在每个时间步中读取。
__global__ void copy_const_kernel( float *iptr, const float* cptr ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;

if (cptr[offset] != 0)iptr[offset] = cptr[offset];
}

__global__ void blend_kernel( float *outSrc,
const float *inSrc ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;

int left = offset - 1;
int right = offset + 1;
if (x == 0) left++;
if (x == DIM-1) right--;

int top = offset - DIM;
int bottom = offset + DIM;
if (y == 0) top += DIM;
if (y == DIM-1) bottom -= DIM;

outSrc[offset] = inSrc[offset] + SPEED * (inSrc[top] +
inSrc[bottom] + inSrc[left] + inSrc[right] -
inSrc[offset]*4);
}

7.3.3 模拟过程动态演示

#include "cuda.h"
#include "../common/book.h"
#include "../common/cpu_anim.h"

#define DIM 1024
#define PI 3.1415926535897932f
#define MAX_TEMP 1.0f
#define MIN_TEMP 0.0001f
#define SPEED 0.25f

// these exist on the GPU side
texture<float> texConstSrc;
texture<float> texIn;
texture<float> texOut;



// this kernel takes in a 2-d array of floats
// it updates the value-of-interest by a scaled value based
// on itself and its nearest neighbors
__global__ void blend_kernel( float *dst,
bool dstOut ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;

int left = offset - 1;
int right = offset + 1;
if (x == 0) left++;
if (x == DIM-1) right--;

int top = offset - DIM;
int bottom = offset + DIM;
if (y == 0) top += DIM;
if (y == DIM-1) bottom -= DIM;

float t, l, c, r, b;
if (dstOut) {
t = tex1Dfetch(texIn,top);
l = tex1Dfetch(texIn,left);
c = tex1Dfetch(texIn,offset);
r = tex1Dfetch(texIn,right);
b = tex1Dfetch(texIn,bottom);

} else {
t = tex1Dfetch(texOut,top);
l = tex1Dfetch(texOut,left);
c = tex1Dfetch(texOut,offset);
r = tex1Dfetch(texOut,right);
b = tex1Dfetch(texOut,bottom);
}
dst[offset] = c + SPEED * (t + b + r + l - 4 * c);
}

// NOTE - texOffsetConstSrc could either be passed as a
// parameter to this function, or passed in __constant__ memory
// if we declared it as a global above, it would be
// a parameter here:
// __global__ void copy_const_kernel( float *iptr,
// size_t texOffset )
__global__ void copy_const_kernel( float *iptr ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;

float c = tex1Dfetch(texConstSrc,offset);
if (c != 0)
iptr[offset] = c;
}

// globals needed by the update routine
struct DataBlock {
unsigned char *output_bitmap;
float *dev_inSrc;
float *dev_outSrc;
float *dev_constSrc;
CPUAnimBitmap *bitmap;

cudaEvent_t start, stop;
float totalTime;
float frames;
};

void anim_gpu( DataBlock *d, int ticks ) {
HANDLE_ERROR( cudaEventRecord( d->start, 0 ) );
dim3 blocks(DIM/16,DIM/16);
dim3 threads(16,16);
CPUAnimBitmap *bitmap = d->bitmap;

// since tex is global and bound, we have to use a flag to
// select which is in/out per iteration
volatile bool dstOut = true;
for (int i=0; i<90; i++) {
float *in, *out;
if (dstOut) {
in = d->dev_inSrc;
out = d->dev_outSrc;
} else {
out = d->dev_inSrc;
in = d->dev_outSrc;
}
copy_const_kernel<<<blocks,threads>>>( in );
blend_kernel<<<blocks,threads>>>( out, dstOut );
dstOut = !dstOut;
}
float_to_color<<<blocks,threads>>>( d->output_bitmap,
d->dev_inSrc );

HANDLE_ERROR( cudaMemcpy( bitmap->get_ptr(),
d->output_bitmap,
bitmap->image_size(),
cudaMemcpyDeviceToHost ) );

HANDLE_ERROR( cudaEventRecord( d->stop, 0 ) );
HANDLE_ERROR( cudaEventSynchronize( d->stop ) );
float elapsedTime;
HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
d->start, d->stop ) );
d->totalTime += elapsedTime;
++d->frames;
printf( "Average Time per frame: %3.1f ms\n",
d->totalTime/d->frames );
}

// clean up memory allocated on the GPU
void anim_exit( DataBlock *d ) {
cudaUnbindTexture( texIn );
cudaUnbindTexture( texOut );
cudaUnbindTexture( texConstSrc );
HANDLE_ERROR( cudaFree( d->dev_inSrc ) );
HANDLE_ERROR( cudaFree( d->dev_outSrc ) );
HANDLE_ERROR( cudaFree( d->dev_constSrc ) );

HANDLE_ERROR( cudaEventDestroy( d->start ) );
HANDLE_ERROR( cudaEventDestroy( d->stop ) );
}


int main( void ) {
DataBlock data;
CPUAnimBitmap bitmap( DIM, DIM, &data );
data.bitmap = &bitmap;
data.totalTime = 0;
data.frames = 0;
HANDLE_ERROR( cudaEventCreate( &data.start ) );
HANDLE_ERROR( cudaEventCreate( &data.stop ) );

int imageSize = bitmap.image_size();

HANDLE_ERROR( cudaMalloc( (void**)&data.output_bitmap,
imageSize ) );

// assume float == 4 chars in size (ie rgba)
HANDLE_ERROR( cudaMalloc( (void**)&data.dev_inSrc,
imageSize ) );
HANDLE_ERROR( cudaMalloc( (void**)&data.dev_outSrc,
imageSize ) );
HANDLE_ERROR( cudaMalloc( (void**)&data.dev_constSrc,
imageSize ) );

HANDLE_ERROR( cudaBindTexture( NULL, texConstSrc,
data.dev_constSrc,
imageSize ) );

HANDLE_ERROR( cudaBindTexture( NULL, texIn,
data.dev_inSrc,
imageSize ) );

HANDLE_ERROR( cudaBindTexture( NULL, texOut,
data.dev_outSrc,
imageSize ) );

// intialize the constant data
float *temp = (float*)malloc( imageSize );
for (int i=0; i<DIM*DIM; i++) {
temp[i] = 0;
int x = i % DIM;
int y = i / DIM;
if ((x>300) && (x<600) && (y>310) && (y<601))
temp[i] = MAX_TEMP;
}
temp[DIM*100+100] = (MAX_TEMP + MIN_TEMP)/2;
temp[DIM*700+100] = MIN_TEMP;
temp[DIM*300+300] = MIN_TEMP;
temp[DIM*200+700] = MIN_TEMP;
for (int y=800; y<900; y++) {
for (int x=400; x<500; x++) {
temp[x+y*DIM] = MIN_TEMP;
}
}
HANDLE_ERROR( cudaMemcpy( data.dev_constSrc, temp,
imageSize,
cudaMemcpyHostToDevice ) );

// initialize the input data
for (int y=800; y<DIM; y++) {
for (int x=0; x<200; x++) {
temp[x+y*DIM] = MAX_TEMP;
}
}
HANDLE_ERROR( cudaMemcpy( data.dev_inSrc, temp,
imageSize,
cudaMemcpyHostToDevice ) );
free( temp );

bitmap.anim_and_exit( (void (*)(void*,int))anim_gpu,
(void (*)(void*))anim_exit );
}
90: 实验得出的,能避免在每隔时间步中都需复制一张位图图片,又可避免在每一帧计算过多时间步。

7.3.4 使用纹理内存

浮点类型纹理的引用
// these exist on the GPU side
texture<float> texConstSrc;
texture<float> texIn;
texture<float> texOut;

将这些变量绑定到内存缓冲区,
  • 将指定的缓冲区作为纹理来使用。
  • 将纹理引用作为纹理的“名字”
分配内存
    HANDLE_ERROR( cudaMalloc( (void**)&data.dev_inSrc,
imageSize ) );
HANDLE_ERROR( cudaMalloc( (void**)&data.dev_outSrc,
imageSize ) );
HANDLE_ERROR( cudaMalloc( (void**)&data.dev_constSrc,
imageSize ) );
绑定到内存
HANDLE_ERROR( cudaBindTexture( NULL, texConstSrc,
data.dev_constSrc,
imageSize ) );

HANDLE_ERROR( cudaBindTexture( NULL, texIn,
data.dev_inSrc,
imageSize ) );

HANDLE_ERROR( cudaBindTexture( NULL, texOut,
data.dev_outSrc,
imageSize ) );
tex1Dfetch(), 告诉GPU将读取请求转发到纹理内存而不是标准全局内存,它是一个编译器内置函数(Intrinsic)。
纹理引用必须声明为文件作用域内的全局变量,

释放缓存
// clean up memory allocated on the GPU
void anim_exit( DataBlock *d ) {
cudaUnbindTexture( texIn );
cudaUnbindTexture( texOut );
cudaUnbindTexture( texConstSrc );
HANDLE_ERROR( cudaFree( d->dev_inSrc ) );
HANDLE_ERROR( cudaFree( d->dev_outSrc ) );
HANDLE_ERROR( cudaFree( d->dev_constSrc ) );

HANDLE_ERROR( cudaEventDestroy( d->start ) );
HANDLE_ERROR( cudaEventDestroy( d->stop ) );
}

7.3.5 使用二维纹理内存

声明二维纹理引用
// these exist on the GPU side
texture<float,2> texConstSrc;
texture<float,2> texIn;
texture<float,2> texOut;

全部代码如下,
#include "cuda.h"
#include "../common/book.h"
#include "../common/cpu_anim.h"

#define DIM 1024
#define PI 3.1415926535897932f
#define MAX_TEMP 1.0f
#define MIN_TEMP 0.0001f
#define SPEED 0.25f

// these exist on the GPU side
texture<float,2> texConstSrc;
texture<float,2> texIn;
texture<float,2> texOut;

__global__ void blend_kernel( float *dst,
bool dstOut ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;

float t, l, c, r, b;
if (dstOut) {
t = tex2D(texIn,x,y-1);
l = tex2D(texIn,x-1,y);
c = tex2D(texIn,x,y);
r = tex2D(texIn,x+1,y);
b = tex2D(texIn,x,y+1);
} else {
t = tex2D(texOut,x,y-1);
l = tex2D(texOut,x-1,y);
c = tex2D(texOut,x,y);
r = tex2D(texOut,x+1,y);
b = tex2D(texOut,x,y+1);
}
dst[offset] = c + SPEED * (t + b + r + l - 4 * c);
}

__global__ void copy_const_kernel( float *iptr ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;

float c = tex2D(texConstSrc,x,y);
if (c != 0)
iptr[offset] = c;
}

// globals needed by the update routine
struct DataBlock {
unsigned char *output_bitmap;
float *dev_inSrc;
float *dev_outSrc;
float *dev_constSrc;
CPUAnimBitmap *bitmap;

cudaEvent_t start, stop;
float totalTime;
float frames;
};

void anim_gpu( DataBlock *d, int ticks ) {
HANDLE_ERROR( cudaEventRecord( d->start, 0 ) );
dim3 blocks(DIM/16,DIM/16);
dim3 threads(16,16);
CPUAnimBitmap *bitmap = d->bitmap;

// since tex is global and bound, we have to use a flag to
// select which is in/out per iteration
volatile bool dstOut = true;
for (int i=0; i<90; i++) {
float *in, *out;
if (dstOut) {
in = d->dev_inSrc;
out = d->dev_outSrc;
} else {
out = d->dev_inSrc;
in = d->dev_outSrc;
}
copy_const_kernel<<<blocks,threads>>>( in );
blend_kernel<<<blocks,threads>>>( out, dstOut );
dstOut = !dstOut;
}
float_to_color<<<blocks,threads>>>( d->output_bitmap,
d->dev_inSrc );

HANDLE_ERROR( cudaMemcpy( bitmap->get_ptr(),
d->output_bitmap,
bitmap->image_size(),
cudaMemcpyDeviceToHost ) );

HANDLE_ERROR( cudaEventRecord( d->stop, 0 ) );
HANDLE_ERROR( cudaEventSynchronize( d->stop ) );
float elapsedTime;
HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
d->start, d->stop ) );
d->totalTime += elapsedTime;
++d->frames;
printf( "Average Time per frame: %3.1f ms\n",
d->totalTime/d->frames );
}

// clean up memory allocated on the GPU
void anim_exit( DataBlock *d ) {
cudaUnbindTexture( texIn );
cudaUnbindTexture( texOut );
cudaUnbindTexture( texConstSrc );
HANDLE_ERROR( cudaFree( d->dev_inSrc ) );
HANDLE_ERROR( cudaFree( d->dev_outSrc ) );
HANDLE_ERROR( cudaFree( d->dev_constSrc ) );

HANDLE_ERROR( cudaEventDestroy( d->start ) );
HANDLE_ERROR( cudaEventDestroy( d->stop ) );
}


int main( void ) {
DataBlock data;
CPUAnimBitmap bitmap( DIM, DIM, &data );
data.bitmap = &bitmap;
data.totalTime = 0;
data.frames = 0;
HANDLE_ERROR( cudaEventCreate( &data.start ) );
HANDLE_ERROR( cudaEventCreate( &data.stop ) );

int imageSize = bitmap.image_size();

HANDLE_ERROR( cudaMalloc( (void**)&data.output_bitmap,
imageSize ) );

// assume float == 4 chars in size (ie rgba)
HANDLE_ERROR( cudaMalloc( (void**)&data.dev_inSrc,
imageSize ) );
HANDLE_ERROR( cudaMalloc( (void**)&data.dev_outSrc,
imageSize ) );
HANDLE_ERROR( cudaMalloc( (void**)&data.dev_constSrc,
imageSize ) );

cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
HANDLE_ERROR( cudaBindTexture2D( NULL, texConstSrc,
data.dev_constSrc,
desc, DIM, DIM,
sizeof(float) * DIM ) );

HANDLE_ERROR( cudaBindTexture2D( NULL, texIn,
data.dev_inSrc,
desc, DIM, DIM,
sizeof(float) * DIM ) );

HANDLE_ERROR( cudaBindTexture2D( NULL, texOut,
data.dev_outSrc,
desc, DIM, DIM,
sizeof(float) * DIM ) );

// initialize the constant data
float *temp = (float*)malloc( imageSize );
for (int i=0; i<DIM*DIM; i++) {
temp[i] = 0;
int x = i % DIM;
int y = i / DIM;
if ((x>300) && (x<600) && (y>310) && (y<601))
temp[i] = MAX_TEMP;
}
temp[DIM*100+100] = (MAX_TEMP + MIN_TEMP)/2;
temp[DIM*700+100] = MIN_TEMP;
temp[DIM*300+300] = MIN_TEMP;
temp[DIM*200+700] = MIN_TEMP;
for (int y=800; y<900; y++) {
for (int x=400; x<500; x++) {
temp[x+y*DIM] = MIN_TEMP;
}
}
HANDLE_ERROR( cudaMemcpy( data.dev_constSrc, temp,
imageSize,
cudaMemcpyHostToDevice ) );

// initialize the input data
for (int y=800; y<DIM; y++) {
for (int x=0; x<200; x++) {
temp[x+y*DIM] = MAX_TEMP;
}
}
HANDLE_ERROR( cudaMemcpy( data.dev_inSrc, temp,
imageSize,
cudaMemcpyHostToDevice ) );
free( temp );

bitmap.anim_and_exit( (void (*)(void*,int))anim_gpu,
(void (*)(void*))anim_exit );
}


tex2D() : 不需要担心溢出问题,代码更加简单
cudaCreateChannelDesc() : 对通道格式描述符(Channel Format Description)
取消绑定纹理引用的函数一样。
如果使用纹理采样器(Texture Sampler)自动执行某种转换,还会有额外的加速。
智能推荐

注意!

本站转载的文章为个人学习借鉴使用,本站对版权不负任何法律责任。如果侵犯了您的隐私权益,请联系我们删除。



 
© 2014-2019 ITdaan.com 粤ICP备14056181号  

赞助商广告