#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cstdio>
#include <cstdlib>
#include "BitmapStruc.h"
#include <cmath>

#define DIM 512
#define DIM_BLOCK 16
#define PI 3.1415926535897932

// Evaluates a CUDA runtime call; on failure prints the location and the
// runtime's error string to stderr, then aborts the process.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            abort();                                                          \
        }                                                                     \
    } while (0)

/*
 * Fills a 4-bytes-per-pixel image with a repeating greyscale pattern.
 *
 * Each block builds a DIM_BLOCK x DIM_BLOCK tile in shared memory holding
 * 255 * sin(pi * tx / DIM_BLOCK) * sin(pi * ty / DIM_BLOCK), then every thread
 * writes the value stored at the point-mirrored tile position to its pixel.
 * The first three bytes of a pixel receive that value, the fourth is 255.
 *
 * Parameters:
 *   ptr - device buffer, at least 4 * w * h bytes
 *   w   - image width in pixels (also the row stride, in pixels)
 *   h   - image height in pixels
 *
 * Launch requirements: 2D blocks of exactly DIM_BLOCK x DIM_BLOCK threads
 * (the static tile and the mirror indexing assume this), and a 2D grid that
 * covers w x h. Uses DIM_BLOCK * DIM_BLOCK floats of static shared memory.
 */
__global__ void Kernel(unsigned char* ptr, int w, int h)
{
    // Integer pixel coordinates: float indices stop being exact past 2^24.
    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;

    __shared__ float shared[DIM_BLOCK][DIM_BLOCK];

    // Keep the trigonometry in single precision (PI is a double literal).
    const float pi = static_cast<float>(PI);

    // Every thread fills its tile entry, including threads that fall outside
    // the image: an in-range thread may read the entry of an out-of-range one.
    // threadIdx.x is the fastest-varying (second) index so that a warp touches
    // consecutive shared-memory words.
    shared[threadIdx.y][threadIdx.x] =
        255.0f * sinf(threadIdx.x * pi / DIM_BLOCK)
               * sinf(threadIdx.y * pi / DIM_BLOCK);

    // The barrier must be reached by all threads of the block, so the bounds
    // check is deliberately placed after it.
    __syncthreads();

    if (x >= w || y >= h)
        return;

    // size_t keeps 4 * index from overflowing on large images.
    const size_t index = static_cast<size_t>(y) * w + x;

    // Value written by the thread at the point-mirrored position in the tile.
    const unsigned char value = static_cast<unsigned char>(
        shared[DIM_BLOCK - 1 - threadIdx.y][DIM_BLOCK - 1 - threadIdx.x]);

    ptr[4 * index + 0] = value;
    ptr[4 * index + 1] = value;
    ptr[4 * index + 2] = value;
    ptr[4 * index + 3] = 255;
}

/*
 * Renders the pattern into a DIM x DIM bitmap on the GPU, copies it back to
 * the host image and displays it. Any CUDA failure aborts with a message.
 */
int main()
{
    BitmapStruct image(DIM, DIM);

    // NOTE(review): image.image_size() is assumed to be at least
    // 4 * DIM * DIM bytes, matching what the kernel writes - confirm against
    // BitmapStruc.h.
    unsigned char* dev_ptr = nullptr;
    CUDA_CHECK(cudaMalloc((void**)&dev_ptr, image.image_size()));

    // Ceil-divide so the grid covers the image even if DIM is not a multiple
    // of DIM_BLOCK.
    dim3 grid((DIM + DIM_BLOCK - 1) / DIM_BLOCK, (DIM + DIM_BLOCK - 1) / DIM_BLOCK);
    dim3 block(DIM_BLOCK, DIM_BLOCK);

    Kernel<<<grid, block>>>(dev_ptr, DIM, DIM);
    // Launch-configuration errors are only reported through the last-error state.
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: waits for the kernel and surfaces any execution error.
    CUDA_CHECK(cudaMemcpy(image.pixels, dev_ptr, image.image_size(), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(dev_ptr));

    image.displayImage();

    return 0;
}