#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "math_constants.h"
#include "ImageIO.h"

#include <cstdio>
#include <cstdlib>
#include <vector>

using namespace std;

#define N 512
#define BLOCKDIM 32
#define PI 3.1415926535897932

// Evaluates a CUDA runtime call and terminates the program with a diagnostic
// (file, line, error string) if the call did not return cudaSuccess.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cudaCheckErr_ = (call);                                   \
        if (cudaCheckErr_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(cudaCheckErr_));                       \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

__global__ void kernel(unsigned char* result, int w, int h);

// Renders an N x N, 4-bytes-per-pixel image on the GPU, copies it back to the
// host and hands it to writeRGBImageToFile as "image.png".
int main(int argc, char** argv)
{
    const size_t numBytes = static_cast<size_t>(N) * N * 4 * sizeof(unsigned char);

    // Host-side pixel buffer; owned by the vector so it is released on every
    // exit path.
    vector<unsigned char> result(numBytes);

    unsigned char* dev_result = NULL;
    CUDA_CHECK(cudaMalloc((void**)&dev_result, numBytes));

    // One thread per pixel; the grid is rounded up so it covers the whole image.
    const dim3 block(BLOCKDIM, BLOCKDIM, 1);
    const dim3 grid((N + BLOCKDIM - 1) / BLOCKDIM,
                    (N + BLOCKDIM - 1) / BLOCKDIM,
                    1);

    kernel<<<grid, block>>>(dev_result, N, N);
    // A kernel launch returns no status itself; configuration errors are
    // reported through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy on the default stream: it waits for the kernel and also
    // surfaces any error raised while the kernel was executing.
    CUDA_CHECK(cudaMemcpy(&result[0], dev_result, numBytes, cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaFree(dev_result));

    writeRGBImageToFile("image.png", &result[0], N, N);

    return 0;
}

// Fills a w x h image (4 bytes per pixel, row-major) with a per-block pattern:
// every block computes 255 * sin(pi * tx / blockDim.x) * sin(pi * ty / blockDim.y)
// into a shared-memory tile, and each thread then writes the transposed tile
// entry to bytes 0..2 of its pixel and 255 to byte 3.
//
// Launch requirements:
//   - 2D blocks with blockDim.x <= BLOCKDIM and blockDim.y <= BLOCKDIM (the
//     tile is statically sized), and blockDim.x == blockDim.y so that the
//     transposed read only touches cells that were written.
//   - A grid that covers at least w x h threads; surplus threads are masked.
//   - result must point to at least w * h * 4 bytes of device memory.
__global__ void kernel(unsigned char* result, int w, int h)
{
    // Inner dimension is padded by one float so that the column-wise write
    // below does not map a whole warp onto a single shared-memory bank.
    __shared__ float shared[BLOCKDIM][BLOCKDIM + 1];

    const int x = blockDim.x * blockIdx.x + threadIdx.x;
    const int y = blockDim.y * blockIdx.y + threadIdx.y;

    // Every thread of the block fills its tile cell, including threads that lie
    // outside the image: their cells may be read by in-range threads below.
    // CUDART_PI_F keeps the argument computation in single precision.
    shared[threadIdx.x][threadIdx.y] =
        255 * sinf(threadIdx.x * CUDART_PI_F / blockDim.x)
            * sinf(threadIdx.y * CUDART_PI_F / blockDim.y);

    // Each thread reads a cell written by a different thread, so the whole
    // block must pass this barrier first. It is placed before the bounds check
    // so that no thread can skip it.
    __syncthreads();

    if (x >= w || y >= h)
        return;

    const unsigned char value = shared[threadIdx.y][threadIdx.x];

    // Byte offset of this thread's pixel; widened before multiplying so large
    // images cannot overflow int.
    const size_t tid = (static_cast<size_t>(y) * w + x) * 4;

    result[tid + 0] = value;
    result[tid + 1] = value;
    result[tid + 2] = value;
    result[tid + 3] = 255;
}