#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <cuda.h>
#include "BitmapStruc.h"

#include <stdio.h>

// Value passed for both dimensions to the BitmapStruct constructor and to the kernel as w and h.
#define DIM 512
// Side length of the square thread block and of the kernel's shared-memory tile.
#define DIM_BLOCK 16

// Pi. The literal carries no 'f' suffix, so arithmetic that uses it is carried out in double precision.
#define PI 3.1415926535897932

/*
 * Kernel - fills a w x h image with a greyscale pattern that repeats per block.
 *
 * Each thread stores 255 * sin(pi * tx / DIM_BLOCK) * sin(pi * ty / DIM_BLOCK)
 * into a shared tile, then writes the tile entry mirrored in both axes to its
 * own pixel.
 *
 * Output layout: four bytes per pixel at ptr[4 * (y * w + x)] - three equal
 * intensity bytes followed by 255 (presumably RGBA with opaque alpha).
 *
 * Parameters:
 *   ptr - device buffer of at least 4 * w * h bytes
 *   w   - image width in pixels
 *   h   - image height in pixels
 *
 * Launch requirements: a 2D grid covering the image with blocks of exactly
 * DIM_BLOCK x DIM_BLOCK threads. The tile is indexed directly by threadIdx,
 * so a larger block would index outside it and a smaller one would read
 * entries nobody wrote. Uses DIM_BLOCK * DIM_BLOCK floats of static shared
 * memory.
 */
__global__ void Kernel(unsigned char* ptr, int w, int h)
{
	// Integer pixel coordinates: float coordinates stop being exact above 2^24.
	const int x = blockDim.x * blockIdx.x + threadIdx.x;
	const int y = blockDim.y * blockIdx.y + threadIdx.y;

	__shared__ float shared[DIM_BLOCK][DIM_BLOCK];

	// The tile value depends only on threadIdx, so every thread of the block
	// fills its entry - including threads whose pixel lies outside the image.
	// PI is a double literal; the argument is intentionally still evaluated in
	// double and narrowed by sinf so the produced pixel values stay unchanged.
	shared[threadIdx.x][threadIdx.y] = 255 * sinf(threadIdx.x * PI / DIM_BLOCK) * sinf(threadIdx.y * PI / DIM_BLOCK);

	// Must be reached by all threads of the block, hence the bounds check
	// comes after the barrier rather than before it.
	__syncthreads();

	if (x >= w || y >= h)
		return;

	// Widen before multiplying so the byte offset cannot overflow int.
	const size_t index = (size_t)y * w + x;

	// Read the entry written by the thread mirrored in both axes.
	const unsigned char grey = shared[DIM_BLOCK - 1 - threadIdx.x][DIM_BLOCK - 1 - threadIdx.y];

	ptr[4 * index + 0] = grey;
	ptr[4 * index + 1] = grey;
	ptr[4 * index + 2] = grey;
	ptr[4 * index + 3] = 255;
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaCallSucceeded(cudaError_t status, const char* what)
{
	if (status == cudaSuccess)
		return true;

	fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(status));
	return false;
}

// Renders the pattern on the GPU into a DIM x DIM bitmap and displays it.
// Returns 0 on success, 1 if any CUDA call fails (nothing is displayed then).
int main()
{
	BitmapStruct image(DIM, DIM);

	unsigned char* dev_ptr = NULL;

	if (!cudaCallSucceeded(cudaMalloc((void**)&dev_ptr, image.image_size()), "cudaMalloc"))
		return 1;

	// Ceil-division so the grid covers the image even if DIM is not a
	// multiple of DIM_BLOCK.
	dim3 grid((DIM + DIM_BLOCK - 1)/DIM_BLOCK, (DIM + DIM_BLOCK - 1)/DIM_BLOCK);
	dim3 block(DIM_BLOCK, DIM_BLOCK);

	Kernel<<<grid, block>>>(dev_ptr, DIM, DIM);

	// A launch does not return a status itself: configuration errors are
	// fetched with cudaGetLastError, and faults raised while the kernel runs
	// are reported by the blocking cudaMemcpy that follows.
	bool ok = cudaCallSucceeded(cudaGetLastError(), "Kernel launch");
	ok = ok && cudaCallSucceeded(
		cudaMemcpy(image.pixels, dev_ptr, image.image_size(), cudaMemcpyDeviceToHost),
		"cudaMemcpy");

	// Release the device buffer on both the success and the failure path.
	cudaFree(dev_ptr);

	if (!ok)
		return 1;

	image.displayImage();

	return 0;
}