#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

using namespace std;

#define N 1000        // number of elements in each input vector
#define BLOCKDIM 256  // threads per block; also the size of the kernel's shared cache
#define GRIDDIM 32    // blocks per grid; one partial sum is produced per block

// Evaluates a CUDA runtime call and terminates with file/line context if it
// did not return cudaSuccess, so failures are reported where they happen
// instead of surfacing later as a wrong result.
#define CUDA_CHECK(call)                                                    \
    do {                                                                    \
        cudaError_t cudaCheckErr_ = (call);                                 \
        if (cudaCheckErr_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(cudaCheckErr_));                     \
            exit(EXIT_FAILURE);                                             \
        }                                                                   \
    } while (0)

__global__ void dotKernel(const float* a, const float* b, float* partialC, int elementNum);

// Computes the dot product of two N-element vectors: the GPU produces one
// partial sum per block, and the host adds the GRIDDIM partial sums together.
int main(int argc, char** argv)
{
    (void)argc;
    (void)argv;

    // Host buffers; vectors release their storage automatically on every exit path.
    vector<float> a(N);
    vector<float> b(N);
    vector<float> partialC(GRIDDIM);

    // NOTE(review): the original initialisation of a[] and b[] was lost when
    // this file was extracted. The values below are placeholders (expected
    // result: 2.0f * N) -- restore the intended input data.
    for (int i = 0; i < N; i++) {
        a[i] = 1.0f;
        b[i] = 2.0f;
    }

    float* dev_a = NULL;
    float* dev_b = NULL;
    float* dev_partialC = NULL;
    CUDA_CHECK(cudaMalloc((void**)&dev_a, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&dev_b, N * sizeof(float)));
    CUDA_CHECK(cudaMalloc((void**)&dev_partialC, GRIDDIM * sizeof(float)));

    CUDA_CHECK(cudaMemcpy(dev_a, a.data(), N * sizeof(float), cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(dev_b, b.data(), N * sizeof(float), cudaMemcpyHostToDevice));

    dotKernel<<<GRIDDIM, BLOCKDIM>>>(dev_a, dev_b, dev_partialC, N);
    // A kernel launch returns nothing; an invalid launch configuration is
    // only visible through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: waits for the kernel to finish, so errors raised while
    // the kernel was running are reported here.
    CUDA_CHECK(cudaMemcpy(partialC.data(), dev_partialC, GRIDDIM * sizeof(float),
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(dev_a));
    CUDA_CHECK(cudaFree(dev_b));
    CUDA_CHECK(cudaFree(dev_partialC));

    // Final reduction of the per-block partial sums on the host.
    float result = 0;
    for (int i = 0; i < GRIDDIM; i++) {
        result += partialC[i];
    }

    cout << "Result is: " << result << endl;

    return 0;
}

// Computes per-block partial sums of the element-wise product a[i] * b[i].
//
// Launch layout: 1-D grid of 1-D blocks. blockDim.x must not exceed BLOCKDIM,
// because the statically sized shared cache holds one slot per thread.
// Shared memory: BLOCKDIM floats per block.
//
// a, b        device pointers to at least elementNum floats each (read only)
// partialC    device pointer to at least gridDim.x floats; element
//             blockIdx.x receives the sum accumulated by that block
// elementNum  number of elements to process; any value is handled because
//             each thread strides over the input by the total thread count
__global__ void dotKernel(const float* a, const float* b, float* partialC, int elementNum)
{
    __shared__ float cache[BLOCKDIM];

    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    int offset = gridDim.x * blockDim.x;  // total number of threads in the grid

    // Each thread accumulates the products of the elements it owns.
    float result = 0.0f;
    for (; tid < elementNum; tid += offset) {
        result += a[tid] * b[tid];
    }

    cache[threadIdx.x] = result;
    __syncthreads();  // every per-thread sum must be visible before reducing

    // Tree reduction in shared memory. `active` is the number of live slots;
    // each round folds the upper part onto the lower part. Rounding `half` up
    // keeps the middle slot alive when `active` is odd, so block sizes that
    // are not a power of two are reduced correctly. For power-of-two sizes the
    // additions happen in the same order as a plain halving loop.
    unsigned int active = blockDim.x;
    while (active > 1) {
        unsigned int half = (active + 1) / 2;
        if (threadIdx.x < active / 2)
            cache[threadIdx.x] += cache[threadIdx.x + half];
        // Outside the branch: every thread in the block must reach the barrier.
        __syncthreads();
        active = half;
    }

    if (threadIdx.x == 0)
        partialC[blockIdx.x] = cache[0];
}