#include <Windows.h>
#include <stdio.h>
#include <stdlib.h>
#include <CL\cl.h>


// OpenCL C source of the VectorAdd kernel, split into one string per line.
// The host hands this array to clCreateProgramWithSource, which treats the
// strings as one continuous source text. Every entry therefore ends in '\n':
// a `//` comment would otherwise swallow the code in the following entries,
// and compiler diagnostics keep meaningful line numbers.
//
// Kernel arguments, in the order the host binds them with clSetKernelArg:
//   0: c - output vector
//   1: a - first input vector
//   2: b - second input vector
const char* OpenCLSource[] = {
	"__kernel void VectorAdd(__global int* c, __global int* a, __global int* b)\n",
	"{\n",
	"        // Index of the elements to add \n",
	"        unsigned int n = get_global_id(0);\n",
	"        // Sum the n'th element of vectors a and b and store in c \n",
	"        c[n] = a[n] + b[n];\n",
	"}\n"
};

// Seed tables for the two input vectors. Added element by element they yield
// the ASCII codes of the 20-character string "Hello OpenCL World! ", which is
// why the program prints the summed vector as characters.
int InitialData1[20] = {
	37, 50, 54, 50, 56,   0, 43, 43, 74, 71,
	32, 36, 16, 43, 56, 100, 50, 25, 15, 17
};
int InitialData2[20] = {
	35, 51, 54, 58, 55, 32, 36, 69, 27, 39,
	35, 40, 16, 44, 55, 14, 58, 75, 18, 15
};

// Element count of each vector processed by the kernel
#define SIZE 2048

// Main function
int main(int argc, char **argv)
{
	// Two integer source vectors in Host memory
	int HostVector1[SIZE], HostVector2[SIZE];

	// Initialize with some interesting repeating data
	for(int c = 0; c < SIZE; c++)
	{
		HostVector1[c] = InitialData1[c%20];
		HostVector2[c] = InitialData2[c%20];
	}

	// Create a context to run OpenCL on our CUDA-enabled NVIDIA GPU
	cl_context GPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, NULL);

	// Get the list of GPU devices associated with this context
	size_t ParmDataBytes;
	clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, 0, NULL, &ParmDataBytes);
	cl_device_id* GPUDevices = (cl_device_id*)malloc(ParmDataBytes);
	clGetContextInfo(GPUContext, CL_CONTEXT_DEVICES, ParmDataBytes, GPUDevices, NULL);

	// Create a command-queue on the first GPU device
	cl_command_queue GPUCommandQueue = clCreateCommandQueue(GPUContext, GPUDevices[0], 0, NULL);

	// Allocate GPU memory for source vectors AND initalize from CPU memory
	cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector1, NULL); 
	cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int) * SIZE, HostVector2, NULL); 

	// Alocate output memory on GPU
	cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY, sizeof(int) * SIZE, NULL, NULL);
	
	// Create OpenCL program with source code
	cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 7, OpenCLSource, NULL, NULL);

	// Build the program (OpenCL JIT compilation)
	clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL);

	// Create a handle to the compiled OpenCL function (Kernel)
	cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "VectorAdd", NULL);

	// In the next step we associate the GPU memory with the Kernel arguments
	clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUOutputVector);
	clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1);
	clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2);

	// Launch the Kernel on the GPU
	size_t WorkSize[1] = {SIZE}; // one dimensional Range
	clEnqueueNDRangeKernel(GPUCommandQueue, OpenCLVectorAdd, 1, NULL, WorkSize, NULL, 0, NULL, NULL);


	// Copy the output in GPU memory back to CPU memory
	int HostOutputVector[SIZE];
	clEnqueueReadBuffer(GPUCommandQueue, GPUOutputVector, CL_TRUE, 0, SIZE * sizeof(int), HostOutputVector, 0, NULL, NULL);

	// Cleanup
	free(GPUDevices);
	clReleaseKernel(OpenCLVectorAdd);
	clReleaseProgram(OpenCLProgram);
	clReleaseCommandQueue(GPUCommandQueue);
	clReleaseContext(GPUContext);
	clReleaseMemObject(GPUVector1);
	clReleaseMemObject(GPUVector2);
	clReleaseMemObject(GPUOutputVector);

	// Print out the results
	for(int Rows = 0; Rows < (SIZE/20); Rows++, printf("\n")){
		for(int c = 0; c <20; c++){
			printf("%c",(char)HostOutputVector[Rows * 20 + c]);
		}
	}

	return 0;
}