Trying to find a way to make this as SIMD-friendly as possible. Perhaps a kernel for calculation and then a kernel to "clean" the back buffer.

2016-01-02 19:53:46 -08:00
parent c5d73bd5fd
commit 6334e9c848
6 changed files with 135 additions and 100 deletions
--- a/Conway_OpenCL/Conway.cpp
+++ b/Conway_OpenCL/Conway.cpp
@@ -28,7 +28,7 @@ float elap_time() {
 	return (float)((counter - start) / double(frequency));
 }

-/* convert the kernel file into a string */
+// convert the kernel file into a string
 int convertToString(const char *filename, std::string& s)
 {
 	size_t size;
@@ -63,8 +63,8 @@ int main(int argc, char* argv[])
 {
 	int WINDOW_X = 1000;
 	int WINDOW_Y = 1000;
-	int GRID_WIDTH = 1000;
-	int GRID_HEIGHT = 1000;
+	int GRID_WIDTH = WINDOW_X;
+	int GRID_HEIGHT = WINDOW_Y;
 	int WORKER_SIZE = 2000;

 	// ============================== OpenCL Setup ==================================================================
@@ -139,18 +139,19 @@ int main(int argc, char* argv[])
 	}

 	// Now create the kernel
-	cl_kernel kernel = clCreateKernel(program, "conway", NULL);
+	cl_kernel front_kernel = clCreateKernel(program, "conway", NULL);
+	cl_kernel back_kernel = clCreateKernel(program, "conway", NULL);

 	// ======================================= Setup grid =========================================================

 	// Setup the rng
 	std::mt19937 rng(time(NULL));
-	std::uniform_int_distribution<int> rgen(0, 12); // 25% chance
+	std::uniform_int_distribution<int> rgen(0, 4); // 20% chance: the range [0, 4] is inclusive, so 1 in 5 values

 	// Init the grids
-	unsigned char* front_grid = new unsigned char[GRID_WIDTH * GRID_HEIGHT* 2];
+	unsigned char* front_grid = new unsigned char[GRID_WIDTH * GRID_HEIGHT];

-	for (int i = 0; i < 1000 * 1000; i += 2) {
+	for (int i = 0; i < GRID_WIDTH * GRID_HEIGHT; i++) {
 		if (rgen(rng) == 1) {
 			front_grid[i] = 1;
 		}
@@ -159,31 +160,14 @@ int main(int argc, char* argv[])
 		}
 	}

-	unsigned char* rear_grid = new unsigned char[GRID_WIDTH * GRID_HEIGHT * 2];
+	unsigned char* back_grid = new unsigned char[GRID_WIDTH * GRID_HEIGHT];

 	for (int i = 0; i < GRID_WIDTH * GRID_HEIGHT; i++) {
-		rear_grid[i] = front_grid[i];
+		back_grid[i] = front_grid[i];
 	}

 	// ====================================== Setup SFML ==========================================================

-	sf::Uint8* asdf = rear_grid;
-
-	sf::Uint8* pixel_array = new sf::Uint8[WINDOW_X * WINDOW_Y * 4];
-
-	for (int i = 0; i < GRID_WIDTH * GRID_HEIGHT * 2; i += 2) {
-
-		int p = i / 2;
-
-		pixel_array[p * 4] = 49; // R?
-		pixel_array[p * 4 + 1] = 68; // G?
-		pixel_array[p * 4 + 2] = 72; // B?
-		pixel_array[p * 4 + 3] = 255; // A?
-	}
-
-	char* arr = new char[1000 * 1000];
-
-
 	// Init window, and loop data
 	sf::RenderWindow window(sf::VideoMode(GRID_WIDTH, GRID_HEIGHT), "Classic Games");

@@ -191,26 +175,51 @@ int main(int argc, char* argv[])
 	double frame_time = 0.0, elapsed_time = 0.0, delta_time = 0.0, accumulator_time = 0.0, current_time = 0.0;
 	int frame_count = 0;

+	sf::Uint8* pixel_array = new sf::Uint8[WINDOW_X * WINDOW_Y * 4];

-	int err = 0;
-	cl_mem frontBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, GRID_WIDTH * GRID_HEIGHT * sizeof(char), (void*)front_grid, &err);
-	cl_mem rearBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, GRID_WIDTH * GRID_HEIGHT * sizeof(char), (void*)rear_grid, &err);
-
-	cl_mem workerCountBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int), &WORKER_SIZE, &err);
-	cl_mem gridWidthBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int), &GRID_WIDTH, &err);
-	cl_mem gridHeightBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int), &GRID_HEIGHT, &err);
-
-	status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&frontBuffer);
-	status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&workerCountBuffer);
-	status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&gridWidthBuffer);
-	status = clSetKernelArg(kernel, 3, sizeof(cl_mem), (void *)&gridHeightBuffer);
+	for (int i = 0; i < GRID_WIDTH * GRID_HEIGHT; i++) {

+		pixel_array[i * 4] = 49; // R?
+		pixel_array[i * 4 + 1] = 68; // G?
+		pixel_array[i * 4 + 2] = 72; // B?
+		pixel_array[i * 4 + 3] = 255; // A?
+	}

 	sf::Texture texture;
 	texture.create(WINDOW_X, WINDOW_Y);
 	sf::Sprite sprite(texture);

+	// ========================================= Setup the buffers ==================================================

+	int err = 0;
+
+	cl_mem frontBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, GRID_WIDTH * GRID_HEIGHT * sizeof(char), (void*)front_grid, &err);
+	cl_mem backBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, GRID_WIDTH * GRID_HEIGHT * sizeof(char), (void*)back_grid, &err);
+	cl_mem pixelBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, GRID_WIDTH * GRID_HEIGHT * sizeof(char), (void*)pixel_array, &err);
+
+	cl_mem workerCountBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int), &WORKER_SIZE, &err);
+	cl_mem gridWidthBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int), &GRID_WIDTH, &err);
+	cl_mem gridHeightBuffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int), &GRID_HEIGHT, &err);
+
+	// Kernel args for front kernel
+	status = clSetKernelArg(front_kernel, 0, sizeof(cl_mem), (void *)&frontBuffer);
+	status = clSetKernelArg(front_kernel, 1, sizeof(cl_mem), (void *)&backBuffer);
+	status = clSetKernelArg(front_kernel, 2, sizeof(cl_mem), (void *)&pixelBuffer);
+
+	status = clSetKernelArg(front_kernel, 3, sizeof(cl_mem), (void *)&workerCountBuffer);
+	status = clSetKernelArg(front_kernel, 4, sizeof(cl_mem), (void *)&gridWidthBuffer);
+	status = clSetKernelArg(front_kernel, 5, sizeof(cl_mem), (void *)&gridHeightBuffer);
+
+	// Flipped kernel args for the back kernel
+	status = clSetKernelArg(back_kernel, 0, sizeof(cl_mem), (void *)&backBuffer); // Flipped
+	status = clSetKernelArg(back_kernel, 1, sizeof(cl_mem), (void *)&frontBuffer); // Flipped
+	status = clSetKernelArg(back_kernel, 2, sizeof(cl_mem), (void *)&pixelBuffer);
+
+	status = clSetKernelArg(back_kernel, 3, sizeof(cl_mem), (void *)&workerCountBuffer);
+	status = clSetKernelArg(back_kernel, 4, sizeof(cl_mem), (void *)&gridWidthBuffer);
+	status = clSetKernelArg(back_kernel, 5, sizeof(cl_mem), (void *)&gridHeightBuffer);
+
+	bool flipped = false;
 	// ===================================== Loop ==================================================================
 	while (window.isOpen()) {

@@ -241,12 +250,16 @@ int main(int argc, char* argv[])
 		// Work size, for each y line
 		size_t global_work_size[1] = { WORKER_SIZE };

-		// Run the kernel
-		status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
-
-		// Get output, put back into grid
-		status = clEnqueueReadBuffer(commandQueue, frontBuffer, CL_TRUE, 0, GRID_WIDTH * GRID_HEIGHT * sizeof(char), (void*)rear_grid, 0, NULL, NULL);
+		if (flipped) {
+			status = clEnqueueNDRangeKernel(commandQueue, back_kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
+			status = clEnqueueReadBuffer(commandQueue, pixelBuffer, CL_TRUE, 0, GRID_WIDTH * GRID_HEIGHT * 4 * sizeof(unsigned char), (void*)pixel_array, 0, NULL, NULL);
+		}
+		else {
+			status = clEnqueueNDRangeKernel(commandQueue, front_kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL);
+			status = clEnqueueReadBuffer(commandQueue, pixelBuffer, CL_TRUE, 0, GRID_WIDTH * GRID_HEIGHT * 4 * sizeof(unsigned char), (void*)pixel_array, 0, NULL, NULL);
+		}

+		flipped = !flipped;

 		texture.update(pixel_array);
 		window.draw(sprite);
@@ -257,21 +270,24 @@ int main(int argc, char* argv[])
 	}


-	// Temporary
+	
+	// Release the buffers
 	status = clReleaseMemObject(frontBuffer);
+	status = clReleaseMemObject(backBuffer);
+	status = clReleaseMemObject(pixelBuffer);
 	status = clReleaseMemObject(workerCountBuffer);
 	status = clReleaseMemObject(gridWidthBuffer);
 	status = clReleaseMemObject(gridHeightBuffer);

-	/*Step 12: Clean the resources.*/
-	status = clReleaseKernel(kernel);				//Release kernel.
+	// And the program stuff
+	status = clReleaseKernel(front_kernel);				//Release kernel.
 	status = clReleaseProgram(program);				//Release the program object.
 	status = clReleaseCommandQueue(commandQueue);	//Release  Command queue.
 	status = clReleaseContext(context);				//Release context.

 	if (devices != NULL)
 	{
-		free(devices);
+		delete devices;
 		devices = NULL;
 	}

--- a/Conway_OpenCL/Conway_OpenCL.vcxproj
+++ b/Conway_OpenCL/Conway_OpenCL.vcxproj
@@ -227,7 +227,8 @@
    <ClCompile Include="Conway.cpp" />
  </ItemGroup>
  <ItemGroup>
-    <None Include="conway_kernel.cl" />
+    <None Include="conway_compute.cl" />
+    <None Include="conway_align.cl" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
--- a/Conway_OpenCL/conway_align.cl
+++ b/Conway_OpenCL/conway_align.cl
@@ -0,0 +1,7 @@
+__kernel void conway(__global unsigned char* front_grid, __global unsigned char* rear_grid, __global unsigned char* pixel_out, __global int* num_workers, __global int* grid_width, __global int* grid_height){
+	// Placeholder kernel: the body is currently empty, so it reads and writes none of its arguments.
+
+
+
+
+}
--- a/Conway_OpenCL/conway_compute.cl
+++ b/Conway_OpenCL/conway_compute.cl
@@ -0,0 +1,61 @@
+__kernel void conway(__global unsigned char* front_grid, __global unsigned char* rear_grid, __global unsigned char* pixel_out, __global int* num_workers, __global int* grid_width, __global int* grid_height)
+{
+	// Computes the next state for a slice of the grid: reads front_grid, writes rear_grid and 4 color bytes per cell to pixel_out.
+	// Calculate the start and end of the index range that this work-item will process.
+	int data_length = *grid_width * *grid_height;
+	// Each work-item takes a contiguous slice of (data_length / *num_workers) cells, selected by get_global_id(0).
+	int start_range = (data_length / *num_workers) * get_global_id(0);
+	int end_range = (data_length / *num_workers) * (get_global_id(0) + 1);
+	// NOTE(review): integer division - if data_length is not a multiple of *num_workers, the trailing remainder cells fall in no slice (assuming ids run 0..*num_workers - 1); confirm.
+	// x, y + 1
+
+	int neighbors = 0;
+
+	for (int i = start_range; i < end_range; i++){
+		// NOTE(review): the reads below have no bounds or row-edge checks; near i == 0 the offsets are negative, and i +/- 1 at a row end lands in the adjacent row - confirm intended.
+		// Sum the values of the 8 surrounding cells into neighbors
+		neighbors = 0;
+
+		// Top
+		neighbors += front_grid[i - *grid_width];
+
+		// Top right
+		neighbors += front_grid[i - *grid_width + 1];
+
+		// Right
+		neighbors += front_grid[i + 1];
+
+		// Bottom Right
+		neighbors += front_grid[i + *grid_width + 1];
+
+		// Bottom
+		neighbors += front_grid[i + *grid_width];
+
+		// Bottom Left
+		neighbors += front_grid[i + *grid_width - 1];
+
+		// Left
+		neighbors += front_grid[i - 1];
+
+		// Top left
+		neighbors += front_grid[i - *grid_width - 1];
+
+		// Next state is 1 when the sum is exactly 3, or exactly 2 with front_grid[i] non-zero; otherwise 0.
+		// pixel_out receives 4 bytes per cell at i * 4: (255, 255, 255, 255) for state 1, (49, 68, 72, 255) for state 0.
+		if (neighbors == 3 || (neighbors == 2 && front_grid[i])) {
+			rear_grid[i] = 1;
+			pixel_out[i * 4] = 255; // R
+			pixel_out[i * 4 + 1] = 255; // G
+			pixel_out[i * 4 + 2] = 255; // B
+			pixel_out[i * 4 + 3] = 255; // A
+		}
+	
+		else {
+			rear_grid[i] = 0;
+			pixel_out[i * 4] = 49; // R
+			pixel_out[i * 4 + 1] = 68; // G
+			pixel_out[i * 4 + 2] = 72; // B
+			pixel_out[i * 4 + 3] = 255; // A
+		}
+
+	}
+}
--- a/Conway_OpenCL/conway_kernel.cl
+++ b/Conway_OpenCL/conway_kernel.cl
@@ -1,51 +0,0 @@
-__kernel void conway(__global unsigned char* front_grid, __global unsigned char* rear_grid, __global int* num_workers, __global int* grid_width, __global int* grid_height)
-{
-	// Caclulate the start and end range that this worker will be calculating
-
-	int data_length = *grid_width * *grid_height;
-
-	int start_range = (data_length / *num_workers) * get_global_id(0) * 2; // * 2 = padding
-	int end_range = (data_length / *num_workers) * (get_global_id(0) + 1) * 2;
-	
-	// x, y + 1
-
-	int neighbors = 0;
-
-	for (int i = start_range; i < end_range; i += 2){
-		
-		// add all 8 blocks to neghbors
-		neighbors = 0;
-		// Top
-		neighbors += in[i - *grid_width * 2];
-
-		// Top right
-		neighbors += in[i - *grid_width * 2 + 2];
-
-		// Right
-		neighbors += in[i + 2];
-
-		// Bottom Right
-		neighbors += in[i + *grid_width * 2 + 2];
-
-		// Bottom
-		neighbors += in[i + *grid_width * 2];
-
-		// Bottom Left
-		neighbors += in[i + *grid_width * 2 - 2];
-
-		// Left
-		neighbors += in[i - 2];
-
-		// Top left
-		neighbors += in[i - *grid_width * 2 - 2];
-
-		// push living status to the padded second char
-
-		if (neighbors == 3 || (neighbors == 2 && in[i])){
-			in[i + 1] = 1;
-		}
-	
-		else
-			in[i + 1] = 0;
-	}
-}
--- a/Conway_OpenCL/~AutoRecover.Conway_OpenCL.vcxproj
+++ b/Conway_OpenCL/~AutoRecover.Conway_OpenCL.vcxproj
@@ -227,7 +227,8 @@
    <ClCompile Include="Conway.cpp" />
  </ItemGroup>
  <ItemGroup>
-    <None Include="conway_kernel.cl" />
+    <None Include="conway_compute.cl" />
+    <None Include="conway_align.cl" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">