#include #include #include #include #include #define ARRAY_SIZE 15000576 // 100 million items #define LOCAL_SIZE 256 // Tunable work-group size const char *kernelSource = "__kernel void add(__global const float *a, __global const float *b, __global float *c) { " " int i = get_global_id(0); " " c[i] = a[i] + b[i]; " "} "; double get_time() { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec + tv.tv_usec * 1e-6; } int main() { float *a = (float*)malloc(sizeof(float) * ARRAY_SIZE); float *b = (float*)malloc(sizeof(float) * ARRAY_SIZE); float *c = (float*)malloc(sizeof(float) * ARRAY_SIZE); // Initialize arrays for (int i = 0; i < ARRAY_SIZE; i++) { a[i] = i; b[i] = i * 2; } // Get all platforms cl_uint platformCount; clGetPlatformIDs(0, NULL, &platformCount); cl_platform_id *platforms = (cl_platform_id *)malloc(platformCount * sizeof(cl_platform_id)); clGetPlatformIDs(platformCount, platforms, NULL); // Find the POCL platform cl_platform_id poclPlatform = NULL; for (cl_uint i = 0; i < platformCount; i++) { char platformName[128]; clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 128, platformName, NULL); if (strstr(platformName, "Portable Computing Language") != NULL) { poclPlatform = platforms[i]; break; } } if (poclPlatform == NULL) { printf("POCL platform not found!\n"); return 1; } // Get the CPU device from the POCL platform cl_device_id cpuDevice = NULL; cl_uint deviceCount; cl_int ret = clGetDeviceIDs(poclPlatform, CL_DEVICE_TYPE_CPU, 1, &cpuDevice, &deviceCount); if (ret != CL_SUCCESS || deviceCount == 0) { printf("No CPU device found on POCL platform!\n"); return 1; } // Create an OpenCL context cl_context context = clCreateContext(NULL, 1, &cpuDevice, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("Error creating context: %d\n", ret); return 1; } // Create a command queue cl_command_queue command_queue = clCreateCommandQueueWithProperties(context, cpuDevice, NULL, &ret); if (ret != CL_SUCCESS) { printf("Error creating command queue: %d\n", ret); return 1; } // Create memory buffers on the device cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, ARRAY_SIZE * sizeof(float), NULL, &ret); cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, ARRAY_SIZE * sizeof(float), NULL, &ret); cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, ARRAY_SIZE * sizeof(float), NULL, &ret); if (ret != CL_SUCCESS) { printf("Error creating memory buffers: %d\n", ret); return 1; } // Time data transfer to device double start = get_time(); ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, ARRAY_SIZE * sizeof(float), a, 0, NULL, NULL); ret |= clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, ARRAY_SIZE * sizeof(float), b, 0, NULL, NULL); double transfer_to_device_time = get_time() - start; if (ret != CL_SUCCESS) { printf("Error writing to memory buffers: %d\n", ret); return 1; } // Create a program from the kernel source cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &ret); if (ret != CL_SUCCESS) { printf("Error creating program: %d\n", ret); return 1; } // Build the program ret = clBuildProgram(program, 1, &cpuDevice, NULL, NULL, NULL); if (ret != CL_SUCCESS) { size_t log_size; clGetProgramBuildInfo(program, cpuDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); char *log = (char *)malloc(log_size); clGetProgramBuildInfo(program, cpuDevice, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); printf("Kernel compilation error:\n%s\n", log); free(log); return 1; } // Create the OpenCL kernel cl_kernel kernel = clCreateKernel(program, "add", &ret); if (ret != CL_SUCCESS) { printf("Error creating kernel: %d\n", ret); return 1; } // Set the arguments of the kernel ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); if (ret != CL_SUCCESS) { printf("Error setting kernel arguments: %d\n", ret); return 1; } // Time kernel execution start = get_time(); size_t global_item_size = ARRAY_SIZE; size_t local_item_size = LOCAL_SIZE; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); clFinish(command_queue); double kernel_time = get_time() - start; if (ret != CL_SUCCESS) { printf("Error executing kernel: %d\n", ret); return 1; } // Time data transfer back to host start = get_time(); ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, ARRAY_SIZE * sizeof(float), c, 0, NULL, NULL); double transfer_to_host_time = get_time() - start; if (ret != CL_SUCCESS) { printf("Error reading from memory buffer: %d\n", ret); return 1; } // Print timing results printf("Data transfer to device: %.3f ms\n", transfer_to_device_time * 1000); printf("Kernel execution time: %.3f ms\n", kernel_time * 1000); printf("Data transfer to host: %.3f ms\n", transfer_to_host_time * 1000); printf("Total time: %.3f ms\n", (transfer_to_device_time + kernel_time + transfer_to_host_time) * 1000); // Print a sample of the result for (int i = 0; i < 10; i++) { printf("c[%d] = %f\n", i, c[i]); } // Clean up clFlush(command_queue); clFinish(command_queue); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseMemObject(a_mem_obj); clReleaseMemObject(b_mem_obj); clReleaseMemObject(c_mem_obj); clReleaseCommandQueue(command_queue); clReleaseContext(context); free(a); free(b); free(c); free(platforms); return 0; }