#include #include #include #include #define ARRAY_SIZE 15000576 // Tiny array #define LOCAL_SIZE 256 // Small work-group size const char *kernelSource = "__kernel void add(__global const float *a, __global const float *b, __global float *c) { " " int i = get_global_id(0); " " if (i < 15000576) { " " c[i] = a[i] + b[i]; " "} " "} "; double get_time() { struct timeval tv; gettimeofday(&tv, NULL); return tv.tv_sec + tv.tv_usec * 1e-6; } int main() { float *a = (float*)malloc(sizeof(float) * ARRAY_SIZE); float *b = (float*)malloc(sizeof(float) * ARRAY_SIZE); float *c = (float*)malloc(sizeof(float) * ARRAY_SIZE); // Initialize arrays for (int i = 0; i < ARRAY_SIZE; i++) { a[i] = i; b[i] = i * 2; } // Get GPU platform and device cl_platform_id platform; cl_device_id gpuDevice; cl_uint platformCount, deviceCount; clGetPlatformIDs(1, &platform, &platformCount); clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &gpuDevice, &deviceCount); // Print GPU device info char deviceName[128]; clGetDeviceInfo(gpuDevice, CL_DEVICE_NAME, 128, deviceName, NULL); printf("Using GPU device: %s\n", deviceName); // Create context and command queue cl_context context = clCreateContext(NULL, 1, &gpuDevice, NULL, NULL, NULL); cl_command_queue command_queue = clCreateCommandQueue(context, gpuDevice, 0, NULL); // Create buffers double start = get_time(); cl_mem a_mem = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), (void *)a, NULL); cl_mem b_mem = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), (void *)b, NULL); cl_mem c_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, ARRAY_SIZE * sizeof(float), NULL, NULL); double buffer_create_time = get_time() - start; // Time data transfer to device (not needed with CL_MEM_COPY_HOST_PTR) double transfer_to_device_time = 0.0; // Create and build program start = get_time(); cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, NULL); cl_int ret = clBuildProgram(program, 1, &gpuDevice, NULL, NULL, NULL); if (ret != CL_SUCCESS) { size_t log_size; clGetProgramBuildInfo(program, gpuDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); char *log = (char *)malloc(log_size); clGetProgramBuildInfo(program, gpuDevice, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); printf("Kernel compilation error:\n%s\n", log); free(log); return 1; } double program_build_time = get_time() - start; // Create kernel start = get_time(); cl_kernel kernel = clCreateKernel(program, "add", NULL); if (!kernel) { printf("Error creating kernel\n"); return 1; } double kernel_create_time = get_time() - start; // Set kernel arguments start = get_time(); ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem); if (ret != CL_SUCCESS) { printf("Error setting kernel arguments: %d\n", ret); return 1; } double kernel_arg_time = get_time() - start; // Execute kernel start = get_time(); size_t global_size = ARRAY_SIZE; size_t local_size = LOCAL_SIZE; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("Error executing kernel: %d\n", ret); return 1; } clFinish(command_queue); double kernel_exec_time = get_time() - start; // Read result back start = get_time(); ret = clEnqueueReadBuffer(command_queue, c_mem, CL_TRUE, 0, ARRAY_SIZE * sizeof(float), c, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("Error reading from memory buffer: %d\n", ret); return 1; } double transfer_to_host_time = get_time() - start; // Print timing results printf("Buffer creation time: %.6f ms\n", buffer_create_time * 1000); printf("Program build time: %.6f ms\n", program_build_time * 1000); printf("Kernel creation time: %.6f ms\n", kernel_create_time * 1000); printf("Kernel argument setup time: %.6f ms\n", kernel_arg_time * 1000); printf("Kernel execution time: %.6f ms\n", kernel_exec_time * 1000); printf("Data transfer to host: %.6f ms\n", transfer_to_host_time * 1000); printf("Total time: %.6f ms\n", (buffer_create_time + program_build_time + kernel_create_time + kernel_arg_time + kernel_exec_time + transfer_to_host_time) * 1000); // Print the result for (int i = 0; i < 8; i++) { printf("c[%d] = %f\n", i, c[i]); } // Cleanup clReleaseMemObject(a_mem); clReleaseMemObject(b_mem); clReleaseMemObject(c_mem); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(command_queue); clReleaseContext(context); return 0; }