143 lines
5.3 KiB
C
143 lines
5.3 KiB
C
|
|
#include <stdio.h>
|
||
|
|
#include <stdlib.h>
|
||
|
|
#include <CL/cl.h>
|
||
|
|
#include <sys/time.h>
|
||
|
|
|
||
|
|
#define ARRAY_SIZE 15000576 // Elements per array (about 57 MiB each, assuming 4-byte floats)
|
||
|
|
#define LOCAL_SIZE 256 // Small work-group size
|
||
|
|
|
||
|
|
// Two-level stringification: the outer macro expands its argument first, the
// inner one turns the expanded tokens into a string literal.
#define KERNEL_STR_(x) #x
#define KERNEL_STR(x) KERNEL_STR_(x)

// OpenCL C source of the "add" kernel, compiled at run time.
// Each work-item handles one index i and writes c[i] = a[i] + b[i].
// The bounds check is generated from ARRAY_SIZE so the host-side element count
// and the device-side guard cannot drift apart; work-items whose global id is
// at or beyond ARRAY_SIZE do nothing.
const char *kernelSource =
    "__kernel void add(__global const float *a, __global const float *b, __global float *c) { "
    " int i = get_global_id(0); "
    " if (i < " KERNEL_STR(ARRAY_SIZE) ") { "
    " c[i] = a[i] + b[i]; "
    "} "
    "} ";
|
||
|
|
|
||
|
|
// Returns the current time of day in seconds, with microsecond resolution,
// as reported by gettimeofday(). Callers subtract two readings to obtain an
// elapsed interval.
// NOTE(review): this is a wall-clock reading, so an interval can be distorted
// if the system clock is adjusted between the two calls.
double get_time(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    // Whole seconds plus the microsecond part scaled to seconds.
    return (double)tv.tv_sec + (double)tv.tv_usec * 1e-6;
}
|
||
|
|
|
||
|
|
int main() {
|
||
|
|
float *a = (float*)malloc(sizeof(float) * ARRAY_SIZE);
|
||
|
|
float *b = (float*)malloc(sizeof(float) * ARRAY_SIZE);
|
||
|
|
float *c = (float*)malloc(sizeof(float) * ARRAY_SIZE);
|
||
|
|
|
||
|
|
// Initialize arrays
|
||
|
|
for (int i = 0; i < ARRAY_SIZE; i++) {
|
||
|
|
a[i] = i;
|
||
|
|
b[i] = i * 2;
|
||
|
|
}
|
||
|
|
|
||
|
|
// Get GPU platform and device
|
||
|
|
cl_platform_id platform;
|
||
|
|
cl_device_id gpuDevice;
|
||
|
|
cl_uint platformCount, deviceCount;
|
||
|
|
clGetPlatformIDs(1, &platform, &platformCount);
|
||
|
|
clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &gpuDevice, &deviceCount);
|
||
|
|
|
||
|
|
// Print GPU device info
|
||
|
|
char deviceName[128];
|
||
|
|
clGetDeviceInfo(gpuDevice, CL_DEVICE_NAME, 128, deviceName, NULL);
|
||
|
|
printf("Using GPU device: %s\n", deviceName);
|
||
|
|
|
||
|
|
// Create context and command queue
|
||
|
|
cl_context context = clCreateContext(NULL, 1, &gpuDevice, NULL, NULL, NULL);
|
||
|
|
cl_command_queue command_queue = clCreateCommandQueue(context, gpuDevice, 0, NULL);
|
||
|
|
|
||
|
|
// Create buffers
|
||
|
|
double start = get_time();
|
||
|
|
cl_mem a_mem = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), (void *)a, NULL);
|
||
|
|
cl_mem b_mem = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ARRAY_SIZE * sizeof(float), (void *)b, NULL);
|
||
|
|
cl_mem c_mem = clCreateBuffer(context, CL_MEM_WRITE_ONLY, ARRAY_SIZE * sizeof(float), NULL, NULL);
|
||
|
|
double buffer_create_time = get_time() - start;
|
||
|
|
|
||
|
|
// Time data transfer to device (not needed with CL_MEM_COPY_HOST_PTR)
|
||
|
|
double transfer_to_device_time = 0.0;
|
||
|
|
|
||
|
|
// Create and build program
|
||
|
|
start = get_time();
|
||
|
|
cl_program program = clCreateProgramWithSource(context, 1, &kernelSource, NULL, NULL);
|
||
|
|
cl_int ret = clBuildProgram(program, 1, &gpuDevice, NULL, NULL, NULL);
|
||
|
|
if (ret != CL_SUCCESS) {
|
||
|
|
size_t log_size;
|
||
|
|
clGetProgramBuildInfo(program, gpuDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
||
|
|
char *log = (char *)malloc(log_size);
|
||
|
|
clGetProgramBuildInfo(program, gpuDevice, CL_PROGRAM_BUILD_LOG, log_size, log, NULL);
|
||
|
|
printf("Kernel compilation error:\n%s\n", log);
|
||
|
|
free(log);
|
||
|
|
return 1;
|
||
|
|
}
|
||
|
|
double program_build_time = get_time() - start;
|
||
|
|
|
||
|
|
// Create kernel
|
||
|
|
start = get_time();
|
||
|
|
cl_kernel kernel = clCreateKernel(program, "add", NULL);
|
||
|
|
if (!kernel) {
|
||
|
|
printf("Error creating kernel\n");
|
||
|
|
return 1;
|
||
|
|
}
|
||
|
|
double kernel_create_time = get_time() - start;
|
||
|
|
|
||
|
|
// Set kernel arguments
|
||
|
|
start = get_time();
|
||
|
|
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem);
|
||
|
|
ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem);
|
||
|
|
ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem);
|
||
|
|
if (ret != CL_SUCCESS) {
|
||
|
|
printf("Error setting kernel arguments: %d\n", ret);
|
||
|
|
return 1;
|
||
|
|
}
|
||
|
|
double kernel_arg_time = get_time() - start;
|
||
|
|
|
||
|
|
// Execute kernel
|
||
|
|
start = get_time();
|
||
|
|
size_t global_size = ARRAY_SIZE;
|
||
|
|
size_t local_size = LOCAL_SIZE;
|
||
|
|
ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL);
|
||
|
|
if (ret != CL_SUCCESS) {
|
||
|
|
printf("Error executing kernel: %d\n", ret);
|
||
|
|
return 1;
|
||
|
|
}
|
||
|
|
clFinish(command_queue);
|
||
|
|
double kernel_exec_time = get_time() - start;
|
||
|
|
|
||
|
|
// Read result back
|
||
|
|
start = get_time();
|
||
|
|
ret = clEnqueueReadBuffer(command_queue, c_mem, CL_TRUE, 0, ARRAY_SIZE * sizeof(float), c, 0, NULL, NULL);
|
||
|
|
if (ret != CL_SUCCESS) {
|
||
|
|
printf("Error reading from memory buffer: %d\n", ret);
|
||
|
|
return 1;
|
||
|
|
}
|
||
|
|
double transfer_to_host_time = get_time() - start;
|
||
|
|
|
||
|
|
// Print timing results
|
||
|
|
printf("Buffer creation time: %.6f ms\n", buffer_create_time * 1000);
|
||
|
|
printf("Program build time: %.6f ms\n", program_build_time * 1000);
|
||
|
|
printf("Kernel creation time: %.6f ms\n", kernel_create_time * 1000);
|
||
|
|
printf("Kernel argument setup time: %.6f ms\n", kernel_arg_time * 1000);
|
||
|
|
printf("Kernel execution time: %.6f ms\n", kernel_exec_time * 1000);
|
||
|
|
printf("Data transfer to host: %.6f ms\n", transfer_to_host_time * 1000);
|
||
|
|
printf("Total time: %.6f ms\n",
|
||
|
|
(buffer_create_time + program_build_time + kernel_create_time +
|
||
|
|
kernel_arg_time + kernel_exec_time + transfer_to_host_time) * 1000);
|
||
|
|
|
||
|
|
// Print the result
|
||
|
|
for (int i = 0; i < 8; i++) {
|
||
|
|
printf("c[%d] = %f\n", i, c[i]);
|
||
|
|
}
|
||
|
|
|
||
|
|
// Cleanup
|
||
|
|
clReleaseMemObject(a_mem);
|
||
|
|
clReleaseMemObject(b_mem);
|
||
|
|
clReleaseMemObject(c_mem);
|
||
|
|
clReleaseKernel(kernel);
|
||
|
|
clReleaseProgram(program);
|
||
|
|
clReleaseCommandQueue(command_queue);
|
||
|
|
clReleaseContext(context);
|
||
|
|
|
||
|
|
return 0;
|
||
|
|
}
|