Прошу прощения за простыню ниже, что поделать, раз тут не придумали код прятать под кат или в поля со скроллом. Загвоздка в том, что у меня есть небольшой массив четверок сингл флоатов (~ 10 млн.) Мне надо найти в нём сумму всех четверок, а также bounding box. Я не знаю, стоит ли для этого привлекать GPU. У меня зашибенная платформа: CPU AMD Ryzen 1600x и GPU AMD Radeon HD 6950. Вот я и не могу понять, то ли дело в старой видюхе, то ли юзкейс совсем не тот. Поэтому прошу собрать и запустить код ниже. Программа для GPU должна называеться «test.cl» (ну вы видите, там это захардкодено).
Собирается как-то так:
cc -I/usr/local/include -L/usr/local/lib -lOpenCL -o test test.c
Прошу привести вывод того, что напечатает. У меня вот скорость вычислений одного порядка на CPU и GPU, но время передачи данных велико и убивает весь профит. Ещё почему-то, если 2 раза подсчитать одно и то же, то во второй раз на GPU скорость растет на порядок. Только смысл в этом?
С НГ!
struct set_specs {
float4 sum;
float4 min;
float4 max;
};
__kernel void pass1 (__global float4 *input,
__global struct set_specs *specs,
__local struct set_specs *tmp,
unsigned long n)
{
size_t nitems = get_global_size (0);
size_t loc_id = get_local_id (0);
size_t glob_id = get_global_id (0);
size_t group_id = get_group_id (0);
float4 sum_item = (0, 0, 0, 0);
float4 min_item = sum_item;
float4 max_item = sum_item;
size_t i, offset, mask;
for (i = glob_id; i < n; i += nitems) {
sum_item += input[i];
min_item = min (min_item, input[i]);
max_item = max (max_item, input[i]);
}
tmp[loc_id].sum = sum_item;
tmp[loc_id].min = min_item;
tmp[loc_id].max = max_item;
barrier (CLK_LOCAL_MEM_FENCE);
for (offset = 1; offset < get_local_size (0); offset <<= 1) {
mask = (offset << 1) - 1;
if ((loc_id & mask) == 0) {
tmp[loc_id].sum = tmp[loc_id + offset].sum + tmp[loc_id].sum;
tmp[loc_id].min = min (tmp[loc_id + offset].min, tmp[loc_id].min);
tmp[loc_id].max = max (tmp[loc_id + offset].max, tmp[loc_id].max);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (loc_id == 0) {
specs[group_id].sum = tmp[0].sum;
specs[group_id].min = tmp[0].min;
specs[group_id].max = tmp[0].max;
}
}
__kernel void pass2 (__global struct set_specs *input,
__global struct set_specs *specs,
__local float4 *sum4,
__local float4 *min4,
__local float4 *max4)
{
size_t id = get_local_id (0);
sum4[id] = input[id].sum;
min4[id] = input[id].min;
max4[id] = input[id].max;
barrier(CLK_LOCAL_MEM_FENCE);
size_t offset, mask;
for (offset = 1; offset < get_local_size (0); offset <<= 1) {
mask = (offset << 1) - 1;
if ((id & mask) == 0) {
sum4[id] = sum4[id] + sum4[id + offset];
min4[id] = min (min4[id], min4[id + offset]);
max4[id] = max (max4[id], max4[id + offset]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (id == 0) {
specs->sum = sum4[0];
specs->min = min4[0];
specs->max = max4[0];
}
}
#include <stdlib.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/mman.h>
#include <xmmintrin.h>
#include "CL/cl.h"
struct set_specs {
cl_float4 sum;
cl_float4 min;
cl_float4 max;
};
struct sse_specs {
float sum[4];
float min[4];
float max[4];
};
static struct {
cl_context context;
cl_kernel pass1, pass2;
cl_program program;
cl_command_queue queue;
cl_mem output;
size_t group_size;
void *program_mapping;
size_t program_size;
} opencl_context;
static void free_context ()
{
if (opencl_context.output != NULL) clReleaseMemObject (opencl_context.output);
if (opencl_context.program != NULL) clReleaseProgram (opencl_context.program);
if (opencl_context.pass1 != NULL) clReleaseKernel (opencl_context.pass1);
if (opencl_context.pass2 != NULL) clReleaseKernel (opencl_context.pass2);
if (opencl_context.queue != NULL) clReleaseCommandQueue(opencl_context.queue);
if (opencl_context.context != NULL) clReleaseContext(opencl_context.context);
if (opencl_context.program_mapping != NULL)
munmap (opencl_context.program_mapping, opencl_context.program_size);
}
static int init ()
{
cl_context_properties properties[3];
cl_uint num_of_platforms=0;
cl_platform_id platform_id;
cl_device_id device_id;
cl_uint num_of_devices=0;
// retreives a list of platforms available
if (clGetPlatformIDs (1, &platform_id, &num_of_platforms)!= CL_SUCCESS) {
fprintf(stderr, "Unable to get platform_id\n");
goto bad;
}
// try to get a supported GPU device
if (clGetDeviceIDs (platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id,
&num_of_devices) != CL_SUCCESS) {
fprintf(stderr, "Unable to get device_id\n");
goto bad;
}
// context properties list - must be terminated with 0
properties[0]= CL_CONTEXT_PLATFORM;
properties[1]= (cl_context_properties) platform_id;
properties[2]= 0;
int fd = open ("test.cl", O_RDONLY);
if (fd == -1) {
fprintf (stderr, "Cannot open OpenCL program\n");
goto bad;
}
struct stat sb;
fstat (fd, &sb);
opencl_context.program_size = sb.st_size;
printf ("Program size %lu bytes\n", opencl_context.program_size);
opencl_context.program_mapping = mmap (NULL, opencl_context.program_size, PROT_READ, MAP_PRIVATE, fd, 0);
close (fd);
opencl_context.context = clCreateContext (properties, 1, &device_id, NULL, NULL, NULL);
if (opencl_context.context == NULL) {
fprintf (stderr, "Cannot create context\n");
goto bad;
}
opencl_context.queue = clCreateCommandQueue (opencl_context.context, device_id, 0, NULL);
if (opencl_context.queue == NULL) {
fprintf (stderr, "Cannot create command queue\n");
goto bad;
}
opencl_context.program = clCreateProgramWithSource (opencl_context.context, 1, (const char **)
&opencl_context.program_mapping, NULL, NULL);
if (opencl_context.program == NULL) {
fprintf (stderr, "Cannot create program\n");
goto bad;
}
if (clBuildProgram (opencl_context.program, 0, NULL, "-cl-fast-relaxed-math -cl-unsafe-math-optimizations",
NULL, NULL) != CL_SUCCESS) {
fprintf(stderr, "Error building program\n");
char buffer[4096];
size_t length;
clGetProgramBuildInfo(opencl_context.program, device_id, CL_PROGRAM_BUILD_LOG,
sizeof(buffer), buffer, &length);
fprintf(stderr, "%s\n",buffer);
free_context ();
exit (1);
}
// specify which kernel from the program to execute
opencl_context.pass1 = clCreateKernel (opencl_context.program, "pass1", NULL);
if (opencl_context.pass1 == NULL) {
fprintf (stderr, "Cannot create kernel\n");
goto bad;
}
if (clGetKernelWorkGroupInfo (opencl_context.pass1, device_id,
CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof (size_t),
&opencl_context.group_size, NULL) != CL_SUCCESS) {
fprintf(stderr, "Error getting group size program\n");
goto bad;
}
printf ("Group size = %lu\n", opencl_context.group_size);
opencl_context.pass2 = clCreateKernel (opencl_context.program, "pass2", NULL);
if (opencl_context.pass2 == NULL) {
fprintf (stderr, "Cannot create kernel\n");
goto bad;
}
// create buffers for the input and ouput
opencl_context.output = clCreateBuffer(opencl_context.context, CL_MEM_READ_WRITE,
sizeof (struct set_specs) * opencl_context.group_size, NULL, NULL);
return 1;
bad:
free_context ();
return 0;
}
static void find_specs (cl_mem input, size_t n, struct set_specs *out)
{
size_t local_size = opencl_context.group_size;
size_t global_size = local_size * local_size;
cl_ulong len_arg;
len_arg = n;
clSetKernelArg(opencl_context.pass1, 0, sizeof(cl_mem), &input);
clSetKernelArg(opencl_context.pass1, 1, sizeof(cl_mem), &opencl_context.output);
clSetKernelArg(opencl_context.pass1, 2, local_size * sizeof(struct set_specs), NULL);
clSetKernelArg(opencl_context.pass1, 3, sizeof(cl_ulong), &len_arg);
clEnqueueNDRangeKernel(opencl_context.queue, opencl_context.pass1, 1, NULL, &global_size,
&local_size, 0, NULL, NULL);
clSetKernelArg(opencl_context.pass2, 0, sizeof(cl_mem), &opencl_context.output);
clSetKernelArg(opencl_context.pass2, 1, sizeof(cl_mem), &opencl_context.output);
clSetKernelArg(opencl_context.pass2, 2, sizeof(cl_float4) * local_size, NULL);
clSetKernelArg(opencl_context.pass2, 3, sizeof(cl_float4) * local_size, NULL);
clSetKernelArg(opencl_context.pass2, 4, sizeof(cl_float4) * local_size, NULL);
clEnqueueNDRangeKernel(opencl_context.queue, opencl_context.pass2, 1, NULL, &local_size,
&local_size, 0, NULL, NULL);
clEnqueueReadBuffer(opencl_context.queue, opencl_context.output, CL_TRUE, 0,
sizeof(struct set_specs), out, 0, NULL, NULL);
}
#define N 11000000
typedef float vect4[4] __attribute__((aligned (16)));
static void traditional_specs (const vect4 *dots, size_t n, struct sse_specs *specs)
{
__v4sf sum = _mm_set_ps1 (0);
__v4sf min = sum;
__v4sf max = sum;
__v4sf dot;
size_t i;
for (i=0; i<n; i++) {
dot = _mm_load_ps (dots[i]);
sum += dot;
min = _mm_min_ps (min, dot);
max = _mm_max_ps (max, dot);
}
_mm_store_ps (specs->sum, sum);
_mm_store_ps (specs->min, min);
_mm_store_ps (specs->max, max);
}
static long gettime()
{
struct timespec tv;
clock_gettime (CLOCK_REALTIME, &tv);
return tv.tv_sec * 1000000000L + tv.tv_nsec;
}
int main ()
{
if (!init ()) return 1;
cl_mem input;
int i;
long time;
input = clCreateBuffer(opencl_context.context, CL_MEM_READ_WRITE,
sizeof (cl_float4) * N, NULL, NULL);
cl_float4 *buffer = clEnqueueMapBuffer (opencl_context.queue, input, CL_TRUE,
CL_MAP_WRITE, 0, sizeof (cl_float4) * N,
0, NULL, NULL, NULL);
time = gettime();
for (i=0; i<N; i++) {
buffer[i].x = i;
buffer[i].y = i;
buffer[i].z = i;
buffer[i].w = i;
}
time = gettime() - time;
clEnqueueUnmapMemObject (opencl_context.queue, input, buffer, 0, NULL, NULL);
printf ("Data loading time = %li\n", time);
struct set_specs output;
time = gettime();
find_specs (input, N, &output);
time = gettime() - time;
clReleaseMemObject (input);
printf ("sum = <%f, %f, %f, %f>, time=%li\n", output.sum.x, output.sum.y,
output.sum.z, output.sum.w, time);
printf ("min = <%f, %f, %f, %f>, time=%li\n", output.min.x, output.min.y,
output.min.z, output.min.w, time);
printf ("max = <%f, %f, %f, %f>, time=%li\n", output.max.x, output.max.y,
output.max.z, output.max.w, time);
vect4 *buffer2 = aligned_alloc (16, sizeof (vect4) * N);
struct sse_specs output2 __attribute__((aligned (16)));
for (i=0; i<N; i++) {
buffer2[i][0] = i;
buffer2[i][1] = i;
buffer2[i][2] = i;
buffer2[i][3] = i;
}
time = gettime();
traditional_specs (buffer2, N, &output2);
time = gettime() - time;
printf ("sum = <%f, %f, %f, %f>, time=%li\n", output2.sum[0], output2.sum[1],
output2.sum[2], output2.sum[3], time);
printf ("min = <%f, %f, %f, %f>, time=%li\n", output2.min[0], output2.min[1],
output2.min[2], output2.min[3], time);
printf ("max = <%f, %f, %f, %f>, time=%li\n", output2.max[0], output2.max[1],
output2.max[2], output2.max[3], time);
free_context ();
free (buffer2);
return 0;