Изучаю OpenCL, для теста производительности использовал два варианта кода:
С оптимизацией:
import pyopencl as cl
import numpy
import sys
class CL(object):
    """Element-wise addition of two float vectors on an OpenCL device.

    Intended call order: ``load_program()``, ``popCorn()``, ``execute()``.
    """

    def __init__(self, size=10):
        """Create the OpenCL context and its command queue.

        :param size: number of elements in each input vector.
        """
        self.size = size
        self.ctx = cl.create_some_context()
        self.queue = cl.CommandQueue(self.ctx)

    def load_program(self):
        """Build the ``part1`` kernel, which stores ``a[i] + b[i]`` in ``c[i]``."""
        fstr = """
__kernel void part1(__global float* a, __global float* b, __global float* c)
{
unsigned int i = get_global_id(0);
c[i] = a[i] + b[i];
}
"""
        self.program = cl.Program(self.ctx, fstr).build()

    def popCorn(self):
        """Create the host input arrays and the matching device buffers.

        Both inputs hold the values ``0 .. size - 1``.
        """
        mf = cl.mem_flags
        # The kernel declares its arguments as 32-bit ``float``; the host
        # arrays must use the same element type, otherwise the device
        # reinterprets the raw bytes and produces garbage.
        self.a = numpy.arange(self.size, dtype=numpy.float32)
        self.b = numpy.arange(self.size, dtype=numpy.float32)
        self.a_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=self.a)
        self.b_buf = cl.Buffer(self.ctx, mf.READ_ONLY | mf.COPY_HOST_PTR,
                               hostbuf=self.b)
        # Output buffer has the same byte size as one input vector.
        self.dest_buf = cl.Buffer(self.ctx, mf.WRITE_ONLY, self.b.nbytes)

    def execute(self):
        """Run the kernel, copy the result back to the host and print it."""
        # Global work size is the shape of ``a`` (one work-item per element);
        # the local work size is left to the implementation.
        self.program.part1(self.queue, self.a.shape, None,
                           self.a_buf, self.b_buf, self.dest_buf)
        c = numpy.empty_like(self.a)
        # enqueue_copy replaces the deprecated enqueue_read_buffer.
        cl.enqueue_copy(self.queue, c, self.dest_buf).wait()
        # Single-argument print calls give the same output on Python 2 and 3.
        print("a %s" % (self.a,))
        print("b %s" % (self.b,))
        print("c %s" % (c,))
if __name__ == '__main__':
    # Script entry point: build the kernel, prepare the buffers and run
    # the addition for 10,000,000 elements.
    vector_add = CL(10000000)
    vector_add.load_program()
    vector_add.popCorn()
    vector_add.execute()
def add(size=10):
    """Add two float sequences element-wise in pure Python and print the result.

    Both operands hold the values ``0.0 .. size - 1``. Only the first
    1000 elements of the sum are printed.

    :param size: number of elements in each operand.
    """
    a = [float(i) for i in range(size)]
    b = [float(j) for j in range(size)]
    c = [x + y for x, y in zip(a, b)]
    # Single-argument print call: identical output on Python 2 and 3.
    print("c %s" % (c[:1000],))
# Run the pure-Python element-wise addition for 1,000,000 elements.
add(1000000)
С оптимизацией:
real 10.76
user 9.21
sys 1.47
Без оптимизации:
real 2.30
user 2.11
sys 0.18