この項目で紹介したソースコードは、バンクコンフリクトの最悪のシナリオのガードにはなりますが、パフォーマンス的には必ずしも速くなることを意味しません。また4KBでアラインしたデータサイズが崩れるため、メモリスピルが発生する可能性もありますので、注意しながら確認してみてください。
前の項目で紹介した実装例に対してバンクコンフリクトの緩和ができるように、調整したソースコードです。
HistogramBankStride4kbslmAdjusted.py.
"""Per-work-group histogram with bank-conflict mitigation via SLM padding.

Each work-group keeps LOCAL_SIZE private copies of the 256 histogram bins in
local memory; each bin row is padded by one extra uint (the ``local_size+1``
stride in the kernel) so that neighbouring work-items land in different SLM
banks. Work-item 0 of each group then reduces the copies into one histogram
per group.

Host-side fixes relative to the original listing:
  * ``out.reshape(...)`` had its result discarded (reshape is not in-place).
  * The output buffer was sized LOCAL_SIZE*BIN_SIZE uints, but the kernel
    writes ``buckets[group_id * BIN_SIZE + i]`` for all
    WG_SIZE * LOCAL_SIZE / LOCAL_SIZE = 16 groups, i.e. WG_SIZE*BIN_SIZE
    uints — the smaller buffer caused out-of-bounds device writes.
  * ``enqueue_map_buffer`` expects a ``map_flags`` value; the original passed
    ``mem_flags.WRITE_ONLY``, which only worked because its numeric value
    happens to equal ``map_flags.WRITE``. We map for READ to inspect results.
"""
import numpy as np
import pyopencl as cl
import pyopencl.tools

# Fix the RNG so every run produces the same input data.
np.random.seed(100)

BIN_SIZE = 256    # histogram buckets per work-group
LOCAL_SIZE = 4    # work-items per work-group
WG_SIZE = 16      # number of work-groups
DATA_SIZE = BIN_SIZE * LOCAL_SIZE * WG_SIZE

# Input values are restricted to 0..7, so only the first 8 buckets get hits.
data = np.random.randint(0, 8, DATA_SIZE).astype(np.uint32)

# One BIN_SIZE-bin histogram per work-group (16 groups x 256 bins).
out = np.zeros((WG_SIZE, BIN_SIZE), dtype=np.uint32)

devices = [cl.get_platforms()[0].get_devices(cl.device_type.GPU)[0]]
ctx = cl.Context(devices)
queue = cl.CommandQueue(ctx)

mf = cl.mem_flags
data_mem = cl.Buffer(ctx, mf.USE_HOST_PTR, hostbuf=data)
out_mem = cl.Buffer(ctx, mf.WRITE_ONLY, out.nbytes)

# NOTE(review): the kernel's zero-init loop only clears BIN_SIZE/4 uchar4
# slots per work-item, which does not cover the full padded SLM region of
# BIN_SIZE*(local_size+1) uints — bins above index ~63 may start from
# garbage. Kept as in the book's listing; verify before production use.
program = cl.Program(ctx, """
#define BIN_SIZE 256
//4KB = 4 threads
//16 WG
__kernel
void histogram(
    __global uint4* data,
    __local uint* shared_local_memory,
    __global uint* buckets)
{
    size_t lid = get_local_id(0);
    size_t group_id = get_group_id(0);
    size_t local_size = get_local_size(0);
    int offset = lid % 2; // 0,1
    int stride = 2 * offset; // 0,2
    int bank_number = lid >> 1; // 0,1
    __local uchar4* input = (__local uchar4*) shared_local_memory;
    for(int i = 0; i < BIN_SIZE/4; ++i)
        input[(local_size+1) * i + lid] = 0;
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int i = 0; i < BIN_SIZE/4; ++i)
    {
        uint4 value = data[group_id * local_size * (BIN_SIZE / 4) + i * local_size + lid];
        // (0...15) * 4 * 256 / 4 + (0..63) * 4 + lid
        shared_local_memory[value.s0 * (local_size+1) + stride + bank_number]++;
        // value.s0, s1, s2, s3 is random, may update same position
        shared_local_memory[value.s1 * (local_size+1) + stride + bank_number]++;
        shared_local_memory[value.s2 * (local_size+1) + stride + bank_number]++;
        shared_local_memory[value.s3 * (local_size+1) + stride + bank_number]++;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    if(lid == 0)
    {
        // each work-group only has 1 work-item executing this code block
        for(size_t i = 0; i < BIN_SIZE; i++)
        {
            int result = 0;
            for(size_t j = 0; j < local_size; j++)
            {
                result += shared_local_memory[i * (local_size+1) + j];
                // [0..255]*4 + 0...3 => min 0, max 1023
            }
            buckets[group_id * BIN_SIZE + i] = result;
            // 0..15 * 256 + 0...255
        }
    }
}
""").build()

kernel = cl.Kernel(program, name="histogram")
kernel.set_arg(0, data_mem)
# 4 bytes/uint * BIN_SIZE bins * (LOCAL_SIZE + 1): the "+1" padding column is
# what shifts each work-item's private bins into different SLM banks.
kernel.set_arg(1, cl.LocalMemory(4 * BIN_SIZE * (LOCAL_SIZE + 1)))
kernel.set_arg(2, out_mem)

cl.enqueue_nd_range_kernel(
    queue=queue,
    kernel=kernel,
    global_work_size=(WG_SIZE * LOCAL_SIZE,),
    local_work_size=(LOCAL_SIZE,))

# Map the device buffer for reading (blocking by default), covering all
# WG_SIZE per-group histograms.
mapped, _event = cl.enqueue_map_buffer(
    queue=queue,
    buf=out_mem,
    flags=cl.map_flags.READ,
    offset=0,
    shape=(WG_SIZE, BIN_SIZE),
    dtype=np.uint32)

print(mapped[0][0:256])   # histogram of work-group 0
print(mapped)             # all per-group histograms
Copyright 2018-2019, by Masaki Komatsu