Stride型のアルゴリズムは前の2つの項目で一通り解説してきました。この項目では実装をしてみたいと思います。
まずヒストグラム集計のパラメータを以下のように設定します。
集計をする元データは、4*16*256=16384とします。
元データ、共有ローカルメモリ、出力データはint型とします。共有ローカルメモリ(shared_local_memory)は256x4の配列となるため、各ワークグループに割り当てられるメモリの容量は、4KBとなります。16個のワークグループで使うメモリ容量は64KBとなり、検証機となるIntelのHD Graphics 4000で使う容量最大限を使う計算となります。
HistogramBankStride4kbslm.py.
import pyopencl as cl import numpy as np from numpy.random import * # Set the seed to 100 np.random.seed(100) INT_BYTES = 4 BIN_SIZE = 256 LOCAL_SIZE = 4 WG_SIZE = 16 DATA_SIZE = BIN_SIZE * LOCAL_SIZE * WG_SIZE data = randint(0, 1, DATA_SIZE).astype(np.uint32) out = np.zeros(BIN_SIZE*LOCAL_SIZE).astype(np.uint32) out.reshape((LOCAL_SIZE, BIN_SIZE)) devices = [cl.get_platforms()[0].get_devices(cl.device_type.GPU)[0]] ctx = cl.Context(devices) queue = cl.CommandQueue(ctx) mf = cl.mem_flags data_mem = cl.Buffer(ctx, mf.USE_HOST_PTR, hostbuf=data) out_mem = cl.Buffer(ctx, mf.WRITE_ONLY, size=out.nbytes) program = cl.Program(ctx, """ #define BIN_SIZE 256 //4KB = 4 threads //16 WG __kernel void histogram( __global uint4* data, __local uint* shared_local_memory, __global uint* buckets) { size_t lid = get_local_id(0); size_t group_id = get_group_id(0); size_t local_size = get_local_size(0); int offset = lid % 2; // 0,1 int stride = 2 * offset; // 0,2 int bank_number = lid >> 1; // 0,1 __local uchar4* input = (__local uchar4*) shared_local_memory; for(int i = 0; i < BIN_SIZE/4; ++i) input[local_size * i + lid] = 0; barrier(CLK_LOCAL_MEM_FENCE); for(int i = 0; i < BIN_SIZE /4; ++i) { uint4 value = data[ group_id * local_size * (BIN_SIZE / 4) + i * local_size + lid]; // (0...15) * 4 * 256 / 4 + (0..63) * 4 + lid shared_local_memory[value.s0 * local_size + stride + bank_number]++; // value.s0, s1, s2, s3 is random, may update same position shared_local_memory[value.s1 * local_size + stride + bank_number]++; shared_local_memory[value.s2 * local_size + stride + bank_number]++; shared_local_memory[value.s3 * local_size + stride + bank_number]++; } barrier(CLK_LOCAL_MEM_FENCE); if(lid == 0) { // each work-group only has 1 work-item executing this code block for(size_t i = 0; i < BIN_SIZE; i++) { int result = 0; for(size_t j = 0; j < local_size; j++) { result += shared_local_memory[i * local_size + j]; // [0..255]*4 + 0...3 => min 0, max 1023 } buckets[group_id * BIN_SIZE + i] = result; // 0..15 * 256 + 0...255 } } } """).build() kernel = cl.Kernel(program, name="histogram") kernel.set_arg(0, data_mem) # size to be specified in bytes (ie., 32 bits -> 4 bytes) kernel.set_arg(1, cl.LocalMemory(INT_BYTES*BIN_SIZE*LOCAL_SIZE)) kernel.set_arg(2, out_mem) cl.enqueue_nd_range_kernel( queue=queue, kernel=kernel, global_work_size=(WG_SIZE * LOCAL_SIZE, 1, 1), local_work_size=(LOCAL_SIZE, 1, 1)) out = cl.enqueue_map_buffer( queue=queue, buf=out_mem, flags=mf.WRITE_ONLY, offset=0, shape=(LOCAL_SIZE, BIN_SIZE,), dtype=np.uint32 ) print(out[0][0][0:256]) print(out[0])
Copyright 2018-2019, by Masaki Komatsu