11.6. バンクコンフリクトを緩和する実装例

警告

この項目で紹介したソースコードは、バンクコンフリクトの最悪のシナリオのガードにはなりますが、パフォーマンス的には必ずしも早くなることを意味しません。また4KBでアラインしたデータサイズが崩れるため、メモリスピルが発生する可能性もありますので、注意しながら確認してみてください。

前の項目で紹介した実装例に対してバンクコンフリクトの緩和ができるように、調整したソースコードです。

バンクが衝突しないようにする対策で一番シンプルなものは、ホストAPIのメモリ空間割り当て時のサイズを一要素増やすことです。

clSetKernelArg(kernel, 1, Sizeof.cl_uint * (LOCAL_SIZE+1) * BIN_SIZE, null);

これに対応してカーネル関数にも以下のような変更を加えます。

for(int i = 0; i < BIN_SIZE/4; ++i)
    input[(local_size+1) * i + lid] = 0;
//
// Omit the intermediate steps
//
shared_local_memory[value.s0 * (local_size+1) + stride + bank_number]++

これによりバンクのアクセスが毎回一個分だけづれることになり、衝突を幾分か緩和する可能性があります。

HistogramBankStride4kbslmAdjusted.java. 

package com.book.jocl.histogram;

import static org.jocl.CL.*;

import java.io.File;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Paths;
import java.util.Random;
import java.util.Scanner;

import org.jocl.CL;
import org.jocl.Pointer;
import org.jocl.Sizeof;
import org.jocl.cl_command_queue;
import org.jocl.cl_context;
import org.jocl.cl_context_properties;
import org.jocl.cl_device_id;
import org.jocl.cl_kernel;
import org.jocl.cl_mem;
import org.jocl.cl_platform_id;
import org.jocl.cl_program;

public class HistogramBankStride4kbslmAdjusted {

        private static final String KERNEL_PATH = "histogram.cl";
        private static final String KERNEL_FUNC = "histogram";
        private static final int BIN_SIZE = 256;
        private static final int LOCAL_SIZE = 4;
        private static final int WG_SIZE = 16;
        private static final int DATA_SIZE = BIN_SIZE * LOCAL_SIZE * WG_SIZE;
        private static final int[] data = new int[DATA_SIZE];

    private static cl_context context;
    private static cl_command_queue queue;
    private static cl_program program;
    private static cl_kernel kernel;

        public static void main(String[] args) throws Exception {

                CL.setExceptionsEnabled(true);

                cl_platform_id[] platform = new cl_platform_id[1];
                cl_device_id[] device = new cl_device_id[1];
                int[] num_devices = new int[1];

                clGetPlatformIDs(1, platform, null);
                clGetDeviceIDs(platform[0], CL_DEVICE_TYPE_GPU, 1, device, num_devices);

                cl_context_properties props = new cl_context_properties();
                props.addProperty(CL_CONTEXT_PLATFORM, platform[0]);
                context = clCreateContext(props, 1, device, null, null, null);

                queue = clCreateCommandQueue(context, device[0], 0, null);

                StringBuffer sb  = new StringBuffer();
                URL resource = HistogramBankStride4kbslm.class.getResource(KERNEL_PATH) ;
                String path = Paths.get(resource.toURI()).toFile().getAbsolutePath();
                Scanner sc = new Scanner(new File(path));
                while(sc.hasNext()) {
                        sb.append(sc.nextLine() + "\n");
                }
                sc.close();
                program = clCreateProgramWithSource(context, 1, new String[] {sb.toString()}, null, null);
                String option = "-Werror -DBIN_SIZE=256";
                clBuildProgram(program, 0, null, option, null, null);

                cl_mem data_mem = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
                                Sizeof.cl_uint4 * DATA_SIZE, Pointer.to(data), null);

                cl_mem subHistogram = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR,
                                Sizeof.cl_uint*BIN_SIZE*WG_SIZE, null, null);

                kernel = clCreateKernel(program, KERNEL_FUNC, null);

                generateSample();

                clSetKernelArg(kernel, 0, Sizeof.cl_mem, Pointer.to(data_mem));
                clSetKernelArg(kernel, 1, Sizeof.cl_uint * (LOCAL_SIZE+1) * BIN_SIZE, null);
                clSetKernelArg(kernel, 2, Sizeof.cl_mem, Pointer.to(subHistogram));

                long[] global_work_size = new long[]{WG_SIZE * LOCAL_SIZE,1,1};
                long[] local_work_size = new long[]{LOCAL_SIZE,1,1};
                clEnqueueNDRangeKernel(queue,
                                kernel, 1,
                                null,
                                global_work_size,
                                local_work_size,
                                0, null, null);

                ByteBuffer output = clEnqueueMapBuffer(queue,
                                subHistogram,
                                CL_TRUE,
                                CL_MAP_WRITE,
                                0,
                                Sizeof.cl_uint*BIN_SIZE*WG_SIZE,
                                0,
                                null,
                                null,
                                null);

                clEnqueueUnmapMemObject(queue, subHistogram, output, 0, null, null);
                clFinish(queue);

                int[] outputHistogram = new int[BIN_SIZE*WG_SIZE];
                int[] result = new int[BIN_SIZE];
                output.order(ByteOrder.LITTLE_ENDIAN);
                int i,j;
                for(i = 0; i < WG_SIZE; ++i) {
                        for(j = 0; j < BIN_SIZE; ++j) {
                                outputHistogram[j*WG_SIZE+i] = output.getInt();
                        }
                }
                for(i = 0; i < WG_SIZE; ++i) {
                        for(j = 0; j < BIN_SIZE; ++j) {
                                result[j] += outputHistogram[j*WG_SIZE+i];
                        }
                }
                for(i = 0; i < BIN_SIZE; ++i) {
                        System.out.printf("TOTAL[%d]: %d\n", i, result[i]);
                }

                clReleaseDevice(device[0]);
                clReleaseContext(context);
                clReleaseCommandQueue(queue);
                clReleaseKernel(kernel);
                clReleaseProgram(program);
        }

        private static void generateSample() {
                //Random rnd = new Random();
                for(int i = 0; i < DATA_SIZE; i++) {
                        //data[i] = (byte) (rnd.nextInt(256));
                        //data[i] = (i%32);
                        data[i] = 254;
                }

        }

}

histogram.cl. 

#define BIN_SIZE 256

//4KB = 4 threads
//16 WG

__kernel void histogram(
                __global uint4* data,
        __local uint* shared_local_memory,
        __global uint* buckets) {

        size_t lid = get_local_id(0);

        size_t group_id = get_group_id(0);
        size_t local_size = get_local_size(0);

        int offset = lid % 2; // 0,1
        int stride = 2 * offset; // 0,2
        int bank_number = lid >> 1; // 0,1

        __local uchar4* input = (__local uchar4*) shared_local_memory;

        for(int i = 0; i < BIN_SIZE/4; ++i)
                input[(local_size+1) * i + lid] = 0;
        barrier(CLK_LOCAL_MEM_FENCE);

        for(int i = 0; i < BIN_SIZE /4; ++i) {

             uint4 value = data[
                        group_id * local_size * (BIN_SIZE / 4) + i *
                        local_size + lid];
             // (0...15) * 4 * 256 / 4 + (0..63) * 4 + lid

             shared_local_memory[value.s0 * (local_size+1) + stride + bank_number]++; // value.s0, s1, s2, s3 is random, may update same position
             shared_local_memory[value.s1 * (local_size+1) + stride + bank_number]++;
             shared_local_memory[value.s2 * (local_size+1) + stride + bank_number]++;
             shared_local_memory[value.s3 * (local_size+1) + stride + bank_number]++;

        }
        barrier(CLK_LOCAL_MEM_FENCE);

        if(lid == 0) { // each work-group only has 1 work-item executing this code block
             for(size_t i = 0; i < BIN_SIZE; i++) {
               int result = 0;
               for(size_t j = 0; j < local_size; j++) {
                 result += shared_local_memory[i * (local_size+1) + j];
                 // [0..255]*4 + 0...3 => min 0, max 1023
               }
               buckets[group_id * BIN_SIZE  + i] = result; // 0..15 * 256 + 0...255
             }
        }
}

Copyright 2018-2019, by Masaki Komatsu