2次元FFTの実装は、各行・各列のスライス(配列)を独立して処理することになります。
もちろん前の項目で紹介した1次元FFTカーネルを各行、各列にそのまま適用することも可能です。ただこの項目では、OpenCLのカーネルのインデックス空間を2次元に拡大して、1つのカーネルで全行、全列を処理する方法を紹介させていただきます。
まずワークサイズについては以下のように2次元として設定します。
private static long[] global_work_size = new long[]{width/2,height,1};
private static long[] local_work_size = new long[]{1,1,1};一つ目の次元のワークサイズがwidth/2に設定していますが、FFTでは処理点が全データの要素の半分となるため、ワークサイズ(インデックス空間)についても半分としています。
__kernel void fft2d(
__global float2* F,
int N,
int sign)
{
int x = get_global_id(0);
int row_size = get_global_size(0)*2;
int y = get_global_id(1);
int stride = N/2;
float floor_adjust = x/stride;
int index = ceil(floor_adjust)*stride + (x%row_size);
float2 in0, in1;
in0 = F[y*row_size + index];
in1 = F[y*row_size + index+stride];
float angle = -2*M_PI_F*(index)/N;
float c,s;
float2 v;
float2 tmp0;
c = native_cos(angle);
s = sign*native_sin(angle);
v.x = c * (in1.x) - s * in1.y;
v.y = c * (in1.y) + s * in1.x;
tmp0 = in0;
in0 = tmp0 + v;
in1 = tmp0 - v;
F[y*row_size + index] = in0;
F[y*row_size + index + stride] = in1;
}1次元からの大きな変更点としては各行を一つ一つ処理するために、yとrow_sizeの2つの変数が追加されたことです。
in0 = F[y*row_size + index]; in1 = F[y*row_size + index+stride];
この記述では、Fは2次元行列でindexは行の中の要素に対応します。またy*row_sizeは、y行を指定します。
擬似フロー(図13.2「図:2次元FFTの手順」)にある通り、2D-FFTの手順には、4回のビット反転、2回の行列転置を行う必要があります。
実装例では以下のような流れで処理をしています。
bit_reverse(data_mem); //(1) fft_2d(width, data_mem,processed_mem,1,true); //(2) transpose(processed_mem, transpose_mem); //(3) bit_reverse(transpose_mem); //(4) fft_2d(height,null,transpose_mem,1,false); //(5) bit_reverse(transpose_mem); //(6) fft_2d(height,null,transpose_mem,-1,false); //(7) fft_inverse2d(height,transpose_mem); //(8) transpose(transpose_mem, processed_mem); //(9) bit_reverse(processed_mem); //(10) fft_2d(width,null,processed_mem,-1,false); //(11) fft_inverse2d(width,processed_mem); //(12)
元データ(data_mem)のビット反転。 | |
元データを変換して、処理済みデータ(processed_mem)に書き込み。 | |
処理済みデータ(processed_mem)を転置して、転置データ(transpose_mem)に書き込み。 | |
転置データ(transpose_mem)のビット反転。 | |
転置データ(transpose_mem)のFFTをして転置データ(transpose_mem)を更新。 | |
転置データ(transpose_mem)のビット反転。 | |
転置データ(transpose_mem)の逆FFTをして転置データ(transpose_mem)を更新。 | |
転置データ(transpose_mem)にスケールファクターをかけて値を調整します。 | |
転置データ(transpose_mem)を転置して、処理済みデータ(processed_mem)に書き込み。 | |
処理済みデータ(processed_mem)のビット反転。 | |
処理済みデータ(processed_mem)のFFTをしてデータを更新。 | |
処理済みデータ(processed_mem)にスケールファクターをかけて値を調整します。 |
bit_reverse関数がビット反転、transpose関数が転置、fft_2dが高速フーリエ変換(第4引数を-1にすると逆変換)、fft_inverse2dが逆フーリエ変換のスケーリングに対応します。
fft_inverse2dは以下のカーネル関数に対応します。
__kernel void fft_inverse2d(int N, __global float2* F) {
size_t x = get_global_id(0);
size_t row_size = get_global_size(0);
size_t y = get_global_id(1);
F[y*row_size + x] /= N;
}この関数は単純に全ての要素に処理するデータサイズ(N)で割り算をしています。
FFTGPU2D.java.
package com.book.jocl.fft;
import static org.jocl.CL.*;
import java.io.File;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Paths;
import java.util.Scanner;
import org.jocl.CL;
import org.jocl.Pointer;
import org.jocl.Sizeof;
import org.jocl.cl_command_queue;
import org.jocl.cl_context;
import org.jocl.cl_context_properties;
import org.jocl.cl_device_id;
import org.jocl.cl_kernel;
import org.jocl.cl_mem;
import org.jocl.cl_platform_id;
import org.jocl.cl_program;
public class FFTGPU2D {
private static final String KERNEL_PATH = "fft2d.cl";
private static final String KERNEL_INIT = "fft_init";
private static final String KERNEL_BIT_REVERSAL = "bit_reversal";
private static final String KERNEL_FFT = "fft2d";
private static final String KERNEL_FFT_INVERSE = "fft_inverse2d";
//private static final String KERNEL_FFT_TRANSPOSE = "async_transpose";
private static final String KERNEL_TRANSPOSE = "transpose";
private static cl_context context;
private static cl_command_queue queue;
private static cl_program program;
private static cl_kernel kernel_init;
private static cl_kernel kernel_bit_reversal;
private static cl_kernel kernel_fft;
private static cl_kernel kernel_fft_inverse;
//private static cl_kernel kernel_fft_transpose;
private static cl_kernel kernel_transpose;
private static final int width = 16;
private static final int height = 16;
private static final int DATA_SIZE = width*height;
private static final int LOCAL_SIZE = 4;
private static final float[] data = new float[DATA_SIZE << 1];
private static final float[] processed_data = new float[DATA_SIZE << 1];
private static long[] global_work_size = new long[]{width/2,height,1};
private static long[] local_work_size = new long[]{1,1,1};
private static long[] global_work_size_rect = new long[]{width,height,1};
private static long[] local_work_size_rect = new long[]{LOCAL_SIZE,LOCAL_SIZE,1};
private static int log2(int b) {
int result = 0;
if((b & 0xffff0000) != 0) {
b >>>= 16;
result = 16;
}
if(b >= 256) {
b >>>= 8;
result += 8;
}
if(b >= 16) {
b >>>= 4;
result += 4;
}
if(b >= 4) {
b >>>= 2;
result += 2;
}
return result + (b >>> 1);
}
public static void main(String[] args) throws Exception {
CL.setExceptionsEnabled(true);
cl_platform_id[] platform = new cl_platform_id[1];
cl_device_id[] device = new cl_device_id[1];
int[] num_devices = new int[1];
clGetPlatformIDs(1, platform, null);
clGetDeviceIDs(platform[0], CL_DEVICE_TYPE_GPU, 1, device, num_devices);
cl_context_properties props = new cl_context_properties();
props.addProperty(CL_CONTEXT_PLATFORM, platform[0]);
context = clCreateContext(props, 1, device, null, null, null);
queue = clCreateCommandQueue(context, device[0], 0, null);
StringBuffer sb = new StringBuffer();
URL resource = FFTGPU1D.class.getResource(KERNEL_PATH) ;
String path = Paths.get(resource.toURI()).toFile().getAbsolutePath();
Scanner sc = new Scanner(new File(path));
while(sc.hasNext()) {
sb.append(sc.nextLine() + "\n");
}
sc.close();
program = clCreateProgramWithSource(context, 1, new String[] {sb.toString()}, null, null);
StringBuffer op = new StringBuffer();
op.append("-Werror -Dsize=");
op.append(LOCAL_SIZE);
String option = op.toString();
clBuildProgram(program, 0, null, option, null, null);
cl_mem data_mem;
cl_mem processed_mem;
cl_mem transpose_mem;
data_mem = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
Sizeof.cl_float2 * DATA_SIZE, Pointer.to(data), null);
processed_mem = clCreateBuffer(context, CL_MEM_USE_HOST_PTR,
Sizeof.cl_float2 * DATA_SIZE, Pointer.to(processed_data), null);
transpose_mem = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR,
Sizeof.cl_float2 * DATA_SIZE, null, null);
kernel_init = clCreateKernel(program, KERNEL_INIT, null);
kernel_bit_reversal = clCreateKernel(program, KERNEL_BIT_REVERSAL, null);
kernel_fft = clCreateKernel(program, KERNEL_FFT, null);
kernel_fft_inverse = clCreateKernel(program, KERNEL_FFT_INVERSE, null);
kernel_transpose = clCreateKernel(program, KERNEL_TRANSPOSE, null);
generateSample();
bit_reverse(data_mem);
fft_2d(width, data_mem,processed_mem,1,true);
transpose(processed_mem, transpose_mem);
bit_reverse(transpose_mem);
fft_2d(height,null,transpose_mem,1,false);
bit_reverse(transpose_mem);
fft_2d(height,null,transpose_mem,-1,false);
fft_inverse2d(height,transpose_mem);
transpose(transpose_mem, processed_mem);
bit_reverse(processed_mem);
fft_2d(width,null,processed_mem,-1,false);
fft_inverse2d(width,processed_mem);
System.out.println("COMPLETE");
ByteBuffer outBuffer = clEnqueueMapBuffer(queue,
processed_mem,
CL_TRUE,
CL_MAP_WRITE,
0,
Sizeof.cl_float2*width*height,
0,
null,
null,
null);
clEnqueueUnmapMemObject(queue, processed_mem, outBuffer, 0, null, null);
clFinish(queue);
outBuffer.order(ByteOrder.LITTLE_ENDIAN);
for(int i = 0; i < width*height*2; i+=2) {
System.out.println(outBuffer.getFloat());
outBuffer.getFloat();
}
clReleaseDevice(device[0]);
clReleaseContext(context);
clReleaseCommandQueue(queue);
clReleaseKernel(kernel_fft);
clReleaseKernel(kernel_init);
clReleaseProgram(program);
}
private static void generateSample() {
for(int i = 0; i < width*height*2; i+=2) {
data[i] = i;
data[i+1] = 0.0f;
}
}
private static void transpose(cl_mem input_mem, cl_mem output_mem) {
int[] widthPtr = new int[1];
widthPtr[0]=width;
int[] heightPtr = new int[1];
heightPtr[0]=height;
clSetKernelArg(kernel_transpose, 0, Sizeof.cl_mem, Pointer.to(input_mem));
clSetKernelArg(kernel_transpose, 1, Sizeof.cl_mem, Pointer.to(output_mem));
clSetKernelArg(kernel_transpose, 2, Sizeof.cl_int, Pointer.to(widthPtr));
clSetKernelArg(kernel_transpose, 3, Sizeof.cl_int, Pointer.to(heightPtr));
clEnqueueNDRangeKernel(queue,
kernel_transpose, 2, null,
global_work_size_rect,
local_work_size_rect,
0, null, null);
}
private static void fft_inverse2d(int N, cl_mem inverse_mem) {
int[] Ni = new int[1];
Ni[0] = N;
clSetKernelArg(kernel_fft_inverse, 0, Sizeof.cl_int, Pointer.to(Ni));
clSetKernelArg(kernel_fft_inverse, 1, Sizeof.cl_mem, Pointer.to(inverse_mem));
clEnqueueNDRangeKernel(queue,
kernel_fft_inverse, 2, null,
global_work_size_rect,
local_work_size,
0, null, null);
}
private static void bit_reverse(cl_mem input_mem) {
int N = DATA_SIZE;
int[] Ni = new int[1];
Ni[0] = N;
clSetKernelArg(kernel_bit_reversal, 0, Sizeof.cl_mem, Pointer.to(input_mem));
clSetKernelArg(kernel_bit_reversal, 1, Sizeof.cl_uint, Pointer.to(Ni));
clEnqueueNDRangeKernel(queue,
kernel_bit_reversal, 2, null,
global_work_size_rect,
local_work_size,
0, null, null);
}
private static void fft_2d(int N, cl_mem input_mem, cl_mem output_mem, int sign, boolean is_init){
int fftSize = 1;
int ns = log2(N);
int[] fftSizePtr = new int[1];
int[] signPtr = new int[1];
signPtr[0] = sign;
for(int i = 0; i < ns; i++) {
fftSize <<= 1;
fftSizePtr[0] = fftSize;
if(is_init == false || fftSize !=2) {
clSetKernelArg(kernel_fft, 0, Sizeof.cl_mem, Pointer.to(output_mem));
clSetKernelArg(kernel_fft, 1, Sizeof.cl_uint, Pointer.to(fftSizePtr));
clSetKernelArg(kernel_fft, 2, Sizeof.cl_int, Pointer.to(signPtr));
clEnqueueNDRangeKernel(queue,
kernel_fft, 2, null,
global_work_size,
local_work_size,
0, null, null);
} else {
clSetKernelArg(kernel_init, 0, Sizeof.cl_mem, Pointer.to(input_mem));
clSetKernelArg(kernel_init, 1, Sizeof.cl_mem, Pointer.to(output_mem));
clSetKernelArg(kernel_init, 2, Sizeof.cl_uint, Pointer.to(fftSizePtr));
clEnqueueNDRangeKernel(queue,
kernel_init, 2, null,
global_work_size,
local_work_size,
0, null, null);
}
}
}
}
fft2d.cl.
inline int reverseBit(int x, int stage) {
int b = 0;
int bits = stage;
while (bits != 0){
b <<=1;
b |=( x &1 );
x >>=1;
bits>>=1;
}
return b;
}
__kernel void bit_reversal(__global float2* data, uint N) {
size_t x = get_global_id(0);
size_t row_size = get_global_size(0);
size_t y = get_global_id(1);
uint rev = reverseBit(x, row_size-1);
float2 in1;
float2 in2;
if(x < rev) {
in1 = data[y*row_size + x];
in2 = data[y*row_size + rev];
printf("x=%d, y=%d, pair: %d - %d, rs = %d, di=%d\n", x, y, x, rev, row_size,y*row_size + x);
data[y*row_size + rev] = in1;
data[y*row_size + x] = in2;
}
}
__kernel void fft_init(
__global float2* data,
__global float2* F,
int N)
{
int x = get_global_id(0);
int row_size = get_global_size(0)*2;
int y = get_global_id(1);
int stride = N/2;
float floor_adjust = x/stride;
int index = ceil(floor_adjust)*stride + (x%row_size);
float2 in0, in1;
in0 = data[y*row_size + index];
in1 = data[y*row_size + index+stride];
float2 v0;
v0 = in0;
in0 = v0 + in1;
in1 = v0 - in1;
F[y*row_size + index] = in0;
F[y*row_size + index + stride] = in1;
//printf("x=%d, y=%d, rs=%d, di=%d, pair: %d - %d, N = %d, s = %d, in0:in1 = %f:%f\n", x,y,row_size,y*row_size + index, index, index+stride, N, stride, F[y*row_size + index].x, F[y*row_size + index+stride].x);
}
__kernel void fft2d(
__global float2* F,
int N,
int sign)
{
int x = get_global_id(0);
int row_size = get_global_size(0)*2;
int y = get_global_id(1);
int stride = N/2;
float floor_adjust = x/stride;
int index = ceil(floor_adjust)*stride + (x%row_size);
float2 in0, in1;
in0 = F[y*row_size + index];
in1 = F[y*row_size + index+stride];
float angle = -2*M_PI_F*(index)/N;
float c,s;
float2 v;
float2 tmp0;
c = native_cos(angle);
s = sign*native_sin(angle);
v.x = c * (in1.x) - s * in1.y;
v.y = c * (in1.y) + s * in1.x;
tmp0 = in0;
in0 = tmp0 + v;
in1 = tmp0 - v;
F[y*row_size + index] = in0;
F[y*row_size + index + stride] = in1;
//printf("x=%d, y=%d, pair: %d - %d, N = %d, s = %d, sign = %d c:s = %f:%f\n in0:in1 = %f:%f\n", x,y, y*row_size + index, y*row_size + index+stride, N, stride, sign, c, s, F[y*row_size + index].x, F[y*row_size + index+stride].x);
}
__kernel void fft_inverse2d(int N, __global float2* F) {
size_t x = get_global_id(0);
size_t row_size = get_global_size(0);
size_t y = get_global_id(1);
F[y*row_size + x] /= N;
//printf("di=%d, F[]=%f\n",y*row_size + x,F[y*row_size + x].x);
}
__kernel void transpose(
__global float2* input,
__global float2* output,
size_t width,
size_t height)
{
__local float2 tile[size * (size+1)];
size_t x = get_global_id(0);
size_t y = get_global_id(1);
size_t lx = get_local_id(0);
size_t ly = get_local_id(1);
size_t gx = get_group_id(0);
size_t gy = get_group_id(1);
size_t index_input = y * width + x;
size_t index_tile = ly * (size+1) + lx;
tile[index_tile]= input[index_input];
barrier(CLK_LOCAL_MEM_FENCE);
size_t ox = gy * size + lx;
size_t oy = gx * size + ly;
size_t index_output = oy * height + ox;
index_tile = lx * (size+1) + ly;
output[index_output] = tile[index_tile];
//printf("output[%d %d %d %d]=%f, index_tile=%d, index_out=%d\n",x,y,gx,gy,output[index_output].x, index_tile, index_output);
}
Copyright 2018-2019, by Masaki Komatsu