// Copyright (c) Piotr Morgwai Kotarbinski, Licensed under the Apache License, Version 2.0
package pl.morgwai.sample;

import com.aparapi.Kernel;
import com.aparapi.Range;

/**
 * Performs pointer-jumping on an array using a GPU with Aparapi. By default sums values from the
 * array, but subclasses may override {@link #accumulateValue(int, int)} to do something else.
 * <p>Usage:</p>
 * <pre>
 * double[] myArray = getDoubleArray();
 * double sum = PointerJumpingKernel.calculateSum(myArray);</pre>
 * <p>
 * On my integrated Intel GPU ({@code maxWorkGroupSize=256, maxComputeUnits=48}), this runs about
 * 4 times slower than sequential addition on the CPU.<br/>
 * This is probably because the algorithm is memory-bound and spends most of its time fetching
 * values from memory. See
 * <a href='https://stackoverflow.com/questions/22866901/using-java-with-nvidia-gpus-cuda#22868938'>
 * this SO answer</a> for more info.<br/>
 * Another reason may be that <a
 * href='https://software.intel.com/content/www/us/en/develop/documentation/iocl-opg/top/optimizing-opencl-usage-with-intel-processor-graphics/work-group-size-recommendations-summary.html'>
 * Intel has only 16 barrier registers (and only 64kB of local memory <i>shared</i> among running
 * work-groups)</a>, so only up to 16 work-groups can run in parallel.</p>
 */
public class PointerJumpingKernel extends Kernel implements AutoCloseable {

	public static double calculateSum(double[] input) {
		try (var kernel = new PointerJumpingKernel()) {
			return kernel.accumulateArray(input);
		}
	}

	protected PointerJumpingKernel() {}

	double[] input;  // input values to accumulate
	double[] results;  // results from all work-groups
	/**
	 * Group's local copy of its slice of the {@code input}.
	 */
	@Local protected double[] localSlice;

	@Local int[] next;  // next[i] is a pointer to the next value in localSlice not yet accumulated
		// by the processing element with localId == i; initially next[i] == i+1 for all i
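	// Example trace for a group of size 4 (summing): initially next == {1, 2, 3, 4};
	// step 1: elements 0 and 2 accumulate their successors, yielding next == {2, 2, 4, 4};
	// step 2: only element 0 is still active and jumps to index 2, yielding next == {4, 2, 4, 4};
	// now next[0] == groupSize, so the main loop in run() ends with the total in localSlice[0]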
	/**
	 * Triggers GPU execution of {@link #run() pointer-jumping} on distinct slices of {@code input}
	 * in separate work-groups, then recursively accumulates the results from all work-groups.
	 * The recursion stops when everything has been accumulated into a single value.
	 * @return the value accumulated from the whole input.
	 */
	protected double accumulateArray(double[] input) {
		this.input = input;
		int groupSize = Math.min(input.length, getTargetDevice().getMaxWorkGroupSize());
		int numberOfGroups = input.length / groupSize;
		if (groupSize * numberOfGroups < input.length) numberOfGroups++;  // pad the last group
		int paddedSize = groupSize * numberOfGroups;
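		// e.g. for input.length == 1000 and groupSize == 256: numberOfGroups == 4 and
		// paddedSize == 1024, so the last group contains 24 padding elements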
		results = new double[numberOfGroups];
		localSlice = new double[groupSize];
		next = new int[groupSize];
		execute(Range.create(paddedSize, groupSize));
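		// at this point each work-group has written its partial result to results[groupId]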
		if (numberOfGroups == 1) return results[0];
		return accumulateArray(results);
	}
	/**
	 * Pointer-jumping procedure executed by a single processing element. Each work-group first
	 * copies its slice of the {@code input} into the {@link #localSlice} array in the group's
	 * local memory. Next, the main pointer-jumping loop is performed on the {@link #localSlice}.
	 * Finally, the first processing element writes the group's accumulated result to the
	 * {@link #results} array.
	 */
	@Override
	public final void run() {
		int i = getLocalId();
		int globalIndex = getGlobalId();
		int groupSize = getLocalSize();
		int activityIndicator = i;  // divided by 2 at each step of the main loop until odd;
			// when odd, the given processing element stays idle (just checks in at the barrier)

		// copy the group's slice into local memory and initialize the next pointers
		if (globalIndex < input.length - 1) {
			next[i] = i + 1;
			localSlice[i] = input[globalIndex];
		} else {
			next[i] = getGlobalSize();  // padding in the last group: point beyond the array
			if (globalIndex == input.length - 1) localSlice[i] = input[globalIndex];
		}
		localBarrier();

		// main pointer-jumping loop: run until the whole group is accumulated at index 0
		while (next[0] < groupSize) {
			if ((activityIndicator & 1) == 0 && next[i] < groupSize) {
				accumulateValue(next[i], i);
				next[i] = next[next[i]];
				activityIndicator >>= 1;
			}
			localBarrier();
		}
		if (i == 0) results[getGroupId()] = localSlice[0];
	}
	/**
	 * Accumulates the value at {@code fromIndex} in {@link #localSlice} into {@code intoIndex}.
	 * Subclasses may override this method to do something other than summing.
	 * Subclasses should then provide a static method that creates a kernel and calls
	 * {@link #accumulateArray(double[])} similarly to {@link #calculateSum(double[])}.
	 */
	protected void accumulateValue(int fromIndex, int intoIndex) {
		localSlice[intoIndex] += localSlice[fromIndex];
	}
	@Override
	public final void close() {
		dispose();
	}
}
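/**
 * A minimal sketch of the kind of subclass suggested by the {@code accumulateValue(int, int)}
 * Javadoc above: a hypothetical {@code MaxKernel} (not part of the original gist) that
 * accumulates maxima instead of sums, assuming Aparapi can translate the {@code Math.max}
 * call used here.
 */
class MaxKernel extends PointerJumpingKernel {

	public static double calculateMax(double[] input) {
		try (var kernel = new MaxKernel()) {
			return kernel.accumulateArray(input);
		}
	}

	@Override
	protected void accumulateValue(int fromIndex, int intoIndex) {
		// keep the larger of the two values at intoIndex instead of their sum
		localSlice[intoIndex] = Math.max(localSlice[intoIndex], localSlice[fromIndex]);
	}
}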