// Copyright (c) Piotr Morgwai Kotarbinski, Licensed under the Apache License, Version 2.0
package pl.morgwai.sample;

import com.aparapi.Kernel;
import com.aparapi.Range;

/**
 * Performs pointer-jumping on an array using a GPU with Aparapi. By default sums values from the
 * array, but subclasses may override {@link #accumulateValue(int, int)} to do something else.
 * <p>Usage:</p>
 * <pre>
 * double[] myArray = getDoubleArray();
 * double sum = PointerJumpingKernel.calculateSum(myArray);</pre>
 * <p>
 * On my integrated Intel GPU ({@code maxWorkGroupSize=256, maxComputeUnits=48}), this runs about
 * 4 times slower than sequential addition on the CPU.<br/>
 * This is probably because the algorithm is memory-bound and spends most of its time fetching
 * values from memory. See
 * <a href='https://stackoverflow.com/questions/22866901/using-java-with-nvidia-gpus-cuda#22868938'>
 * this SO answer</a> for more info.<br/>
 * Another reason may be that <a
 * href='https://software.intel.com/content/www/us/en/develop/documentation/iocl-opg/top/optimizing-opencl-usage-with-intel-processor-graphics/work-group-size-recommendations-summary.html'>
 * Intel has only 16 barrier registers (and only 64kB of local memory <i>shared</i> among running
 * work-groups)</a>, so only up to 16 work-groups can run in parallel.</p>
 */
public class PointerJumpingKernel extends Kernel implements AutoCloseable {

	public static double calculateSum(double[] input) {
		try (var kernel = new PointerJumpingKernel()) {
			return kernel.accumulateArray(input);
		}
	}

	protected PointerJumpingKernel() {}

	double[] input;  // input values to accumulate
	double[] results;  // results from all work-groups
	/**
	 * Group's local copy of its slice of the {@code input}.
	 */
	@Local protected double[] localSlice;

	@Local int[] next;  // next[i] is a pointer to the next value in localSlice not yet accumulated
		// by the processing element with localId == i; initially next[i] == i+1 for all i
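	// Example trace for a group of size 4 (summing): initially next == {1, 2, 3, 4};
	// step 1: elements 0 and 2 accumulate their successors, yielding next == {2, 2, 4, 4};
	// step 2: only element 0 is still active and jumps to index 2, yielding next == {4, 2, 4, 4};
	// now next[0] == groupSize, so the main loop in run() ends with the total in localSlice[0]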
	/**
	 * Triggers GPU execution of {@link #run() pointer-jumping} on distinct slices of {@code input}
	 * in separate work-groups, then recursively accumulates the results from all work-groups.
	 * The recursion stops when everything has been accumulated into a single value.
	 * @return the value accumulated from the whole input.
	 */
	protected double accumulateArray(double[] input) {
		this.input = input;
		int groupSize = Math.min(input.length, getTargetDevice().getMaxWorkGroupSize());
		int numberOfGroups = input.length / groupSize;
		if (groupSize * numberOfGroups < input.length) numberOfGroups++;  // pad the last group
		int paddedSize = groupSize * numberOfGroups;
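		// e.g. for input.length == 1000 and groupSize == 256: numberOfGroups == 4 and
		// paddedSize == 1024, so the last group contains 24 padding elements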
		results = new double[numberOfGroups];
		localSlice = new double[groupSize];
		next = new int[groupSize];
		execute(Range.create(paddedSize, groupSize));
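		// at this point each work-group has written its partial result to results[groupId]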
		if (numberOfGroups == 1) return results[0];
		return accumulateArray(results);
	}
	/**
	 * Pointer-jumping procedure executed by a single processing element. Each work-group first
	 * copies its slice of the {@code input} into the {@link #localSlice} array in the group's
	 * local memory. Next, the main pointer-jumping loop is performed on the {@link #localSlice}.
	 * Finally, the first processing element writes the group's accumulated result to the
	 * {@link #results} array.
	 */
	@Override
	public final void run() {
		int i = getLocalId();
		int globalIndex = getGlobalId();
		int groupSize = getLocalSize();
		int activityIndicator = i;  // divided by 2 at each step of the main loop until odd;
			// when odd, the given processing element stays idle (just checks in at the barrier)

		// copy the group's slice into local memory and initialize the next pointers
		if (globalIndex < input.length - 1) {
			next[i] = i + 1;
			localSlice[i] = input[globalIndex];
		} else {
			next[i] = getGlobalSize();  // padding in the last group: point beyond the array
			if (globalIndex == input.length - 1) localSlice[i] = input[globalIndex];
		}
		localBarrier();

		// main pointer-jumping loop: run until the whole group is accumulated at index 0
		while (next[0] < groupSize) {
			if ((activityIndicator & 1) == 0 && next[i] < groupSize) {
				accumulateValue(next[i], i);
				next[i] = next[next[i]];
				activityIndicator >>= 1;
			}
			localBarrier();
		}
		if (i == 0) results[getGroupId()] = localSlice[0];
	}
	/**
	 * Accumulates the value at {@code fromIndex} in {@link #localSlice} into {@code intoIndex}.
	 * Subclasses may override this method to do something other than summing.
	 * Subclasses should then provide a static method that creates a kernel and calls
	 * {@link #accumulateArray(double[])} similarly to {@link #calculateSum(double[])}.
	 */
	protected void accumulateValue(int fromIndex, int intoIndex) {
		localSlice[intoIndex] += localSlice[fromIndex];
	}
	@Override
	public final void close() {
		dispose();
	}
}
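/**
 * A minimal sketch of the kind of subclass suggested by the {@code accumulateValue(int, int)}
 * Javadoc above: a hypothetical {@code MaxKernel} (not part of the original gist) that
 * accumulates maxima instead of sums, assuming Aparapi can translate the {@code Math.max}
 * call used here.
 */
class MaxKernel extends PointerJumpingKernel {

	public static double calculateMax(double[] input) {
		try (var kernel = new MaxKernel()) {
			return kernel.accumulateArray(input);
		}
	}

	@Override
	protected void accumulateValue(int fromIndex, int intoIndex) {
		// keep the larger of the two values at intoIndex instead of their sum
		localSlice[intoIndex] = Math.max(localSlice[intoIndex], localSlice[fromIndex]);
	}
}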