import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
/**
 * -- Big Q Sort --
 *
 * A variation of the Quicksort algorithm for in-place sorting of data that does not completely fit into RAM.
 *
 * The basic idea of this generalization is that the pointers used for partitioning are file block indexes.
 * A step of the original Quicksort algorithm takes two records, compares them and conditionally swaps them.
 * This algorithm, on the other hand, takes two blocks of records to fill a buffer in memory.
 * Instead of swapping, it runs a sorting algorithm on the buffer, and the blocks are written back to the file
 * before the corresponding pointer is moved.
 *
 * The pointers used for partitioning are file block indexes, but the pivot is still a single record, as in Quicksort.
 *
 * Otherwise the program flow is similar to that of Quicksort.
 *
 * -- This Implementation --
 *
 * Pivot selection saves block reads by selecting a record that is already in memory, which may not be an optimal
 * pivot. If there are repeated keys, the actual record that plays the pivot role may change during partitioning in
 * this implementation; only its key needs to stay constant.
 *
 * In the demonstration, records are the bytes of the file "sort_me.txt", the key is the signed byte value, and the
 * comparator is <.
 *
 * The buffer is assumed to be smaller than the file.
 *
 * The output file length will be a multiple of the block size, so the program adds some null bytes and sorts them
 * along with the data. In a production implementation this would be bad, but there is no need to complicate this
 * POC to prevent the issue.
 */
public class BigQ {
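    /*
     * Illustrative trace of the loop in main below, for a file of 5 blocks B0..B4:
     *   1. Read B0 into the left buffer half and B4 into the right half, sort the whole buffer and
     *      pick the pivot key from near the middle of the sorted buffer.
     *   2. If the pivot lands in the right half, the left half holds only keys <= pivot key: write it
     *      back as block 0, advance the low pointer, read B1 into the left half, re-sort and
     *      re-locate the pivot key.
     *   3. Symmetrically, if the pivot lands in the left half, the right half holds only keys >= pivot
     *      key: write it back as block 4, move the high pointer down, read B3 into the right half
     *      and re-sort.
     *   4. When the two pointers meet, write out the remaining half and push the sub-intervals on
     *      either side of the meeting block, like the two recursive calls of Quicksort.
     */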
    private static final int BLOCK_SIZE = 11;
    private static byte[] buffer = new byte[2*BLOCK_SIZE];
    private static final File SORT_ME = new File("sort_me.txt");
    static class BlockInterval {
        long low;
        long high;
        public BlockInterval(long low, long high) {
            assert low < high;
            this.low = low;
            this.high = high;
        }
    }
    public static void main(String[] args) throws IOException {
        BlockFile blockFile = new BlockFile(SORT_ME, "rws", BLOCK_SIZE, buffer);
        // divide et impera: start with the whole file as the interval to sort
        Deque<BlockInterval> openList = new ArrayDeque<>();
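        // push and poll both operate on the head of the deque, so the open list behaves as a LIFO
        // stack: sub-intervals replace the recursive calls of the original Quicksort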
        openList.push(new BlockInterval(0L, blockFile.nBlocks - 1));
        do {
            BlockInterval outerBounds = openList.poll();
            // initial fill of buffer (both halves)
            assert outerBounds.low < outerBounds.high; // assuming the interval spans at least 2 blocks
            blockFile.readBlock(outerBounds.low, BufferHalf.LEFT);
            blockFile.readBlock(outerBounds.high, BufferHalf.RIGHT);
            Arrays.sort(buffer);
            if (outerBounds.low + 1 < outerBounds.high) {
                // interval that does not completely fit into one buffer - partitioning needed
                long doneBelow = outerBounds.low;
                long doneAbove = outerBounds.high;
                // select pivot: one of the two middle records of the sorted buffer, an approximate median
                // (affects efficiency, not correctness)
                int pivotIndex = BLOCK_SIZE - Math.floorMod((int) System.currentTimeMillis(), 2);
                byte pivotKey = buffer[pivotIndex];
                partitioning:
                while (true) { // buffer is sorted at this point
                    if (pivotIndex >= BLOCK_SIZE) {
                        // left half of buffer contains sub-pivot keys only, pivot is in the right half
                        blockFile.writeBlock(doneBelow, BufferHalf.LEFT);
                        doneBelow++;
                        if (doneBelow >= doneAbove) {
                            // done pointers collide, conclude partitioning by writing out the rest of records
                            blockFile.writeBlock(doneAbove, BufferHalf.RIGHT);
                            break partitioning;
                        }
                        blockFile.readBlock(doneBelow, BufferHalf.LEFT);
                        Arrays.sort(buffer);
                        // by restoring sortedness of the buffer, pivot may have moved some positions left
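                        // (only the left half of the array was overwritten by the read, so the pivot
                        // record survives and the scan below finds its key again)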
                        while (buffer[pivotIndex] > pivotKey) {
                            pivotIndex--;
                        }
                    } else {
                        // right half of buffer contains super-pivot keys only, pivot is in the left half
                        blockFile.writeBlock(doneAbove, BufferHalf.RIGHT);
                        doneAbove--;
                        if (doneAbove <= doneBelow) {
                            // done pointers collide, conclude partitioning by writing out the rest of records
                            blockFile.writeBlock(doneBelow, BufferHalf.LEFT);
                            break partitioning;
                        }
                        blockFile.readBlock(doneAbove, BufferHalf.RIGHT);
                        Arrays.sort(buffer);
                        // by restoring sortedness of the buffer, pivot may have moved some positions right
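                        // (only the right half of the array was overwritten by the read, so the pivot
                        // record survives and the scan below finds its key again)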
                        while (buffer[pivotIndex] < pivotKey) {
                            pivotIndex++;
                        }
                    }
                }
                // found the final block for the pivot (may still move in the block since the block contains 0 or more
                // lower keys than the pivot, 1 or more pivot keys, and 0 or more higher keys)
                assert doneBelow == doneAbove;
                // when all records of a block are settled, exclude that block from further processing
                if (doneBelow == outerBounds.high && buffer[BLOCK_SIZE] == pivotKey) {
                    doneBelow--;
                }
                if (doneAbove == outerBounds.low && buffer[BLOCK_SIZE - 1] == pivotKey) {
                    doneAbove++;
                }
                // keep partly processed and undone blocks in circulation
                if (doneBelow > outerBounds.low) {
                    openList.push(new BlockInterval(outerBounds.low, doneBelow));
                }
                if (doneAbove < outerBounds.high) {
                    openList.push(new BlockInterval(doneAbove, outerBounds.high));
                }
            } else {
                // degenerate interval of 2 blocks - sort already done in memory, just persist the result
                blockFile.writeBlock(outerBounds.low, BufferHalf.LEFT);
                blockFile.writeBlock(outerBounds.high, BufferHalf.RIGHT);
            }
        } while (!openList.isEmpty());
        blockFile.close();
    }
}
enum BufferHalf {
    LEFT(0),
    RIGHT(1);
    int i;
    BufferHalf(int i) {
        this.i = i;
    }
}
class BlockFile extends RandomAccessFile {
    long nBlocks;
    private final int blockSize;
    private final byte[] buffer;
    // analysis stuff
    static int readCount = 0, writeCount = 0;
    public BlockFile(File file, String mode, int blockSize, byte[] buffer) throws FileNotFoundException {
        super(file, mode);
        this.blockSize = blockSize;
        this.buffer = buffer;
        // rounded up: a trailing partial block counts as a whole block (see the note on null padding in BigQ)
        nBlocks = (file.length() + blockSize - 1)/blockSize;
    }
    public void readBlock(long blockIndex, BufferHalf bufferHalf) throws IOException {
        seek(blockSize*blockIndex);
        read(buffer, blockSize*bufferHalf.i, blockSize);
        readCount++;
    }
    public void writeBlock(long blockIndex, BufferHalf bufferHalf) throws IOException {
        seek(blockSize*blockIndex);
        write(buffer, blockSize*bufferHalf.i, blockSize);
        writeCount++;
    }
    @Override
    public void close() throws IOException {
        super.close();
        System.out.println("N = " + nBlocks);
System.out.println("N*log(N) = " + nBlocks*Math.log(nBlocks)/0.2); | |
System.out.println("N*N/2 = " + nBlocks*nBlocks/2); | |
System.out.println(readCount + " reads"); | |
System.out.println(writeCount + " writes"); | |
} | |
} |
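To try the gist locally, a minimal driver like the sketch below can be kept as a separate source file next to BigQ. It is not part of the original gist; the class name BigQDemo and the 57-byte test size are arbitrary. It writes a random sort_me.txt (the file name BigQ expects) whose length is deliberately not a multiple of BLOCK_SIZE, invokes BigQ.main, and then checks that the zero-padded output is in non-descending signed-byte order.

import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Random;

class BigQDemo {
    public static void main(String[] args) throws IOException {
        // write a small random test file, a few blocks long and not block-aligned
        byte[] data = new byte[57];
        new Random(42).nextBytes(data);
        try (FileOutputStream out = new FileOutputStream("sort_me.txt")) {
            out.write(data);
        }
        // run the sort; BigQ rewrites sort_me.txt in place
        BigQ.main(new String[0]);
        // verify: the (zero-padded) output must be in non-descending signed-byte order
        byte[] sorted = Files.readAllBytes(Paths.get("sort_me.txt"));
        for (int i = 1; i < sorted.length; i++) {
            if (sorted[i - 1] > sorted[i]) {
                throw new AssertionError("not sorted at index " + i);
            }
        }
        System.out.println("sorted " + sorted.length + " bytes (input was " + data.length + ")");
    }
}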