import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayDeque;
import java.util.Arrays;
import java.util.Deque;
/**
 * -- Big Q Sort --
 *
 * A variation of the Quicksort algorithm for in-place sorting of data that does not completely fit into RAM.
 *
 * The basic idea of this generalization is that the pointers used for partitioning are file block indexes.
 * A step of the original Quicksort algorithm takes two records, compares them and conditionally swaps them.
 * This algorithm, on the other hand, takes two blocks of records to fill a buffer in memory.
 * Instead of swapping, it runs a sorting algorithm on the buffer, and the blocks are written back to the file
 * before the corresponding pointer is moved.
 *
 * The pointers used for partitioning are file block indexes, but the pivot is still a single record, as in Quicksort.
 *
 * Otherwise the program flow is similar to that of Quicksort.
 *
 * -- This Implementation --
 *
 * Pivot selection saves block reads by selecting a record that is already in memory, which may not be an optimal
 * pivot. If there are repeated keys, the actual record that plays the pivot role may change during partitioning in
 * this implementation; only its key needs to stay constant.
 *
 * In the demonstration, records are the bytes of the file "sort_me.txt", the key is the signed byte value, and the
 * comparator is <.
 *
 * The buffer is assumed to be smaller than the file.
 *
 * The output file length will be a multiple of the block size, so the program adds some null bytes and sorts them
 * along with the data. In a production implementation this would be bad, but there is no need to complicate this
 * POC to prevent the issue.
 */
public class BigQ {
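    /*
     * Illustrative trace of the loop in main below, for a file of 5 blocks B0..B4:
     *   1. Read B0 into the left buffer half and B4 into the right half, sort the whole buffer and
     *      pick the pivot key from near the middle of the sorted buffer.
     *   2. If the pivot lands in the right half, the left half holds only keys <= pivot key: write it
     *      back as block 0, advance the low pointer, read B1 into the left half, re-sort and
     *      re-locate the pivot key.
     *   3. Symmetrically, if the pivot lands in the left half, the right half holds only keys >= pivot
     *      key: write it back as block 4, move the high pointer down, read B3 into the right half
     *      and re-sort.
     *   4. When the two pointers meet, write out the remaining half and push the sub-intervals on
     *      either side of the meeting block, like the two recursive calls of Quicksort.
     */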
    private static final int BLOCK_SIZE = 11;
    private static byte[] buffer = new byte[2*BLOCK_SIZE];
    private static final File SORT_ME = new File("sort_me.txt");
    static class BlockInterval {
        long low;
        long high;
        public BlockInterval(long low, long high) {
            assert low < high;
            this.low = low;
            this.high = high;
        }
    }
    public static void main(String[] args) throws IOException {
        BlockFile blockFile = new BlockFile(SORT_ME, "rws", BLOCK_SIZE, buffer);
        // divide et impera: start with the whole file as the interval to sort
        Deque<BlockInterval> openList = new ArrayDeque<>();
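        // push and poll both operate on the head of the deque, so the open list behaves as a LIFO
        // stack: sub-intervals replace the recursive calls of the original Quicksort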
        openList.push(new BlockInterval(0L, blockFile.nBlocks - 1));
        do {
            BlockInterval outerBounds = openList.poll();
            // initial fill of buffer (both halves)
            assert outerBounds.low < outerBounds.high; // assuming the interval spans at least 2 blocks
            blockFile.readBlock(outerBounds.low, BufferHalf.LEFT);
            blockFile.readBlock(outerBounds.high, BufferHalf.RIGHT);
            Arrays.sort(buffer);
            if (outerBounds.low + 1 < outerBounds.high) {
                // interval that does not completely fit into one buffer - partitioning needed
                long doneBelow = outerBounds.low;
                long doneAbove = outerBounds.high;
                // select pivot: one of the two middle records of the sorted buffer, an approximate median
                // (affects efficiency, not correctness)
                int pivotIndex = BLOCK_SIZE - Math.floorMod((int) System.currentTimeMillis(), 2);
                byte pivotKey = buffer[pivotIndex];
                partitioning:
                while (true) { // buffer is sorted at this point
                    if (pivotIndex >= BLOCK_SIZE) {
                        // left half of buffer contains sub-pivot keys only, pivot is in the right half
                        blockFile.writeBlock(doneBelow, BufferHalf.LEFT);
                        doneBelow++;
                        if (doneBelow >= doneAbove) {
                            // done pointers collide, conclude partitioning by writing out the rest of records
                            blockFile.writeBlock(doneAbove, BufferHalf.RIGHT);
                            break partitioning;
                        }
                        blockFile.readBlock(doneBelow, BufferHalf.LEFT);
                        Arrays.sort(buffer);
                        // by restoring sortedness of the buffer, pivot may have moved some positions left
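                        // (only the left half of the array was overwritten by the read, so the pivot
                        // record survives and the scan below finds its key again)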
                        while (buffer[pivotIndex] > pivotKey) {
                            pivotIndex--;
                        }
                    } else {
                        // right half of buffer contains super-pivot keys only, pivot is in the left half
                        blockFile.writeBlock(doneAbove, BufferHalf.RIGHT);
                        doneAbove--;
                        if (doneAbove <= doneBelow) {
                            // done pointers collide, conclude partitioning by writing out the rest of records
                            blockFile.writeBlock(doneBelow, BufferHalf.LEFT);
                            break partitioning;
                        }
                        blockFile.readBlock(doneAbove, BufferHalf.RIGHT);
                        Arrays.sort(buffer);
                        // by restoring sortedness of the buffer, pivot may have moved some positions right
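                        // (only the right half of the array was overwritten by the read, so the pivot
                        // record survives and the scan below finds its key again)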
                        while (buffer[pivotIndex] < pivotKey) {
                            pivotIndex++;
                        }
                    }
                }
                // found the final block for the pivot (may still move in the block since the block contains 0 or more
                // lower keys than the pivot, 1 or more pivot keys, and 0 or more higher keys)
                assert doneBelow == doneAbove;
                // when all records of a block are settled, exclude that block from further processing
                if (doneBelow == outerBounds.high && buffer[BLOCK_SIZE] == pivotKey) {
                    doneBelow--;
                }
                if (doneAbove == outerBounds.low && buffer[BLOCK_SIZE - 1] == pivotKey) {
                    doneAbove++;
                }
                // keep partly processed and undone blocks in circulation
                if (doneBelow > outerBounds.low) {
                    openList.push(new BlockInterval(outerBounds.low, doneBelow));
                }
                if (doneAbove < outerBounds.high) {
                    openList.push(new BlockInterval(doneAbove, outerBounds.high));
                }
            } else {
                // degenerate interval of 2 blocks - sort already done in memory, just persist the result
                blockFile.writeBlock(outerBounds.low, BufferHalf.LEFT);
                blockFile.writeBlock(outerBounds.high, BufferHalf.RIGHT);
            }
        } while (!openList.isEmpty());
        blockFile.close();
    }
}
enum BufferHalf {
    LEFT(0),
    RIGHT(1);
    int i;
    BufferHalf(int i) {
        this.i = i;
    }
}
class BlockFile extends RandomAccessFile {
    long nBlocks;
    private final int blockSize;
    private final byte[] buffer;
    // analysis stuff
    static int readCount = 0, writeCount = 0;
    public BlockFile(File file, String mode, int blockSize, byte[] buffer) throws FileNotFoundException {
        super(file, mode);
        this.blockSize = blockSize;
        this.buffer = buffer;
        // rounded up: a trailing partial block counts as a whole block (see the note on null padding in BigQ)
        nBlocks = (file.length() + blockSize - 1)/blockSize;
    }
    public void readBlock(long blockIndex, BufferHalf bufferHalf) throws IOException {
        seek(blockSize*blockIndex);
        read(buffer, blockSize*bufferHalf.i, blockSize);
        readCount++;
    }
    public void writeBlock(long blockIndex, BufferHalf bufferHalf) throws IOException {
        seek(blockSize*blockIndex);
        write(buffer, blockSize*bufferHalf.i, blockSize);
        writeCount++;
    }
    @Override
    public void close() throws IOException {
        super.close();
        System.out.println("N = " + nBlocks);
System.out.println("N*log(N) = " + nBlocks*Math.log(nBlocks)/0.2); | |
System.out.println("N*N/2 = " + nBlocks*nBlocks/2); | |
System.out.println(readCount + " reads"); | |
System.out.println(writeCount + " writes"); | |
} | |
} |
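To try the gist locally, a minimal driver like the sketch below can be kept as a separate source file next to BigQ. It is not part of the original gist; the class name BigQDemo and the 57-byte test size are arbitrary. It writes a random sort_me.txt (the file name BigQ expects) whose length is deliberately not a multiple of BLOCK_SIZE, invokes BigQ.main, and then checks that the zero-padded output is in non-descending signed-byte order.

import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Random;

class BigQDemo {
    public static void main(String[] args) throws IOException {
        // write a small random test file, a few blocks long and not block-aligned
        byte[] data = new byte[57];
        new Random(42).nextBytes(data);
        try (FileOutputStream out = new FileOutputStream("sort_me.txt")) {
            out.write(data);
        }
        // run the sort; BigQ rewrites sort_me.txt in place
        BigQ.main(new String[0]);
        // verify: the (zero-padded) output must be in non-descending signed-byte order
        byte[] sorted = Files.readAllBytes(Paths.get("sort_me.txt"));
        for (int i = 1; i < sorted.length; i++) {
            if (sorted[i - 1] > sorted[i]) {
                throw new AssertionError("not sorted at index " + i);
            }
        }
        System.out.println("sorted " + sorted.length + " bytes (input was " + data.length + ")");
    }
}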