-
-
Save JervenBolleman/22b85419b1a1e29800c8 to your computer and use it in GitHub Desktop.
Mapped file, same branchless gc counting
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package gc; | |
import java.io.IOException; | |
import java.io.RandomAccessFile; | |
import java.nio.ByteBuffer; | |
import java.nio.MappedByteBuffer; | |
import java.nio.channels.FileChannel; | |
public class GC3 | |
{ | |
static final int GC = 0; | |
static final int AT = 1; | |
static final int N = 2; | |
static final int SECOND_RIGHTMOST_BITS_SET = 2; | |
static final int FOURTH_RIGHMOST_BIT_SET = 8; | |
public static void main(String[] args) | |
throws java.io.IOException | |
{ | |
long start = System.currentTimeMillis(); | |
final RandomAccessFile randomAccessFile = new RandomAccessFile("Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa", | |
"r"); | |
final MappedByteBuffer channel = randomAccessFile.getChannel().map(FileChannel.MapMode.READ_ONLY, 0 | |
, randomAccessFile.length()); | |
int[] gcatn = readFastaAndCountGCATN(channel); | |
gcatn[AT] = gcatn[AT] - gcatn[N]; | |
System.out.printf("GC count %f\n", gcatn[GC] / (double) (gcatn[GC] + gcatn[AT])); | |
System.out.printf("Elapsed %d ms\n", System.currentTimeMillis() - start); | |
randomAccessFile.close(); | |
} | |
private static int[] readFastaAndCountGCATN(ByteBuffer channel) | |
throws IOException | |
{ | |
int[] gcatn = new int[3]; | |
boolean header = true; | |
for (int i = 0; i < channel.limit(); i++) | |
{ | |
char c = (char) channel.get(i); | |
header = readChar(gcatn, header, c); | |
} | |
return gcatn; | |
} | |
private static boolean readChar(int[] gcatn, boolean header, char c) | |
{ | |
//While branches are bad these are not to bad as both are very rare so there will be few branch predictions. | |
if (c == '>') | |
header = true; | |
else if (c == '\n') | |
header = false; //Fasta header are always one line. | |
else if (!header) | |
{ | |
//The idea here is to count without branching, so no stall and mispredictions. | |
//The trick depends on G and C in ascii/utf-8 encoding to share the second rightmost bit being set | |
//while A and T do not have that bit set. We then need to account for N as well. | |
//We push it into an array so that we can pass back all the values. | |
//Putting it all in a loop improves performance because the JIT is beter with on stack replacement like this | |
int lgc = ((c & SECOND_RIGHTMOST_BITS_SET) >> 1) - ((c & FOURTH_RIGHMOST_BIT_SET) >>> 3); | |
gcatn[GC] = gcatn[GC] + lgc; | |
gcatn[AT] = gcatn[AT] + (((~lgc) & 1)); | |
gcatn[N] = gcatn[N] + ((c & FOURTH_RIGHMOST_BIT_SET) >>> 3); | |
} | |
return header; | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment