Created
August 25, 2014 12:27
-
-
Save JervenBolleman/d1430d0549028861504c to your computer and use it in GitHub Desktop.
GC7 GC count using binary encoded nucleotides for speed (requires converting a fasta file first)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package gc; | |
import java.io.ByteArrayOutputStream; | |
import java.io.File; | |
import java.io.FileInputStream; | |
import java.io.FileOutputStream; | |
import java.io.IOException; | |
import java.io.InputStream; | |
import java.io.RandomAccessFile; | |
import java.nio.ByteBuffer; | |
import java.nio.channels.FileChannel; | |
public class GC7 | |
{ | |
private static final long GorC = 0b01100110_01100110_01100110_01100110_01100110_01100110_01100110_01100110L; | |
private static final long AorT = ~GorC; | |
public static void main(String[] args) | |
throws java.io.IOException | |
{ | |
final String orig = "Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa"; | |
final String re = orig+".re"; | |
rewrite(orig, re); | |
final RandomAccessFile randomAccessFile = new RandomAccessFile(re, | |
"r"); | |
final ByteBuffer channel = randomAccessFile.getChannel().map(FileChannel.MapMode.READ_ONLY, 0 | |
, randomAccessFile.length()); | |
new GC7(channel); | |
randomAccessFile.close(); | |
} | |
/** | |
* This code rewrites the nucleotide fasta file in a way that allows later faster parsing | |
* and working with it. Using 4 bytes per nucleotide it is not as space efficient as possible. | |
* But the nice thing is that with a single binary & and a Long.bitCount() (popcount) we can count | |
* 16 nucleotides per instruction. | |
* @param string | |
* @param reName | |
* @throws IOException | |
*/ | |
private static void rewrite(String string, String reName) | |
throws IOException | |
{ | |
final File orig = new File(string); | |
final File re = new File(reName); | |
if (!re.exists()) | |
{ | |
final FileOutputStream fileWriter = new FileOutputStream(re); | |
int bufSize = 0; | |
StringBuilder header = new StringBuilder(); | |
//We want to write bytes not chars. Having the first bit set in a UTF-8 string | |
//means we get two bytes output due to encoding expansion. | |
//This byte array means we keep on working on bytes, avoiding encoding crazyness. | |
ByteArrayOutputStream record = new ByteArrayOutputStream(); | |
char prev = 'N'; | |
boolean even = false; | |
try (InputStream in = new FileInputStream(orig)) | |
{ | |
byte[] read = new byte[4096]; | |
while ((bufSize = in.read(read)) != -1) | |
{ | |
for (int i = 0; i < bufSize; i++) | |
{ | |
char c = (char) read[i]; | |
if (c == '>') | |
{ | |
if (record != null) | |
{ | |
fileWriter.write(String.valueOf(record.size()).getBytes()); | |
fileWriter.write(record.toByteArray()); | |
} | |
header = new StringBuilder(); | |
} | |
else if (c == '\n') | |
{ | |
if (header != null) | |
{ | |
fileWriter.write(String.valueOf(header.length()).getBytes()); | |
fileWriter.write('>'); | |
fileWriter.write(header.toString().getBytes()); | |
} | |
header = null; //Fasta header are always one line. | |
} | |
else if (header != null) | |
header.append(c); | |
else if (header == null) | |
{ | |
if (even) | |
{ | |
final byte i2 = (byte) ((convert(prev) << 4) | convert(c)); | |
record.write(i2); | |
even = false; | |
} | |
else | |
{ | |
prev = c; | |
even = true; | |
} | |
} | |
} | |
} | |
} | |
if (!even) | |
record.write((convert(prev) << 4)); | |
if (record != null) | |
{ | |
for (int a=0;a < record.size() % 16;a+=2) | |
{ | |
record.write((byte) 0); | |
} | |
fileWriter.write(String.valueOf(record.size() * 2).getBytes()); | |
//The semi colon is the delimiter between the header and the sequence. | |
fileWriter.write('!'); | |
fileWriter.write(record.toByteArray()); | |
} | |
fileWriter.flush(); | |
fileWriter.close(); | |
} | |
} | |
private static byte convert(char c) | |
{ | |
switch (c) | |
{ | |
case 'A': | |
return 0b0001; | |
case 'C': | |
return 0b0010; | |
case 'G': | |
return 0b0100; | |
case 'T': | |
return 0b1000; | |
default://N is encoded as all zero making that easy to count as well if needed. | |
return 0b0000; | |
} | |
} | |
private GC7(ByteBuffer channel) throws IOException | |
{ | |
long start = System.currentTimeMillis(); | |
int at = 0, gc = 0; | |
StringBuilder digit = new StringBuilder(); | |
for (int i = 0; i < channel.limit(); i++) | |
{ | |
int c = channel.get(); | |
if (Character.isDigit(c)) | |
{ | |
digit.append((char) c); | |
} | |
else if (c == '>') | |
{ | |
final int skip = Integer.parseInt(digit.toString()); | |
for (int a = 0; a < skip; a++) | |
{ | |
channel.get(); | |
} | |
i += skip; | |
digit = new StringBuilder(); //reset the collected digits, in case there is more than one header. | |
} | |
else if (c == '!') | |
{ | |
long numberOfChars = Long.parseLong(digit.toString()); | |
digit = new StringBuilder(); //reset the collected digits, in case there is more than one header. | |
for (long j = 0; j < numberOfChars; j+=16) | |
{ | |
long l = channel.getLong(); | |
gc += Long.bitCount(l & GorC); | |
at += Long.bitCount(l & AorT); | |
} | |
i += numberOfChars; | |
for (; i < channel.limit(); i++) | |
{ | |
c = channel.get(); | |
gc += Long.bitCount(c & GorC); | |
at += Long.bitCount(c & AorT); | |
} | |
} | |
} | |
System.out.println("GC7 count " + gc + " gc at " + at); | |
System.out.printf("GC7 count %f\n", gc / (double) (gc + at)); | |
System.out.printf("Elapsed %d ms\n", System.currentTimeMillis() - start); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment