JervenBolleman · August 25, 2014 12:27
diff --git a/GC7.java b/GC7.java
 package gc;

 import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.RandomAccessFile;
 import java.nio.ByteBuffer;
 import java.nio.channels.FileChannel;


 public class GC7
 {
 	private static final long GorC = 0b01100110_01100110_01100110_01100110_01100110_01100110_01100110_01100110L;
 	private static final long AorT = ~GorC;

 	public static void main(String[] args)
 	    throws java.io.IOException
 	{

 		final String orig = "Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa";
 		final String re = orig+".re";
 		rewrite(orig, re);
 		final RandomAccessFile randomAccessFile = new RandomAccessFile(re,
 		    "r");
 		final ByteBuffer channel = randomAccessFile.getChannel().map(FileChannel.MapMode.READ_ONLY, 0
 		    , randomAccessFile.length());
 		new GC7(channel);
 		randomAccessFile.close();
 	}

 	/**
 	 * This code rewrites the nucleotide fasta file in a way that allows later faster parsing 
 	 * and working with it. Using 4 bytes per nucleotide it is not as space efficient as possible.
 	 * But the nice thing is that with a single binary & and a Long.bitCount() (popcount) we can count
 	 * 16 nucleotides per instruction. 
 	 * @param string
 	 * @param reName
 	 * @throws IOException
 	 */
 	private static void rewrite(String string, String reName)
 	    throws IOException
 	{
 		final File orig = new File(string);
 		final File re = new File(reName);
 		if (!re.exists())
 		{
 			final FileOutputStream fileWriter = new FileOutputStream(re);
 			int bufSize = 0;
 			StringBuilder header = new StringBuilder();
 			//We want to write bytes not chars. Having the first bit set in a UTF-8 string
 			//means we get two bytes output due to encoding expansion.
 			//This byte array means we keep on working on bytes, avoiding encoding crazyness.
 			ByteArrayOutputStream record = new ByteArrayOutputStream();
 			char prev = 'N';
 			boolean even = false;
 			try (InputStream in = new FileInputStream(orig))
 			{
 				byte[] read = new byte[4096];
 				while ((bufSize = in.read(read)) != -1)
 				{
 					for (int i = 0; i < bufSize; i++)
 					{
 						char c = (char) read[i];
 						if (c == '>')
 						{
 							if (record != null)
 							{
 								fileWriter.write(String.valueOf(record.size()).getBytes());
 								fileWriter.write(record.toByteArray());
 							}
 							header = new StringBuilder();
 						}
 						else if (c == '\n')
 						{
 							if (header != null)
 							{
 								fileWriter.write(String.valueOf(header.length()).getBytes());
 								fileWriter.write('>');
 								fileWriter.write(header.toString().getBytes());
 							}
 							header = null; //Fasta header are always one line.
 						}
 						else if (header != null)
 							header.append(c);
 						else if (header == null)
 						{
 							if (even)
 							{
 								final byte i2 = (byte) ((convert(prev) << 4) | convert(c));
 								record.write(i2);
 								even = false;
 							}
 							else
 							{
 								prev = c;
 								even = true;
 							}
 						}
 					}
 				}
 			}
 			if (!even)
 				record.write((convert(prev) << 4));
 			if (record != null)
 			{
 				for (int a=0;a < record.size() % 16;a+=2)
 				{
 					record.write((byte) 0);
 				}
 				fileWriter.write(String.valueOf(record.size() * 2).getBytes());
 				//The semi colon is the delimiter between the header and the sequence.
 				fileWriter.write('!');
 				fileWriter.write(record.toByteArray());
 			}
 			fileWriter.flush();
 			fileWriter.close();
 		}
 	}

 	private static byte convert(char c)
 	{
 		switch (c)
 		{
 			case 'A':
 				return 0b0001;
 			case 'C':
 				return 0b0010;
 			case 'G':
 				return 0b0100;
 			case 'T':
 				return 0b1000;
 			default://N is encoded as all zero making that easy to count as well if needed.
 				return 0b0000;
 		}
 	}

 	private GC7(ByteBuffer channel) throws IOException
 	{
 		long start = System.currentTimeMillis();
 		int at = 0, gc = 0;
 		StringBuilder digit = new StringBuilder();
 		for (int i = 0; i < channel.limit(); i++)
 		{
 			int c = channel.get();
 			if (Character.isDigit(c))
 			{
 				digit.append((char) c);
 			}
 			else if (c == '>')
 			{
 				final int skip = Integer.parseInt(digit.toString());
 				for (int a = 0; a < skip; a++)
 				{
 					channel.get();
 				}
 				i += skip;
 				digit = new StringBuilder(); //reset the collected digits, in case there is more than one header.
 			}
 			else if (c == '!')
 			{
 				long numberOfChars = Long.parseLong(digit.toString());
 				digit = new StringBuilder(); //reset the collected digits, in case there is more than one header.
 				for (long j = 0; j < numberOfChars; j+=16)
 				{
 						long l = channel.getLong();
 						gc += Long.bitCount(l & GorC);
 						at += Long.bitCount(l & AorT);
 				}
 				i += numberOfChars;
 				for (; i < channel.limit(); i++)
 				{
 					c = channel.get();
 					gc += Long.bitCount(c & GorC);
 					at += Long.bitCount(c & AorT);
 				}
 			}
 		}
 		System.out.println("GC7 count " + gc + " gc at " + at);
 		System.out.printf("GC7 count %f\n", gc / (double) (gc + at));
 		System.out.printf("Elapsed %d ms\n", System.currentTimeMillis() - start);
 	}
 }
	package gc;

	import java.io.ByteArrayOutputStream;
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.FileOutputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.RandomAccessFile;
	import java.nio.ByteBuffer;
	import java.nio.channels.FileChannel;


	public class GC7
	{
	private static final long GorC = 0b01100110_01100110_01100110_01100110_01100110_01100110_01100110_01100110L;
	private static final long AorT = ~GorC;

	public static void main(String[] args)
	throws java.io.IOException
	{

	final String orig = "Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa";
	final String re = orig+".re";
	rewrite(orig, re);
	final RandomAccessFile randomAccessFile = new RandomAccessFile(re,
	"r");
	final ByteBuffer channel = randomAccessFile.getChannel().map(FileChannel.MapMode.READ_ONLY, 0
	, randomAccessFile.length());
	new GC7(channel);
	randomAccessFile.close();
	}

	/**
	* This code rewrites the nucleotide fasta file in a way that allows later faster parsing
	* and working with it. Using 4 bytes per nucleotide it is not as space efficient as possible.
	* But the nice thing is that with a single binary & and a Long.bitCount() (popcount) we can count
	* 16 nucleotides per instruction.
	* @param string
	* @param reName
	* @throws IOException
	*/
	private static void rewrite(String string, String reName)
	throws IOException
	{
	final File orig = new File(string);
	final File re = new File(reName);
	if (!re.exists())
	{
	final FileOutputStream fileWriter = new FileOutputStream(re);
	int bufSize = 0;
	StringBuilder header = new StringBuilder();
	//We want to write bytes not chars. Having the first bit set in a UTF-8 string
	//means we get two bytes output due to encoding expansion.
	//This byte array means we keep on working on bytes, avoiding encoding crazyness.
	ByteArrayOutputStream record = new ByteArrayOutputStream();
	char prev = 'N';
	boolean even = false;
	try (InputStream in = new FileInputStream(orig))
	{
	byte[] read = new byte[4096];
	while ((bufSize = in.read(read)) != -1)
	{
	for (int i = 0; i < bufSize; i++)
	{
	char c = (char) read[i];
	if (c == '>')
	{
	if (record != null)
	{
	fileWriter.write(String.valueOf(record.size()).getBytes());
	fileWriter.write(record.toByteArray());
	}
	header = new StringBuilder();
	}
	else if (c == '\n')
	{
	if (header != null)
	{
	fileWriter.write(String.valueOf(header.length()).getBytes());
	fileWriter.write('>');
	fileWriter.write(header.toString().getBytes());
	}
	header = null; //Fasta header are always one line.
	}
	else if (header != null)
	header.append(c);
	else if (header == null)
	{
	if (even)
	{
	final byte i2 = (byte) ((convert(prev) << 4) \| convert(c));
	record.write(i2);
	even = false;
	}
	else
	{
	prev = c;
	even = true;
	}
	}
	}
	}
	}
	if (!even)
	record.write((convert(prev) << 4));
	if (record != null)
	{
	for (int a=0;a < record.size() % 16;a+=2)
	{
	record.write((byte) 0);
	}
	fileWriter.write(String.valueOf(record.size() * 2).getBytes());
	//The semi colon is the delimiter between the header and the sequence.
	fileWriter.write('!');
	fileWriter.write(record.toByteArray());
	}
	fileWriter.flush();
	fileWriter.close();
	}
	}

	private static byte convert(char c)
	{
	switch (c)
	{
	case 'A':
	return 0b0001;
	case 'C':
	return 0b0010;
	case 'G':
	return 0b0100;
	case 'T':
	return 0b1000;
	default://N is encoded as all zero making that easy to count as well if needed.
	return 0b0000;
	}
	}

	private GC7(ByteBuffer channel) throws IOException
	{
	long start = System.currentTimeMillis();
	int at = 0, gc = 0;
	StringBuilder digit = new StringBuilder();
	for (int i = 0; i < channel.limit(); i++)
	{
	int c = channel.get();
	if (Character.isDigit(c))
	{
	digit.append((char) c);
	}
	else if (c == '>')
	{
	final int skip = Integer.parseInt(digit.toString());
	for (int a = 0; a < skip; a++)
	{
	channel.get();
	}
	i += skip;
	digit = new StringBuilder(); //reset the collected digits, in case there is more than one header.
	}
	else if (c == '!')
	{
	long numberOfChars = Long.parseLong(digit.toString());
	digit = new StringBuilder(); //reset the collected digits, in case there is more than one header.
	for (long j = 0; j < numberOfChars; j+=16)
	{
	long l = channel.getLong();
	gc += Long.bitCount(l & GorC);
	at += Long.bitCount(l & AorT);
	}
	i += numberOfChars;
	for (; i < channel.limit(); i++)
	{
	c = channel.get();
	gc += Long.bitCount(c & GorC);
	at += Long.bitCount(c & AorT);
	}
	}
	}
	System.out.println("GC7 count " + gc + " gc at " + at);
	System.out.printf("GC7 count %f\n", gc / (double) (gc + at));
	System.out.printf("Elapsed %d ms\n", System.currentTimeMillis() - start);
	}
	}