Created
August 12, 2014 10:44
-
-
Save JervenBolleman/c8e0861438e72cb7ae15 to your computer and use it in GitHub Desktop.
GC6 threaded GC count using table (is faster than bit fiddling
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package gc; | |
import java.io.IOException; | |
import java.io.RandomAccessFile; | |
import java.nio.ByteBuffer; | |
import java.nio.MappedByteBuffer; | |
import java.nio.channels.FileChannel; | |
import java.util.ArrayList; | |
import java.util.Iterator; | |
import java.util.List; | |
import java.util.concurrent.Callable; | |
import java.util.concurrent.ExecutionException; | |
import java.util.concurrent.ExecutorService; | |
import java.util.concurrent.Executors; | |
import java.util.concurrent.Future; | |
public class GC6 | |
{ | |
static final int GC = 0; | |
static final int AT = 1; | |
static final int N = 2; | |
static final int SECOND_RIGHTMOST_BITS_SET = 2; | |
static final int FOURTH_RIGHMOST_BIT_SET = 8; | |
static final byte greaterthan = '>'; | |
static final byte newline = '\n'; | |
public static void main(String[] args) | |
throws java.io.IOException | |
{ | |
final RandomAccessFile randomAccessFile = new RandomAccessFile("Homo_sapiens.GRCh37.67.dna_rm.chromosome.Y.fa", | |
"r"); | |
final MappedByteBuffer channel = randomAccessFile.getChannel().map(FileChannel.MapMode.READ_ONLY, 0 | |
, randomAccessFile.length()); | |
new GC6(channel); | |
randomAccessFile.close(); | |
} | |
private GC6(ByteBuffer channel) throws IOException | |
{ | |
long start = System.currentTimeMillis(); | |
int gc = 0; | |
int at = 0; | |
final int threads = Runtime.getRuntime().availableProcessors(); | |
Counter next = null; | |
List<Future<int[]>> futures = new ArrayList<>(threads); | |
ExecutorService exec = Executors.newFixedThreadPool(threads); | |
for (int thread = threads; thread > 0; thread--) | |
{ | |
next = new Counter((channel.limit() / threads) * (thread - 1), next, channel); | |
futures.add(exec.submit(next)); | |
} | |
while (!futures.isEmpty()) | |
{ | |
Iterator<Future<int[]>> iter = futures.iterator(); | |
while (iter.hasNext()) | |
{ | |
try | |
{ | |
final int[] is = iter.next().get(); | |
if (is != null) | |
{ | |
gc += is['G'] + is['C']; | |
at += is['A'] + is['T']; | |
iter.remove(); | |
} | |
} catch (InterruptedException | ExecutionException e) | |
{ | |
// TODO Auto-generated catch block | |
e.printStackTrace(); | |
} | |
} | |
} | |
exec.shutdown(); | |
System.out.printf("GC6 count %f\n", gc / (double) (gc + (at))); | |
System.out.printf("Elapsed %d ms\n", System.currentTimeMillis() - start); | |
} | |
private static class Counter | |
implements Callable<int[]> | |
{ | |
private final int start; | |
private final ByteBuffer channel; | |
private final int finish; | |
public Counter(int starts, Counter next, ByteBuffer channel) | |
{ | |
super(); | |
this.channel = channel; | |
starts = findCountingStartingPoint(starts, channel); | |
this.start = starts; | |
if (next == null) | |
finish = channel.limit(); | |
else | |
finish = next.start - 1; | |
} | |
private int findCountingStartingPoint(int starts, ByteBuffer channel) | |
{ | |
for (int i = starts; i < channel.limit(); i++) | |
{ | |
//If we start on a new line we do not need to worry about missing counting things in a header | |
//We will either start with a '>' or on a line containing some DNA | |
if (channel.get(i) == newline) | |
{ | |
starts = i + 1;//Don't need to retest the newline later | |
break; | |
} | |
} | |
return starts; | |
} | |
@Override | |
public int[] call() | |
throws Exception | |
{ | |
int[] table = new int[85];//T (84) is the highest value if we get something higher we need to know! | |
boolean header = true; | |
for (int i = start; i < finish; i++) | |
{ | |
int b = channel.get(i); | |
//While branches are bad these are not to bad as both are very rare so there will be few branch predictions. | |
if (b == greaterthan) | |
header = true; | |
else if (header && b == newline) | |
header = false; //Fasta header are always one line. | |
else if (!header) | |
table[b]++; | |
} | |
return table; | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You can replace the int[] table with char[] table, but the answer is incorrect on larger chromosomes.