Created
December 3, 2014 01:22
-
-
Save kate-crosby/fe7edb86c67fa9c6036b to your computer and use it in GitHub Desktop.
hmp.txt or hmp.txt.gz to plink ped format - use R gist to get map format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
import array
import csv
import gzip
import sys
# Input used when no file name is supplied, so running the script with no
# arguments behaves as it always has.
_DEFAULT_HAPMAP = 'AmesUSInbreds_AllZeaGBSv1.0_imputed_20130508_chr1.hmp.txt'

# Number of leading metadata columns that precede the per-sample genotype
# columns (the original code kept header[11:] and skipped 11 tabs per row).
_META_COLUMNS = 11

# One-letter genotype code -> space-separated allele pair.  Codes that are
# not listed are written through unchanged.  Keys and values are bytes
# because the file is processed as bytes end to end, which keeps the script
# working identically on Python 2 and Python 3.
_CODE_TO_ALLELES = {
    b'N': b'N N', b'X': b'N N',
    b'A': b'A A', b'C': b'C C', b'G': b'G G', b'T': b'T T',
    b'K': b'G T', b'M': b'A C', b'R': b'A G', b'S': b'C G',
    b'W': b'A T', b'Y': b'C T',
}


def _read_hapmap(fname):
    """Parse a HapMap text file (optionally gzip-compressed).

    Returns a ``(header, names, datarows)`` tuple:

    * ``header``   -- first header field followed by the sample names,
    * ``names``    -- first field (marker id) of every data row,
    * ``datarows`` -- one bytes object per data row holding one genotype
      code per sample, in sample order.

    Raises ``ValueError`` when a data row does not hold exactly one
    single-character genotype per sample; continuing would silently
    misalign genotypes and samples.
    """
    opener = gzip.open if fname.endswith('.gz') else open
    names = []
    datarows = []
    with opener(fname, 'rb') as f:
        fields = f.readline().strip().split(b'\t')
        header = fields[:1] + fields[_META_COLUMNS:]
        sample_count = len(header) - 1
        # A well-formed genotype section is "c<TAB>c<TAB>...c": codes at the
        # even offsets, tabs at every odd offset.
        expected_tabs = b'\t' * (sample_count - 1)
        for lineno, raw in enumerate(f, 2):
            row = raw.strip()
            if not row:
                continue  # tolerate blank lines (e.g. at end of file)
            parts = row.split(b'\t', _META_COLUMNS)
            if len(parts) <= _META_COLUMNS:
                raise ValueError(
                    'line %d: expected more than %d tab-separated columns'
                    % (lineno, _META_COLUMNS))
            packed = parts[_META_COLUMNS]
            calls = packed[::2]
            if (len(packed) != 2 * sample_count - 1
                    or packed[1::2] != expected_tabs
                    or b'\t' in calls):
                raise ValueError(
                    'line %d: expected %d single-character genotypes'
                    % (lineno, sample_count))
            names.append(parts[0])
            datarows.append(calls)
    return header, names, datarows


def _write_ped(path, samples, datarows):
    """Write one line per sample: id, id, then one allele pair per marker.

    The sample name is used for both leading id columns.  Fields are
    tab-separated; the two alleles of a pair are separated by a space.
    """
    expand = _CODE_TO_ALLELES.get
    with open(path, 'wb') as out:
        for idx, sample in enumerate(samples):
            out.write(sample + b'\t' + sample + b'\t')
            # Slicing (not indexing) yields a 1-byte bytes object on both
            # Python 2 and Python 3.
            column = (row[idx:idx + 1] for row in datarows)
            out.write(b'\t'.join(expand(code, code) for code in column))
            out.write(b'\n')


def main(fname=None):
    """Convert a HapMap genotype file to ``<fname>.converted.ped``.

    ``fname`` is the path of a ``.hmp.txt`` file, or a ``.gz`` compressed
    one; when omitted the original hard-coded file name is used.  The input
    is transposed: each input row is a marker, each output row is a sample.

    Fixes relative to the previous version: sample labels are no longer
    shifted by the retained first header column, the loop no longer runs one
    column past the genotype data, and the second output column is the
    sample name instead of a marker id picked by sample index.
    """
    if fname is None:
        fname = _DEFAULT_HAPMAP
    header, names, datarows = _read_hapmap(fname)
    # Single-argument print() produces the same text on Python 2 and 3.
    print('%d names' % len(names))
    print('%d data' % len(datarows))
    print('%d columns' % len(header))
    # header[0] is the marker-id column title, not a sample.
    _write_ped(fname + '.converted.ped', header[1:], datarows)
    print('done')


if __name__ == '__main__':
    # An optional first command-line argument selects the input file.
    main(sys.argv[1] if len(sys.argv) > 1 else None)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
A little gist to convert GBS hapmap.txt to plink ped files
Seems to work ok. No complaints - affiliated public gist will make the .map file