Alex Crits-Christoph alexcritschristoph

date	notes	Source	article
1/23/2020	China built a lab to study SARS and Ebola in Wuhan	Daily Mail	https://www.dailymail.co.uk/health/article-7922379/Chinas-lab-studying-SARS-Ebola-Wuhan-outbreaks-center.html
3/4/2020	Don’t buy China’s story: The coronavirus may have leaked from a lab	NY Post	https://nypost.com/2020/02/22/dont-buy-chinas-story-the-coronavirus-may-have-leaked-from-a-lab/
3/5/2020	Pompeo	France 24	https://www.france24.com/en/20200503-pompeo-says-enormous-evidence-coronavirus-originated-in-wuhan-lab
3/5/2020	Coronavirus Epidemic Draws Scrutiny to Labs Handling Deadly Pathogens	WSJ	https://www.wsj.com/articles/coronavirus-epidemic-draws-scrutiny-to-labs-handling-deadly-pathogens-11583349777
3/30/2020	Experts know the new coronavirus is not a bioweapon. They disagree on whether it could have leaked from a research lab	Bulletin of the Atomic Scientists	https://thebulletin.org/2020/03/experts-know-the-new-coronavirus-is-not-a-bioweapon-they-disagree-on-whether-it-could-have-leaked-from-a-research-lab/

	# No-fuss script to download an NCBI genome in ~1 second.
	#
	# Usage: download_ncbi.sh GCA_000330525.1
	#
	# Downloads the GenBank flat file (gbff) for the given genome accession with
	# the `datasets` command-line tool, moves it into the current directory as
	# <accession>.gbff, and removes the intermediate zip and extracted directory.

	# Abort on the first failing command (or unset variable) so that a failed
	# download never reaches the mv / rm -rf steps below.
	set -eu

	# Without an accession the mv paths would collapse to
	# "ncbi_dataset/data//genomic.gbff" and ".gbff", so refuse to run.
	if [ -z "${1:-}" ]; then
		echo "usage: $0 <genome_accession>" >&2
		exit 1
	fi

	genome_accession=$1

	# Quote every expansion so an unexpected character in the accession cannot
	# be word-split or glob-expanded by the shell.
	datasets download genome accession "${genome_accession}" --include gbff
	unzip ncbi_dataset.zip
	mv "ncbi_dataset/data/${genome_accession}/genomic.gbff" "${genome_accession}.gbff"
	# Clean up ncbi_dataset.zip and the extracted ncbi_dataset/ tree.
	rm -rf ./ncbi_dataset*

	## returns the consensus sequence of a bam
	## minimum 3x depth of coverage at a site required
	import pysam
	import sys
	import pandas as pd
	import argparse
	from Bio import SeqIO
	import numpy as np
	from collections import defaultdict
	import pandas as pd

	import pysam
	bamfile = pysam.AlignmentFile('file.bam')

	for read in bamfile.fetch():
	number_of_mismatches = read.get_tag("NM")
	read_length = read.infer_query_length()
	read_percent_id = (1 - float(number_of_mismatches) / float(read_length)) * 100

	# if you want a specific read
	if read.query_name == 'my_read':

	import pysam
	import sys
	import pandas as pd
	import argparse
	from Bio import SeqIO
	import numpy as np
	from collections import defaultdict
	import pandas as pd

	P2C = {'A':0, 'C':1, 'T':2, 'G':3}

	import glob
	from Bio import SeqIO
	for fn in glob.glob('./antiSMASH//cluster*.gbk'):
	genome = fn.split("/")[-2]
	cluster_num = fn.split("cluster")[1].split(".")[0]
	i = 0
	for record in SeqIO.parse(fn, 'genbank'):
	for feature in record.features:
	if feature.type == 'CDS':
	print(">" + genome + "\|" + cluster_num + "\|" + str(i) + "\|" + str(feature.location.start) + ":" + str(feature.location.end))

	## DESeq2 made as easy as it should have always been, but for some reason isn't.
	## Code based on: https://gist.github.com/stephenturner/f60c1934405c127f09a6

	library('DESeq2')

	'''
	Our starting CSV/TSV looks like:
	Sample Pairing1 Pairing2 Pairing3 Status KXB65094.1 KXB65950.1 KXB67202.1 ....
	1 a Active Active Positive Active 0 0 1
	2 b Active Active Positive Active 0 1 1

	from Bio import SeqIO
	from Bio.SeqRecord import SeqRecord
	from random import randint
	from Bio.SeqUtils import GC
	import sys
	#There should be one and only one record, the entire genome:
	print "reading"
	mito_record = SeqIO.read(open(sys.argv[1]), "fasta")

	gcs = []

	# Import the random forest package
	from sklearn.ensemble import RandomForestClassifier
	from sklearn import cross_validation
	import numpy as np
	dataset = np.loadtxt('training_data.csv', delimiter=",")

	# Create the random forest object which will include all the parameters
	# for the fit
	forest = RandomForestClassifier(n_estimators = 100)

	import os
	from Bio import SeqIO
	from Bio.SeqRecord import SeqRecord
	from random import randint

	j = 1
	import sys
	for root, subdirs, files in os.walk(sys.argv[1]):
	for f in files:
	seq = os.path.join(root,f)