zychen2016

Introduction to the Linux command line, IPython, Git, and R for bioinformatics:

	#' Convert counts to transcripts per million (TPM).
	#'
	#' Convert a numeric matrix of features (rows) and conditions (columns) with
	#' raw feature counts to transcripts per million.
	#'
	#' Lior Pachter. Models for transcript quantification from RNA-Seq.
	#' arXiv:1104.3889v2
	#'
	#' Wagner, et al. Measurement of mRNA abundance using RNA-seq data:
	#' RPKM measure is inconsistent among samples. Theory Biosci. 24 July 2012.

	## RNA-seq analysis with DESeq2
	## Stephen Turner, @genetics_blog

	# RNA-seq data from GSE52202
	# http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=gse52202. All patients with
	# ALS, 4 with C9 expansion ("exp"), 4 controls without expansion ("ctl")

	# Import & pre-process ----------------------------------------------------

	# Import data from featureCounts

	import argparse
	import textwrap
	import os
	import sys
	from datetime import timedelta, datetime


	# function for reading a multifasta file
	# returns a dictionary with sequence headers and nucleotide sequences
	def get_seqs_from_fasta(filepath):

	# Script to compare Reads per Kilobase per Million mapped reads (RPKM) to Transcripts per Million (TPM) for gene expression count data
	# Wagner et al. 2012 "Measurement of mRNA abundance using RNA-seq data: RPKM measure
	# is inconsistent among samples" Theory Biosci. 131:281-285

	library(plyr)


	## Worked example from http://blog.nextgenetics.net/?e=51

	X <- data.frame(gene=c("A","B","C","D","E"), count=c(80, 10, 6, 3, 1),


	aln_snps = {}
	for aln in aln_files:
	recs = [f for f in SeqIO.parse(aln, 'fasta')]
	# strain names should be the last dash delimited element in fasta header
	strains = [rec.name.split('-')[-1] for rec in recs]
	# get a dictionary of strain names and sequences
	strain_seq = {rec.name.split('-')[-1]:''.join([nt for nt in rec.seq]) \
	for rec in recs}
	# get the length of the MSA and check that all of the sequences are the same length

	# Declare the S4 generic `calcFPKMs`. Dispatch happens on `counts`; any
	# further arguments are passed through `...` to the selected method.
	setGeneric(
	  "calcFPKMs",
	  function(counts, ...) {
	    standardGeneric("calcFPKMs")
	  }
	)

	setMethod("calcFPKMs", c("GRanges"),
	function(counts, verbose = TRUE)
	{
	counts.df <- as.data.frame(counts)
	counts.cols <- metadata(counts)[["counts.cols"]] + 5

	# Only use read counts from the known transcriptome.
	counts.df <- counts.df[counts.df[, "type"] %in% c("exon", "junction"), ]