google n-gram
donot do
python
GATK java
recommand book An Introduction to Statistical Learning with Applications in R
CTAN CPAN CRAN Biocondutor
yum install -y epel-release
yum install R
download https://download1.rstudio.org/rstudio-1.0.44-x86_64.rpm
inHybrid | Hybrid | compiled |
---|---|---|
Python | Java | C |
R | Perl | C++ |
Bash |
su -c "yum install rstudio-1.0.44-x86_64.rpm"
#!/usr/bin/Rscript
cat("hello world\n")
export PATH=$PATH=~/ourbin
getwd()
setwd("/home/cb2user/Download")
a<-2
a + 2
b <- 3
myfunction1<-function(x){
cat(x)
}
x<-"Hello World\n"
myfunction1(x)
x=100
myfunction1<-function(x){
v=rnorm(x)
mean(v)
}
myfunction1(x)
x=100
myfunction=function(x){
v<<-rnorm(x)
mean(v)
}
myfunction(x)
v
http://stackoverflow.com/questions/10904124/global-and-local-variables-in-r
myfunction2<-function(y){
cat(y)
}
rseek.org
killall rstudio
install.packages("ggplot2")
vector()
list()
x=1:20
x=5:50
class(x)
x[1]
y=c("a","b","c")
class(y)
p=c("1","2","3")
class(p)
z=c(1,2,3)
class(z)
y[1]
list()
l=c(1,"a",2,"b")
c() # Combine Values into a Vector or List
l=list(1,"a",2,"b")
y[3]="d"
l[[3]]=function(){cat("hello")}
l[[3]]()
l=list()
for(i in 1:10){
cat(i,"\n")
l[[i]]=i
}
for(i in 1:length(l)){
cat(l[[i]],"\n")
}
words=c("hello","world")
for(i in words){
cat(i," ")
}
words=c("Hello","and hi","World")
l=list()
for(i in 1:length(words)){
if(i==3 && (words[i]=="World" || words[i]=="world")){
l[[i]]="America!"
}else{
l[[i]]=words[i]
}
}
for(i in l){
cat(i," ")
}
8bit, 16bit, linked list
class(v)
v=list("a"="A","b"="B","c"=c(1,2,3))
typeof(v)
t=TRUE
f=FALSE
n=c(1:10)
n[n>5|n<2]
n=1:20
n[n>=10& n<=15]
n[n!=5 & n!=6]
n[n!=c(5,6)]
n[!(n%in%c(5,6))]
f=factor(c("yes","yes","no"))
table(f)
as.character(n)
m=matrix(n,nrow=2)
matrix(1:20,byrow = T,nrow=2)
n=1:10
dim(n)=c(2,5)
m[1,]
m[,2]
m[,-4]
http://www.programiz.com/r-programming/precedence-associativity
https://stat.ethz.ch/R-manual/R-devel/library/base/html/Logic.html
https://stat.ethz.ch/R-manual/R-devel/library/base/html/Syntax.html
data.frame data.table
dplyr
d=data.frame("X"=c(1,3,3),Y=c(5,6,7))
d[,1]
d$X
d=data.frame("X"=c(1,3,3),Y=c(5,6,7),"Z"=c(T,T,F))
data(iris)
View(iris)
dim(iris)
write.table(iris,"/home/cb2user/iris.txt",quote=F,row.names=F)
cat iris.txt|column -t
myiris=read.table("iris.txt",header=T)
gzip iris.txt
zmyiris=read.table("iris.txt.gz",header=T,na.strings="-")
test=c(1,2,NA,3)
sum(test)
is.na(test)
test=test[!is.na(test)]
sum(test)
NA, NaN
\n \d \s \S \d+ plus means one or more (.*)
^>gi|(\d+)|ref|(\S+)| $
fh=file("/media/sf_CB2/dm/dm.faa",open="r")
while (length (line=readLines(fh,warn=F,n=1))>0){
pattern="^\\>gi\\|(\\d+)\\|ref\\|(\\S+)\\|"
m=regexec(pattern,line,perl=T)
v=regmatches(line,m)
vec=v[[1]]
cat(vec[2],vec[3],sep="\t")
}
close(fh)
str(v)
fh=file("/media/sf_CB2/dm/dm.faa",open="r")
while (length (line=readLines(fh,warn=F,n=1))>0){
pattern="^\\>gi\\|(\\d+)\\|ref\\|(\\S+)\\|"
#m=regexec(pattern,line,perl=T)
#m=mymatch(pattern,line)
#if(m[[1]][1]==-1){
# next
#}
#if(!(ifmatched(m))){
# next
#}
m=mymatch(commapattern,line)
if(!ifmatched(m)){
my=mymatch(pattern,line)
}
if(!(ifmatched(m))){
next
}
cat(getstring(line,m,2))
cat("\n")
#v=regmatches(line,m)
#vec=v[[1]]
#cat(vec[2],vec[3],sep="\t")
#cat("\n")
}
close(fh)
pattern="\\|\\s+(.\*)\\s+\\["
commapattern="\\|\\s+(.\*)\\,\\s+\\["
getstring=function(text,match,index){
v=regmatches(text,match)
vec=v[[1]]
return(vex[[index]])
}
mymatch=function(pattern,text){
m=regexec(patter,text,perl=T)
return(m)
}
ifmatched=function(m){
if(m[[1]][1]==-1){
return(FALSE)
}else{
return(TRUE)
}
}
v=rnorm(100)
median(v)
mean(v)
hist(v)
normal distribution, p-value CVS SVN GIT
su -c "yum install git"
git clone https://github.com/cb2edu/CB2-101-BioComp.git
mkdir my_git_project
cd my_git_project
git init
echo "hello" > hello.txt
git add hello.txt
git status
git config --global user.email "[email protected]"
git config --global user.name "username"
git commit -m "Added new file Hello.txt"
git log
md5sum hello.txt > md5sum.txt
git add hello.txt
git add md5sum.txt
git status
git log -a
git checkout 2c2f1
n=1:10
for(i in n){
cat(i)
cat("\n")
}
if (!length(line)>0){
next
}
data("airquality")
mean(airquality$Ozone)
mean(airquality$Ozone[is.na(airquality$Ozone)])
d=airquality$Ozone
index=is.na(airquality$Ozone)
clean_idx=!index
clean_value=d[clean_index]
mean(clean_value)
mean(airquality$Ozone,na.rm=T)
summary(airquality)
normal distribution always mean=median
plot(airquality)
plot(airquality$Temp,airquality$Ozone, xlab="Temp", ylab="", main="Temp vs Ozone")
plot(airquality$Temp,airquality$Ozone,type="n",xlab="",ylab="",axes=F)
points(airquality$Temp,airquality$Ozone,pch=16)
axis(1)
axis(2)
box()
title(main="Temp vs Ozone",xlab="Temp",ylab="Ozone")
legend("topleft",legend="Some points",pch=16)
par(mfrow=c(1,2))
plot(airquality$Temp,airquality$Ozone, xlab="Temp", ylab="", main="Temp vs Ozone")
plot(airquality$Temp,airquality$Ozone, xlab="Temp", ylab="", main="Temp vs Ozone")
#plot(airquality$Temp,airquality$Ozone,type="n",xlab="",ylab="",axes=F)
#plot(airquality$Temp,airquality$Ozone,type="o",xlab="",ylab="",axes=F)
#plot(airquality$Temp,airquality$Ozone,type="l",xlab="",ylab="",axes=F)
#plot(airquality$Temp,airquality$Ozone,type="p",xlab="",ylab="",axes=F)
#plot(airquality$Temp,airquality$Ozone,type="p",xlab="",ylab="",axes=T)
difference between vector and bitmap eps ps pdf svg
pdf("test.pdf",paper="letter")
plot(airquality$Temp,airquality$Ozone, xlab="Temp", ylab="", main="Temp vs Ozone")
dev.off()
png()
svg()
su -c "yum install evince"
hist(airquality$Ozone)
hist(airquality$Ozone, breaks=50)
boxplot(airquality)
MAD Outliers: http://www.stat.wmich.edu/s160/book/node8.html
data("mtcars")
View(mtcars)
counts=table(mtcars$gear)
barplot(counts,xlab="Gears",ylab="Freq",main="Grears",horiz=T)
counts=table(mtcars$cyl,mtcars$gear)
barplot(counts,beside=T,legend=rownames(counts))
rownames(counts)
rnorm(10)
rnorm(10,sd=1,mean=5)
data=list(a=rnorm(10,sd=1,mean=5),b=rnorm(10,sd=1,mean=5),c=rnorm(10,sd=1,mean=5))
data=list(rnorm(10,sd=1,mean=5),rnorm(10,sd=1,mean=5),rnorm(10,sd=1,mean=5))
stds=c()
means=c()
sd(data[[1]])
for(i in 1:3){
data[[i]]=rnorm(10,mean=5,sd=1)
stds[i]=sd(data[[i]])
means[i]=mean(data[[i]])
}
bp=barplot(means,names=c(1:3),ylim=c(0,ceiling(max(means+stds))),col=nice) # median points
arrows(x0=bp,y0=means-stds,x1=bp,y1=means+stds, code=3,angle=90)
install.packages("RColorBrewer")
library("RColorBrewer")
display.brewer.all()
nice=brewer.pal(3,"Pastel2")
https://en.wikipedia.org/wiki/Johannes_Gutenberg
https://en.wikipedia.org/wiki/Hermann_Zapf
https://en.wikipedia.org/wiki/Donald_Knuth
https://en.wikipedia.org/wiki/Leslie_Lamport
pdfTeX https://en.wikipedia.org/wiki/PdfTeX
XeTeX https://tug.org/interviews/kew.html
pandoc markdown
\documentclass{article} \title{My first document in \LaTeX} \author{Malay}
\begin{document} \maketitle \section{First section}
\subsection{A subsection} blablabla
\section{Second section} Second document
Math
\end{document}
su -c "yum install pandoc"
this is first section.
This is second section
blablablablablabla
pandoc --number-sections -o hello_markdown.pdf hello.md
pandoc -o hello_markdown.docx hello.md
Zotero
https://guides.github.com/features/mastering-markdown/
su -c "yum install texlive-framed" su -c "yum install texlive-titling"
regression gradient descent
data(women)
head(women)
dim(women)
plot(women)
m=lm(women$weight~women$height)
#regression(prediction,dependence of x y) and correlation(翻转xy不变) difference
s=summary(m)
abline(v=65,col="Blue")
abline(h=140,col="Green")
abline(m,col="red")
s$adj.r.squared
s$coefficients
s$coefficients[2,1]
s$coefficients[1,1]
d=data.frame()
data("airquality")
ncol=dim(airquality)[2]
names=colnames(airquality)
for(i in 1:ncol){
for(j in 1:ncol){
if(i==j){
next
}
x=airquality[,i]
y=airquality[,j]
m=lm(y~x)
s=summary(m)
r.sq=s$adj.r.squared
d=rbind(d,data.frame(names[i],names[j],r.sq))
cat(names[i],names[j],r.sq,sep="\t")
cat("\n")
}
}
o=order(d$r.sq,decreasing=T)
d=d[o,]
lm(airquality$Ozone~airquality$Temp)
# http://stackoverflow.com/questions/10300769/how-to-load-packages-in-r-automatically
library(pheatmap)
index=complete.cases(airquality)
c=airquality[index,]
C=cor(c)
pheatmap(C)
ct=cor.text(airquality$Ozone,airquality$Temp)
wget ftp://ftp.uniprot.org/pub/databases/uniprot/knowledgebase/uniprot_sprot.fasta.gz
homoplasy homology
entroy KL divergence Bayes chain Markov HMM
hmmer
wget ftp://ftp.ncbi.nlm.nih.gov/blast/executables/legacy/2.2.26/blast-2.2.26-x64-linux.tar.gz
tar and zip
tar cvzf cvJf cvjf
c for creation x for unzip
tar zxvf blast-2.2.26*
export PATH=$PATH:pwd
http://www.uniprot.org/uniprot/P04637.fasta
wget -O p53.fas http://www.uniprot.org/uniprot/P04637.fasta
formatdb -i uniprot_sprot.fasta -n uniprot
blastall -pblastp -i p53.fas -d uniprot -o p53.blastp.out -m 9
e-value is based on the database size.
wget ftp://ftp.ncbi.nlm.nih.gov/pub/HomoloGene/current/homologene.data
d=read.table("/media/sf_CB2/hmm/homologene.data",sep="\t") cluster=d[d$V1==460,] write.table(cluster,"p53_homologene.txt",sep="\t",quote=F,row.names=F)
\n break
REST API
#!/bin/bash
file=$1
for i in cat $file | cut -f 6
do
wget -q -O - "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=$i&rettype=fasta&retmode=text"
done
./download_Seq.sh p53_homologene.txt > p53.fa
evince & # program running in the background
fg bring the software to the frontground
muscle
export PATH=$PATH:pwd
muscle -in p53.fa -out p53.aln
pushd popd
hmmbuikld --informat afa p53.hmm p53.aln
ln -s /media/sf_CB2/uniprot/uniprot_sprot.fasta .
hmmsearch -o hits.txt /media/sf_CB2/hmm/p53.hmm uniprot_sprot.fasta
su -c "yum install libxml2 libxml2-devel libcurl libcurl-devel"
source("https://bioconductor.org/biocLite.R")
biocLite("Homo.sapiens")
library("Homo.sapiens")
columns(Homo.sapiens)
keytypes(Homo.sapiens)
genename=keys(Homo.sapiens,keytype="SYMBOL")
gene.list=select(Homo.sapiens,keys=genenames,keytype="SYMBOL",columns=c("TXCHROM"))
gene.freq=table(gene.df$TXCHROM)
barplot(gene.freq,las=2)
library(GenomicFeatures)
chr.info=getChromInFromUCSC("hg19")
gene.freq=data.frame(gene.freq)
names(gene.freq)=c("chr","freq")
merged.data=merge(gene.freq,chr.info,by.x="chr",by.y="chrom")
plot(merged.data$length,merged.data$freq,main="Lenth vs gene count",xlab="Chromosome length",ylab="Gene count")
text(merged.data$length,merged.data$freq,cex=0.6,pos=4,col="red")
m=lm(merged.data$freq~merged.data$length)
summary(m)
abline(m)
cor.test(merged.data$length,merged.data$freq)
#!/usr/bin/Rscript
mymatch <- function (pattern, text) {
m <- regexec( pattern, text, perl = TRUE)
return(m)
}
ifmatched <- function (m) {
if (m[[1]][1] == -1) {
return(FALSE)
}else {
return(TRUE)
}
}
getstring <- function (text, match, index) {
v <- regmatches(text, match)
vec <- v[[1]]
return(vec[index])
}
###### MAIN ENTRY #############
cmd=commandArgs(trailingOnly=TRUE)
id=cmd[1]
filename=cmd[2]
fh <- file(filename,open="r")
inside=FALSE
while (length(line <- readLines(fh,warn=FALSE, n=1))>0){
idpattern="\\>(\\S+)\\s+"
#commapattern <- "\\|\\s+(.*)\\,\\s+"
#pattern <- "\\|\\s+(.*)\\s+\\["
m <- mymatch (idpattern, line)
if (!(ifmatched(m))) {
#next
if(inside){
seq=paste(seq,line)
}else{
next
}
}else{
#cat(getstring(line, m, 2))
fileis=getstring(line,m,2)
if(fileid==id){
seq=paste(seq,line)
inside=TRUE
}else{
if(inside){
cat(seq)
break
}
}
}
#cat("\n")
}
close(fh)
cat(seq)
#################
#!/usr/bin/Rscript
mymatch <- function (pattern, text) {
m <- regexec( pattern, text, perl = TRUE)
return(m)
}
ifmatched <- function (m) {
if (m[[1]][1] == -1) {
return(FALSE)
}else {
return(TRUE)
}
}
getstring <- function (text, match, index) {
v <- regmatches(text, match)
vec <- v[[1]]
return(vec[index])
}
###### MAIN ENTRY #########
cmd <- commandArgs(trailingOnly = TRUE)
id <- cmd[1]
filename <- cmd[2]
fh <- file(filename,open="r")
inside <- FALSE
seq <- c("")
while ( length(line <- readLines(fh,warn=FALSE, n=1))> 0 ){
idpattern <- "\\>(\\S+)\\s+"
#commapattern <- "\\|\\s+(.*)\\,\\s+"
#pattern <- "\\|\\s+(.*)\\s+\\["
m <- mymatch (idpattern, line)
if (!(ifmatched(m))) {
if (inside) {
seq <- paste(seq, line)
}else {
next
}
}else {
# cat (getstring(line, m, 2))
fileid <- etstring(line, m, 2)
if (fileid == id) {
seq <- paste(seq, line)
inside <- TRUE
}else {
if (inside) {
cat (seq)
break
}
}
}
}
close(fh)
cat (seq)
e=seq(0,60,10)
a=1-10^(-e/10)
plot(e,a,xlab="Phred score",ylab="Accuracy")
lines(e,a)
abline(v=20,col="red")
su -c "yum install java-1.8.0-openjdk"
download fastqc_v0.11.5.zip and install it
download trimmomatic and install it
jave -jar trimmomatic-0.36.jar PE adrenal_R1.fq adrenal_R2.fq adrenal_trimmed_R1.fq adrenal_R1_u.fq adrenal_trimmed_R2.fq adrenal_R2_u.fq ILLUMINACLIP:Trimmomatic-0.36/adapters/TruSeq3-PE.fa:2:30:10 LEADING:2 TRAILING:2 SLIDINGWINDOW:4:15 MINLEN:30
Sailfish: Rapid Alignment free Quantification of Isoform Abundance
transition transversion heraclitean fire chargaff
pyrimidines,purines
assembly is different from alignment
GATK and Freebayes
https://udall-lab.byu.edu/Research/Software/BamBam
file 文件名 #查看文件类型
yum provides aclocal
autogen.sh=>configure=>makefile=>make
perl script vcf-annotate (in vcftools)
RPKM 在样本之间不能比较
RSEM=> TPM value normalized the RPKM, but also cannot compared between multiple samples
样本间 edgeR TMM normalizion
DeSeq normalizion Up Quantile normalizaion 表达量数据是非线性的,所以用quantile的好,median是统计线性的数据
If there are only 2 different conditions among the samples, four statistics (columns) will be reported for each gene/transcript. They are "PPEE", "PPDE", "PostFC" and "RealFC". "PPEE" is the posterior probability (estimated by EBSeq) that a gene/transcript is equally expressed. "PPDE" is the posterior probability that a gene/transcript is differentially expressed. "PostFC" is the posterior fold change (condition 1 over condition2) for a gene/transcript. It is defined as the ratio between posterior mean expression estimates of the gene/transcript for each condition. "RealFC" is the real fold change (condition 1 over condition2) for a gene/transcript. It is the ratio of the normalized within condition 1 mean count over normalized within condition 2 mean count for the gene/transcript. Fold changes are calculated using EBSeq's 'PostFC' function. The genes/transcripts are reported in descending order of their "PPDE" values.
4 control 3 treatment,之间的fold change, 统计的是拟合的两条线之间的fold change