Last active
June 6, 2018 02:12
-
-
Save hliang/d2aa8b6e07d136492c81b452cc02b795 to your computer and use it in GitHub Desktop.
misc R scripts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(zoo) | |
# library("lubridate") | |
collectl_fn = list.files("D:/tmp/abruijn/", patter=".*.p.txt", full.names = TRUE); collectl_fn | |
zcolall = list() | |
slurmout_all = list() | |
for (i in seq_along(collectl_fn)) { | |
jobid = gsub(".*mon_(\\d+)-(cn\\d+).*.p.txt", "\\1", collectl_fn[i]) | |
nodeid = gsub(".*mon_(\\d+)-(cn\\d+).*.p.txt", "\\2", collectl_fn[i]) | |
dcol = read.table(collectl_fn[i]) | |
colnames(dcol) = unlist(strsplit("Date Time [CPU]User% [CPU]Nice% [CPU]Sys% [CPU]Wait% [CPU]Irq% [CPU]Soft% [CPU]Steal% [CPU]Idle% [CPU]Totl% [CPU]Guest% [CPU]GuestN% [CPU]Intrpt/sec [CPU]Ctx/sec [CPU]Proc/sec [CPU]ProcQue [CPU]ProcRun [CPU]L-Avg1 [CPU]L-Avg5 [CPU]L-Avg15 [CPU]RunTot [CPU]BlkTot [MEM]Tot [MEM]Used [MEM]Free [MEM]Shared [MEM]Buf [MEM]Cached [MEM]Slab [MEM]Map [MEM]Anon [MEM]AnonH [MEM]Commit [MEM]Locked [MEM]SwapTot [MEM]SwapUsed [MEM]SwapFree [MEM]SwapIn [MEM]SwapOut [MEM]Dirty [MEM]Clean [MEM]Laundry [MEM]Inactive [MEM]PageIn [MEM]PageOut [MEM]PageFaults [MEM]PageMajFaults [MEM]HugeTotal [MEM]HugeFree [MEM]HugeRsvd [MEM]SUnreclaim [NET]RxPktTot [NET]TxPktTot [NET]RxKBTot [NET]TxKBTot [NET]RxCmpTot [NET]RxMltTot [NET]TxCmpTot [NET]RxErrsTot [NET]TxErrsTot [DSK]ReadTot [DSK]WriteTot [DSK]OpsTot [DSK]ReadKBTot [DSK]WriteKBTot [DSK]KbTot [DSK]ReadMrgTot [DSK]WriteMrgTot [DSK]MrgTot", " ")) | |
head(dcol) | |
dcol$datetime = paste(dcol$Date, dcol$Time) | |
dcol$datetime = strptime(dcol$datetime, "%Y%m%d %H:%M:%S", tz = "Hongkong") | |
#dcol$datetime = with_tz(dcol$datetime, tzone = "GMT") | |
x = dcol | |
x[, c("Date", "Time", "datetime")] = NULL | |
zcol = zoo(x, order.by=dcol$datetime) | |
zcolall[[jobid]] = zcol | |
## slurm-xxx.out | |
slurmout_fn = list.files("D:/tmp/abruijn/", patter=paste0("slurm-", jobid, ".out"), full.names = TRUE); slurmout_fn | |
if(length(slurmout_fn)==1) { | |
library(stringr) | |
op <- options(digits.secs=3) | |
x = scan(slurmout_fn, what="", sep="\n") | |
x = grep(pattern="INFO", x=x, value=TRUE) | |
#x = data.frame(str_split_fixed(x, " ", 4), stringsAsFactors=FALSE) | |
x = data.frame(str_split_fixed(x, "[\\[\\]] *", 3), stringsAsFactors=FALSE) | |
x[, 1] = NULL | |
colnames(x) = c("datetime", "event") | |
x$datetime = paste0(x$datetime, ".", sprintf("%03d", seq_along(x$datetime))) # add milliseconds, to avoid duplicate timestamp | |
tmp = strptime(x$datetime, "%Y-%m-%d %H:%M:%OS", tz = "Hongkong") | |
x$datetime = tmp | |
x$event = gsub(pattern="INFO: ", replacement="", x=x$event) | |
zevent = zoo(x[, "event", drop=FALSE], order.by=x$datetime) | |
options(op) | |
} else { | |
stop("Need one slurm-xxx.out, but found ", length(slurmout_fn), ":\n", slurmout_fn) | |
} | |
slurmout_all[[jobid]] = zevent | |
} | |
head(zcol) | |
head(zevent) | |
########################################################## | |
## plot CPU | |
colsToPlot = c("[CPU]Totl%", "[CPU]User%", "[CPU]Sys%", "[CPU]Wait%") | |
plot.zoo(zcol[, colsToPlot], plot.type = "single", col=seq_along(colsToPlot), ylim=c(0, 100), ylab="CPU usage %", xlab="min", xaxt="n", type="s") | |
title(sub=paste0("job=", jobid, " node=", nodeid)) | |
legend(x = "topright", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot)) | |
tick_interval = as.difftime("00:01:00") # 1 minutes | |
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); #tick_pos | |
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=NA, tcl=-0.25 ) | |
tick_interval = as.difftime("00:05:00") # 5 minutes | |
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); #tick_pos | |
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5) | |
## plot memory | |
#colsToPlot = c("[MEM]Tot", "[MEM]Used", "[MEM]Free", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached") | |
colsToPlot = c("[MEM]Used", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached") | |
plot.zoo(zcol[, colsToPlot]/1E6, plot.type = "single", col=seq_along(colsToPlot), ylab="Memory /GB", xlab="min", xaxt="n", type="s") | |
title(sub=paste0("job=", jobid, " node=", nodeid)) | |
legend(x="topright", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot)) | |
# legend(x="top", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot), ncol=ceiling(length(colsToPlot)/2), inset=c(0.0,-0.12), xpd=TRUE) | |
tick_interval = as.difftime("00:01:00") # 1 minutes | |
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos | |
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=NA, tcl=-0.25 ) | |
tick_interval = as.difftime("00:05:00") # 5 minutes | |
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos | |
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5) | |
########################################################## | |
## plot multiple (cpu and mem) in one figure | |
par(mar=c(5, 4, 5, 4)+0.1) | |
colsToPlot = c("[CPU]Totl%", "[CPU]User%", "[CPU]Sys%", "[CPU]Wait%") | |
plot.zoo(zcol[, colsToPlot], plot.type = "single", col=seq_along(colsToPlot), ylim=c(0, 100), xaxt="n", yaxt="n", ann=FALSE, type="s") | |
axis(side=2) | |
mtext(text="CPU %", side=2, line=2) | |
legend(x = "topleft", legend = colsToPlot, bty = "n", lty = 1, col = seq_along(colsToPlot)) | |
tick_interval = as.difftime(1, units=attr(diff(range(time(zcol))), "units")) | |
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos | |
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5) | |
# tick_interval = as.difftime("01:00:00") # 1 hours | |
# tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos | |
# axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=NA, tcl=-0.25 ) | |
# | |
# tick_interval = as.difftime("05:00:00") # 5 hours | |
# tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos | |
# axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5) | |
title(sub=paste0("job=", jobid, " node=", nodeid), xlab=paste0(" ", attr(tick_interval, "units"))) | |
colsToPlot = c("[MEM]Used", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached") | |
par(new = TRUE) | |
plot.zoo(zcol[, colsToPlot]/1E6, plot.type = "single", col=seq_along(colsToPlot), xaxt="n", yaxt="n", ann=FALSE, type="s", lty=2) | |
legend(x = "topright", legend = colsToPlot, bty = "n", lty = 2, col = seq_along(colsToPlot)) | |
axis(side = 4) | |
mtext(text="Memory /GB", side=4, line=2) | |
########################################################## | |
## milestone marks | |
axis.POSIXct(time(zevent), side=3, at=time(zevent), labels=zevent$event, las=2, cex.axis=0.6) | |
########################################################## | |
## prefer simple plot | |
########################################################## | |
pdf("~/abruijn/abruijn_arabidopsis.pdf", width=10, height=7) | |
par(mar=c(5, 4, 5, 4)+0.1) | |
rowsToPlot = seq(1, nrow(zcol), by=12) | |
colsToPlot = c("[CPU]Totl%") | |
colsToPlot = c("[CPU]Totl%", "[CPU]User%", "[CPU]Sys%", "[CPU]Wait%") | |
plot.zoo(zcol[rowsToPlot, colsToPlot], plot.type = "single", col=seq_along(colsToPlot), ylim=c(0, 100), ylab="CPU %", xlab="", xaxt="n", type="s") | |
title(sub=paste0("job=", jobid, " node=", nodeid)) | |
legend(x = "topright", legend = colsToPlot, bty = "n", lty = c(1), col = seq_along(colsToPlot)) | |
# | |
# plot.zoo(zcol[rowsToPlot, colsToPlot], plot.type = "single", col="red", ylim=c(0, 100), xaxt="n", yaxt="n", ann=FALSE, type="s", lty=1) | |
# axis(side=2) | |
# mtext(text="CPU %", side=2, line=2) | |
# legend(x = "topleft", legend = colsToPlot, bty = "n", lty = 1, col = "red") | |
tick_interval = as.difftime(1, units=attr(diff(range(time(zcol))), "units")) | |
tick_interval = as.difftime("06:00:00") # hours | |
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos | |
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5) | |
title(sub=paste0("job=", jobid, " node=", nodeid), xlab=paste0(" ", attr(tick_interval, "units"))) | |
colsToPlot = c("[MEM]Used") | |
colsToPlot = c("[MEM]Used", "[MEM]Shared", "[MEM]Buf", "[MEM]Cached") | |
#par(new = TRUE) | |
plot.zoo(zcol[rowsToPlot, colsToPlot]/1E6, plot.type = "single", col=seq_along(colsToPlot), xaxt="n", yaxt="n", ann=FALSE, type="s", lty=1) | |
legend(x = "topright", legend = colsToPlot, bty = "n", lty = 1, col = seq_along(colsToPlot)) | |
axis(side = 4) | |
mtext(text="Memory /GB", side=4, line=2) | |
tick_interval = as.difftime(1, units=attr(diff(range(time(zcol))), "units")) | |
tick_interval = as.difftime("06:00:00") # hours | |
tick_pos = seq(min(time(zcol)), max(time(zcol)), by = tick_interval); tick_pos | |
axis.POSIXct(time(zcol), side=1, at=tick_pos, labels=(seq_along(tick_pos)-1)*tick_interval, tcl=-0.5) | |
title(sub=paste0("job=", jobid, " node=", nodeid), xlab=paste0(" ", attr(tick_interval, "units"))) | |
dev.off() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
set.seed( 1) #设置随机种子 | |
dates<- as.Date( '2010-01-01')+ 1: 100#100个日期 | |
x<-round(rnorm( 100, 50, 40), 2) #随机生成X产品,100个正态分析的收盘价 | |
y<-round(rnorm( 100, 50, 40), 2) #随机生成Y产品,100个正态分析的收盘价 | |
df<-data.frame(dates,x,y) | |
df | |
library(ggplot2) | |
library(scales) | |
library(reshape2) | |
df2<-melt(df,c('dates')) | |
g<-ggplot(data=df2,aes(x=dates,y=value,colour=variable)) | |
g<-g+geom_line() | |
g<-g+scale_x_date(date_breaks = "1 week",date_labels='%m-%d') | |
g<-g+labs(x='date',y='Price') | |
g | |
# 计算差价 | |
df$diff<-df$x-df$y | |
# 找到差价大于10时的点 | |
idx<-which(df$diff> 10) | |
idx<-idx[-which(diff(idx)== 1)- 1] | |
# 打印差价的索引值 | |
idx | |
# 接下来,我们进行模拟交易,取第一个索引值的点,在2010-01-04时做空X,做多Y。当差价小于0在2010-01-06时,进行平仓。 | |
# 打印前20个数据 | |
head(df, 20) | |
# 当差价大于10时,做空X,当差价小于0时,平仓。 | |
# 第4行做空,第6行平仓 | |
xprofit<- df$x[4]-df$x[6]; xprofit | |
# 当差价大于10时,做多Y;当差价小于0时,平仓。 | |
# 第4行做空,第6行平仓 | |
yprofit<- df$y[6]-df$y[4]; yprofit | |
# 这是为什么呢? | |
# 根据配对交易的假设条件,如果两个金融产品的价差是收敛的,通过协整性检验的方法,我们可验证数据的收敛性。那么如果数据是收敛的,他还会具备均值回归的特性,请参考文章 均值回归,逆市中的投资机会。 | |
# 画出X,Y的价差图,我们可以明显的看出,价差一直围绕着0上下波动,这是明显收敛的,同时符合均值回归的特性。 | |
plot(df$diff, type= 'l') | |
x = data.frame(var01=c("十全大补丸[成分]甲、乙、丙", "含笑半步颠[成分]丁、戊、己"), | |
var02=c("haha", "hehe"), stringsAsFactors=FALSE) | |
gsub("\\[.*", "", x$var01) | |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# converting rice gene ids | |
library("biomaRt") | |
listMarts(host="plants.ensembl.org") | |
ensembl=useMart("plants_mart", host="plants.ensembl.org") | |
x = listDatasets(ensembl) | |
ensembl = useDataset("osativa_eg_gene", mart=ensembl) | |
att = listAttributes(ensembl) | |
att | |
ensIDs = c("OS01G0232100", "OS04G0691100", "OS02G0475400") | |
refseqPepIDs = c("XP_015633962.1", "XP_015635635.1", "XP_015623178.1", "XP_015614092.1", "XP_015614100.1") | |
myres1 = getBM(attributes=c('ensembl_gene_id', 'refseq_peptide'), | |
filters = 'ensembl_gene_id', | |
values = ensIDs, | |
mart = ensembl) | |
myres2 = getBM(attributes=c('ensembl_gene_id', 'refseq_peptide'), | |
filters = 'refseq_peptide', | |
values = refseqPepIDs, | |
mart = ensembl) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
rm(list=ls()) | |
############################################################## | |
## summarize how many visits does each user have each day | |
## simulate data | |
set.seed(1) | |
n = 800 # total number of requests in the data set | |
uid = sample(LETTERS[1:20], n, replace=TRUE) # user id | |
st = Sys.time(); st # pick a start time for the data set | |
et = st + as.difftime(10, units="days") # pick an end time for the data set | |
dt = et - st; #dt # overall duration of the dataset | |
ev = as.difftime(sort(runif(n, 0, dt)), units="days"); ev # generate random difftime | |
requesttime = st + ev; head(requesttime) # generate random start time for each request | |
requestDate = as.Date(requesttime, tz=Sys.timezone()); head(requestDate) # start date | |
df = data.frame(requestDate, uid, requesttime) # put them in one data frame | |
head(df); dim(df) | |
#df | |
## set cutoff. if the time interval of two consecutive request is less than cutoff, they are deemed the same visit | |
cutoff = as.difftime(1, units="hours") | |
## dataframe 'visits' summarize how many visits a user have on each day | |
visits = aggregate(df$requesttime, by=list(uid=df$uid, requestDate=df$requestDate), FUN=function(x){sum(diff(x) > cutoff) + 1}); head(visits) | |
#res | |
# plot(x~requestDate, data=res, type="l") | |
# plot | |
library(ggplot2) | |
(p1 <- ggplot(data = visits, aes(x = requestDate, y = x, colour = uid)) + geom_line() + geom_point()) | |
## top frequent users | |
# mean2 = mean(visits$x) * 1.2; mean2 | |
totalVisitsPerUser = aggregate(visits$x, by=list(uid=visits$uid), FUN=function(x){sum(x)}) | |
# top 10% users | |
totalVisitsPerUser = totalVisitsPerUser[order(totalVisitsPerUser$x, decreasing=TRUE), ] | |
topUsers = totalVisitsPerUser[1:floor(nrow(totalVisitsPerUser) * 0.10), ] | |
# aggregate(visits$x, by=list(uid=visits$uid), FUN=function(x){mean(x)}) | |
# aggregate(visits$x, by=list(uid=visits$uid), FUN=function(x){mean(x)>mean2}) | |
(p1 <- ggplot(data = visits[visits$uid==topUsers$uid, ], aes(x = requestDate, y = x, colour = uid)) + geom_line() + geom_point()) | |
###################################################### | |
last_raw = read.table("F:/last/last_ln0", header=FALSE, stringsAsFactors=FALSE, nrows=13794) | |
last_raw$time = paste(last_raw$V4, last_raw$V5, last_raw$V6, last_raw$V7); #last_raw$time | |
last_raw$time = strptime(last_raw$time, format="%a %b %e %R") | |
last_raw$date = as.Date(last_raw$time, tz=Sys.timezone()) | |
last = data.frame(time=last_raw$time, date=last_raw$date, user=last_raw$V1, tty=last_raw$V2, remote=last_raw$V3) | |
#last = last[rev(seq_len(nrow(last))), ] | |
last = last[order(last$time), ] | |
head(last); tail(last); dim(last) | |
## set cutoff. if the time interval of two consecutive request is less than cutoff, they are deemed the same visit | |
cutoff = as.difftime(30, units="mins") | |
## dataframe 'visits' summarize how many visits a user have on each day | |
visits = aggregate(last$time, by=list(uid=last$user, date=last$date), FUN=function(x){sum(abs(diff(x)) > cutoff) + 1}); head(visits) | |
plot(x~date, data=visits) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment