Last active
December 28, 2015 03:59
-
-
Save hillarysanders/7439581 to your computer and use it in GitHub Desktop.
hackbright example functions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###################### | |
# outliers: | |
# supah simple outlier killer. Also kills right tails. | |
##' @param trim trim what % of the data off each tail before calcuating the mean and sd. | |
##' @param z how many standard deviations will you allow observations to be away from the | |
##' mean before you consider it an outlier? | |
##' @param verbose do you like talkative functions? | |
##' @param flag.only logical (TRUE or FALSE). | |
##' Only flag the outliers as outliers, don't remove them. | |
trimmed.normal <- function(spec.df, trim=.05, z=4, verbose=TRUE, | |
flag.only=F){ | |
price <- spec.df$price | |
# get quantiles: | |
q <- quantile(price, probs=c(trim, 1-trim)) | |
# get trimmed mean and standard deviation: | |
m <- mean(subset(price, subset=(price > q[1] & price < q[2]))) | |
std.dev <- sd(subset(price, subset=(price >= q[1] & price <= q[2]))) | |
if(std.dev>0){ | |
remove.idx <- price > m+z*std.dev | price < m-z*std.dev | |
if(verbose==TRUE){ | |
print(paste("removed", sum(remove.idx), "out of", nrow(spec.df), "observations", | |
"from", spec.df$spec_name[nrow(spec.df)], ",", toupper(spec.df$country[1]))) | |
} | |
if(flag.only==F){ | |
spec.df <- spec.df[!remove.idx, ] | |
} else { | |
spec.df$outlier <- remove.idx | |
} | |
} | |
return(spec.df) | |
} | |
library(stringr) # could not use this if neccessary, but it is handy | |
##' unit.standardizer | |
##' not be obvious as to why unless the size units were being prominently shown. | |
##' @param spec.df price observation data.frame | |
##' @param verbose feeling talkative? | |
##' @param safety.first Only modify prices by size if the change brings the price | |
##' closer to the mean price? | |
##' @param coerce.unit.only | |
##' @param prep.only prep the size metadata, but do nothing else? Logical. | |
##' @param require.all.three only try to coerce to something if the most common types | |
##' of size, size_unit and quantity are non-NA. logical | |
##' @param require.all.for.conversion for each observation, only coerce if size, | |
##' size_unit, and quantity are non-NA. | |
##' @param country Only matters for China; they treat define certain units differently. | |
##' If china, let country = 'cn'. | |
unit.standardizer <- function(spec.df, verbose=FALSE, safety.first=FALSE, | |
coerce.unit.only=F, allow.crappy.common.units=TRUE, | |
require.all.three=F, require.all.for.conversion=FALSE, | |
only.return.most.common.unit=F, prep.only=FALSE, | |
country="notchina"){ | |
# so that stuff can be easily modified: | |
size <- as.character(spec.df$size) | |
unit <- as.character(spec.df$size_unit) | |
quantity <- as.character(spec.df$quantity) | |
price <- spec.df$price | |
spec.df$price.raw <- spec.df$price | |
# translate chinese | |
unit = sub("克", "g", unit) | |
unit = sub("公斤", "kg", unit) | |
unit = sub("毫升", "ml", unit) | |
unit = sub("升", "ltr", unit) | |
unit = sub("盎司", "oz", unit) | |
unit = sub("磅", "lb", unit) | |
unit = sub("斤", "catty", unit) | |
unit = sub("两", "tael", unit) | |
unit = sub("盒", "box", unit) | |
unit = sub("个", "pcs.", unit) | |
unit = sub("袋", "pack", unit) | |
size[size=="0"] <- NA | |
unit[unit=="0"] <- NA | |
quantity[quantity=="0"] <- NA | |
size[size==""] <- NA | |
unit[unit==""] <- NA | |
quantity[quantity==""] <- NA | |
size[size=="NULL"] <- NA | |
unit[unit=="NULL"] <- NA | |
quantity[quantity=="NULL"] <- NA | |
unit[grepl(unit, pattern="[/?]+")] <- NA | |
quantity[grepl(quantity, pattern="[/?]+")] <- NA | |
size[grepl(size, pattern="[/?]+")] <- NA | |
# throw away any character fluff (may want to conserve and compare instead in the future) | |
size <- gsub("[a-z]+", "", x=size) | |
size <- gsub(" ", "", x=size) | |
size <- gsub(",", ".", x=size) | |
size <- as.numeric(size) | |
size[size==0] <- NA | |
quantity <- gsub("[a-z]+", "", x=quantity) | |
quantity <- gsub(" ", "", x=quantity) | |
quantity <- as.numeric(quantity) | |
quantity[quantity==0] <- NA | |
# now that they are cleaner, replace spec.df size and quantity w/ these | |
spec.df$size <- size | |
spec.df$quantity <- quantity | |
spec.df$size_unit <- unit | |
if(prep.only==TRUE) return(spec.df) | |
if(allow.crappy.common.units==F){ | |
unit.idx <- ! unit %in% c("box", "pcs.", "pack") | |
} else { unit.idx <- 1:length(unit) } | |
most.common <- names(sort(table((paste(quantity, size, unit[unit.idx])), useNA='ifany'), decreasing=TRUE))[1] | |
most.common. <- str_split(most.common, " ")[[1]] | |
most.common.[most.common.=="NA"] <- NA | |
quantity.mode <- as.numeric(most.common.[1]) | |
size.mode <- as.numeric(most.common.[2]) | |
unit.mode <- most.common.[3] | |
# only work with workable data, yo. | |
if(require.all.three==TRUE){ | |
if(any(is.na(c(size.mode, unit.mode, quantity.mode)))) return(spec.df) | |
} | |
# NOTE THAT THESE MOST COMMON VALUES ARE POTENTIALLY UNSTABLE. | |
# FIXED IN PRODUCTIONIZED VERSION. | |
names <- c("catty", "g", "kg", "lb", "oz", "ltr", "ml", "l", "tael") | |
# if COUNTRY = CHINA, then catty is 500 g, not 600, and a tael is 10 catties, not 16. | |
if(country=="cn"){ | |
conversion_matrix <- matrix( | |
#"catty" "g" "kg" "lb" "oz" "ltr" "ml" "l" "tael" | |
c(1, 500, .5, 1.1025, 17.64, .5 , 500, .5, 10, #catty | |
1/500, 1, .001, 0.002205, 0.03527, .001, 1, .001, 10/500, #g | |
1/.5, 1000, 1, 2.205, 35.27, 1, 1000, 1, 10/.5, #kg | |
1/1.1025, 0, 1/2.205, 1, 16, .4536, 453.6, .4536, 10/1.1025, #lb | |
1/17.64, 1/0.03527, 1/35.27, 1/16, 1, 0.02957, 29.57, .02957, 10/17.64, #oz | |
1/.5, 1000, 1, 1/.4536, 1/0.02957, 1, 1000, 1, 10/.5, #ltr | |
1/500, 1, .001, 1/453.6, 1/29.57, .001, 1, .001, 10/500, #ml | |
1/.5, 1000, 1, 1/.4536, 1/0.02957, 1, 1000, 1, 10/.5, #l | |
1/10, 500/10, .5/10, 1.1025/10, 17.64/10, .5/10, 500/10, .5/10, 1), # tael | |
ncol=9, dimnames=list(names, names)) | |
} else { | |
conversion_matrix <- matrix(c( | |
#"catty" "g" "kg" "lb" "oz" "ltr" "ml" "l" "tael" | |
1, 600, .6, 1.323, 21.168, .6 , 600, .6, 16, #catty | |
1/600, 1, .001, 0.002205, 0.03527, .001, 1, .001, 16/600, #g | |
1/.6, 1000, 1, 2.205, 35.27, 1, 1000, 1, 16/.6, #kg | |
1/1.323, 0, 1/2.205, 1, 16, .4536, 453.6, .4536, 16/1.323, #lb | |
1/21.168, 1/0.03527, 1/35.27, 1/16, 1, 0.02957, 29.57, .02957, 16/21.168, #oz | |
1/.6, 1000, 1, 1/.4536, 1/0.02957, 1, 1000, 1, 16/.6, #ltr | |
1/600, 1, .001, 1/453.6, 1/29.57, .001, 1, .001, 16/600, #ml | |
1/.6, 1000, 1, 1/.4536, 1/0.02957, 1, 1000, 1, 16/.6, #l | |
1/16, 600/16, .6/16, 1.323/16, 21.168/16, .6/16, 600/16, .6/16, 1), # tael | |
ncol=9, dimnames=list(names, names)) | |
} | |
quant.change <- quantity.mode / quantity | |
size.change <- size.mode / size | |
# unit change is a bit more complicated: | |
col <- which(names == unit.mode) | |
rows <- sapply(unit, FUN=function(x) which(names == x)) | |
unit.change <- sapply(rows, FUN=function(x){ | |
change <- conversion_matrix[x, col] | |
if(length(change)==1) return(change) else return(NA) | |
}) | |
if(coerce.unit.only){ | |
proposed.price <- price | |
} else { | |
if(require.all.for.conversion==TRUE){ | |
proposed.price <- price*quant.change*size.change*unit.change | |
} else { | |
quant.change[is.na(quant.change)] <- 1 | |
unit.change[is.na(unit.change)] <- 1 | |
size.change[is.na(size.change)] <- 1 | |
proposed.price <- price*quant.change*size.change*unit.change | |
} | |
} | |
# good idx = those observations for which all units were transformable: | |
good.idx <- which(!is.na(proposed.price)) | |
if(safety.first==TRUE){ | |
# only accept proposed changes if it brings you closer to the most.common mean: | |
mu <- mean(price[paste(quantity, size, unit)==most.common]) | |
# sd <- sd(price[paste(quantity, size, unit)==most.common]) | |
makes.it.better <- abs(mu-price) > abs(mu-proposed.price) | |
makes.it.better <- which(makes.it.better) | |
} else { | |
makes.it.better <- 1:length(price) | |
} | |
makes.it.better <- intersect(good.idx, makes.it.better) | |
if(verbose==TRUE){ | |
print(paste("Altering", sum(price!=proposed.price, na.rm=T), | |
"out of", length(price), "sizes and respective prices.")) | |
} | |
price[makes.it.better] <- proposed.price[makes.it.better] | |
unit.idx <- makes.it.better[which((unit.change!=1)[makes.it.better])] | |
unit[unit.idx] <- unit.mode | |
if(coerce.unit.only){ | |
# units have been changed so size should chnage to reflect this | |
size[unit.idx] <- size[unit.idx]/unit.change[unit.idx] | |
} else { | |
size[makes.it.better] <- size.mode | |
quantity[makes.it.better] <- quantity.mode | |
unit[makes.it.better] <- unit.mode | |
} | |
spec.df$price <- price | |
spec.df$size <- size | |
spec.df$quantity <- quantity | |
spec.df$size_unit <- unit | |
if(only.return.most.common.unit==TRUE){ | |
if(!is.na(unit.mode)){ | |
idx.good.size_unit <- spec.df$size_unit == unit.mode | |
idx.good.size_unit[is.na(idx.good.size_unit)] <- F | |
} else { | |
# idx.good.size_unit <- is.na(spec.df$size_unit) | |
idx.good.size_unit <- rep(TRUE, nrow(spec.df)) | |
} | |
if(verbose==T) print(paste("Removing", sum(!idx.good.size_unit), "obs whose units cannot be coerced.")) | |
spec.df <- spec.df[idx.good.size_unit, ] | |
} | |
# hmm. One to one increases in price due to size changes doesn't represent how the real | |
# world sells stuff... | |
return(spec.df) | |
} | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment