hillarysanders · December 28, 2015 03:59
diff --git a/hackbright_example_functions b/hackbright_example_functions




 ######################
 # outliers:
 # supah simple outlier killer. Also kills right tails.
 ##' @param trim trim what % of the data off each tail before calcuating the mean and sd.
 ##' @param z how many standard deviations will you allow observations to be away from the 
 ##' mean before you consider it an outlier?
 ##' @param verbose do you like talkative functions?
 ##' @param flag.only logical (TRUE or FALSE).
 ##' Only flag the outliers as outliers, don't remove them.
 trimmed.normal <- function(spec.df, trim=.05, z=4, verbose=TRUE,
                           flag.only=F){
  
  price <- spec.df$price
  # get quantiles:
  q <- quantile(price, probs=c(trim, 1-trim))
  
  # get trimmed mean and standard deviation:
  m <- mean(subset(price, subset=(price > q[1] & price < q[2])))
  std.dev <- sd(subset(price, subset=(price >= q[1] & price <= q[2])))
  
  
  
  if(std.dev>0){
    remove.idx <- price > m+z*std.dev | price < m-z*std.dev
    if(verbose==TRUE){
      print(paste("removed", sum(remove.idx), "out of", nrow(spec.df), "observations",
                  "from", spec.df$spec_name[nrow(spec.df)], ",", toupper(spec.df$country[1])))
    }
    if(flag.only==F){
      spec.df <- spec.df[!remove.idx, ]
    } else {
      spec.df$outlier <- remove.idx
    }
  }
  
  return(spec.df)
 }









 library(stringr) # could not use this if neccessary, but it is handy
 ##' unit.standardizer
 ##' not be obvious as to why unless the size units were being prominently shown.
 ##' @param spec.df price observation data.frame
 ##' @param verbose feeling talkative?
 ##' @param safety.first Only modify prices by size if the change brings the price
 ##' closer to the mean price?
 ##' @param coerce.unit.only 
 ##' @param prep.only prep the size metadata, but do nothing else? Logical.
 ##' @param require.all.three only try to coerce to something if the most common types
 ##' of size, size_unit and quantity are non-NA. logical
 ##' @param require.all.for.conversion for each observation, only coerce if size,
 ##' size_unit, and quantity are non-NA.
 ##' @param country Only matters for China; they treat define certain units differently. 
 ##' If china, let country = 'cn'.
 unit.standardizer <- function(spec.df, verbose=FALSE, safety.first=FALSE, 
                              coerce.unit.only=F, allow.crappy.common.units=TRUE,
                              require.all.three=F, require.all.for.conversion=FALSE,
                              only.return.most.common.unit=F, prep.only=FALSE,
                              country="notchina"){
  
  # so that stuff can be easily modified:
  size <- as.character(spec.df$size)
  unit <- as.character(spec.df$size_unit)
  quantity <- as.character(spec.df$quantity)
  price <- spec.df$price
  
  spec.df$price.raw <- spec.df$price
  
  # translate chinese
  unit = sub("克", "g", unit)
  unit = sub("公斤", "kg", unit)
  unit = sub("毫升", "ml", unit)
  unit = sub("升", "ltr", unit)
  unit = sub("盎司", "oz", unit)
  unit = sub("磅", "lb", unit)
  unit = sub("斤", "catty", unit)
  unit = sub("两", "tael", unit)
  unit = sub("盒", "box", unit)
  unit = sub("个", "pcs.", unit)
  unit = sub("袋", "pack",   unit)
  
  size[size=="0"] <- NA
  unit[unit=="0"] <- NA
  quantity[quantity=="0"] <- NA
  
  size[size==""] <- NA
  unit[unit==""] <- NA
  quantity[quantity==""] <- NA
  
  size[size=="NULL"] <- NA
  unit[unit=="NULL"] <- NA
  quantity[quantity=="NULL"] <- NA
  
  unit[grepl(unit, pattern="[/?]+")] <- NA
  quantity[grepl(quantity, pattern="[/?]+")] <- NA
  size[grepl(size, pattern="[/?]+")] <- NA
  
  # throw away any character fluff (may want to conserve and compare instead in the future)
  size <- gsub("[a-z]+", "", x=size)
  size <- gsub(" ", "", x=size)
  size <- gsub(",", ".", x=size)
  size <- as.numeric(size)
  size[size==0] <- NA
  
  quantity <- gsub("[a-z]+", "", x=quantity)
  quantity <- gsub(" ", "", x=quantity)
  quantity <- as.numeric(quantity)
  quantity[quantity==0] <- NA
  # now that they are cleaner, replace spec.df size and quantity w/ these
  spec.df$size <- size
  spec.df$quantity <- quantity
  spec.df$size_unit <- unit
  
  if(prep.only==TRUE) return(spec.df)
  
  if(allow.crappy.common.units==F){
    unit.idx <- ! unit %in% c("box", "pcs.", "pack") 
  } else { unit.idx <- 1:length(unit) }
  
  most.common <- names(sort(table((paste(quantity, size, unit[unit.idx])), useNA='ifany'), decreasing=TRUE))[1]
  most.common. <- str_split(most.common, " ")[[1]]
  
  most.common.[most.common.=="NA"] <- NA
  
  quantity.mode <- as.numeric(most.common.[1])
  size.mode <- as.numeric(most.common.[2])
  unit.mode <- most.common.[3]

  # only work with workable data, yo.
  if(require.all.three==TRUE){
    if(any(is.na(c(size.mode, unit.mode, quantity.mode)))) return(spec.df)
  }
  
  
  # NOTE THAT THESE MOST COMMON VALUES ARE POTENTIALLY UNSTABLE. 
  # FIXED IN PRODUCTIONIZED VERSION.
  
  names <- c("catty", "g", "kg", "lb", "oz", "ltr", "ml", "l", "tael")  
  # if COUNTRY = CHINA, then catty is 500 g, not 600, and a tael is 10 catties, not 16.
  if(country=="cn"){
    conversion_matrix <- matrix(
      #"catty"       "g"         "kg"     "lb"          "oz"     "ltr"     "ml"     "l"     "tael"
      c(1,         500,       .5,       1.1025,      17.64,    .5 ,      500,     .5,           10, #catty
        1/500,     1,         .001,     0.002205,   0.03527,   .001,     1,       .001,     10/500,  #g
        1/.5,      1000,      1,        2.205,      35.27,     1,        1000,    1,         10/.5,  #kg
        1/1.1025,   0,         1/2.205,  1,          16,        .4536,    453.6,   .4536,   10/1.1025,  #lb
        1/17.64,  1/0.03527, 1/35.27,  1/16,       1,         0.02957,  29.57,   .02957, 10/17.64,  #oz
        1/.5,      1000,      1,        1/.4536,    1/0.02957, 1,        1000,    1,         10/.5,  #ltr
        1/500,     1,         .001,     1/453.6,    1/29.57,   .001,     1,       .001,     10/500,  #ml
        1/.5,      1000,      1,        1/.4536,    1/0.02957, 1,        1000,    1,          10/.5,  #l
        1/10,      500/10,    .5/10,    1.1025/10,   17.64/10, .5/10,    500/10,  .5/10,        1), # tael
      ncol=9, dimnames=list(names, names))
  } else {
    conversion_matrix  <- matrix(c(
      #"catty"       "g"         "kg"     "lb"          "oz"     "ltr"     "ml"     "l"     "tael"
      1,         600,       .6,       1.323,      21.168,    .6 ,      600,     .6,           16, #catty
      1/600,     1,         .001,     0.002205,   0.03527,   .001,     1,       .001,     16/600,  #g
      1/.6,      1000,      1,        2.205,      35.27,     1,        1000,    1,         16/.6,  #kg
      1/1.323,   0,         1/2.205,  1,          16,        .4536,    453.6,   .4536,   16/1.323,  #lb
      1/21.168,  1/0.03527, 1/35.27,  1/16,       1,         0.02957,  29.57,   .02957, 16/21.168,  #oz
      1/.6,      1000,      1,        1/.4536,    1/0.02957, 1,        1000,    1,         16/.6,  #ltr
      1/600,     1,         .001,     1/453.6,    1/29.57,   .001,     1,       .001,     16/600,  #ml
      1/.6,      1000,      1,        1/.4536,    1/0.02957, 1,        1000,    1,          16/.6,  #l
      1/16,      600/16,    .6/16,    1.323/16,   21.168/16, .6/16,    600/16,  .6/16,        1), # tael
                                 ncol=9, dimnames=list(names, names))
  }
  
  
  quant.change <- quantity.mode / quantity
  size.change <- size.mode / size
  # unit change is a bit more complicated:
  col <- which(names == unit.mode)
  rows <- sapply(unit, FUN=function(x) which(names == x))
  
  unit.change <- sapply(rows, FUN=function(x){
    change <- conversion_matrix[x, col]
    if(length(change)==1) return(change) else return(NA)
  })
  
  
  if(coerce.unit.only){
    proposed.price  <- price
  } else {
    if(require.all.for.conversion==TRUE){
      proposed.price <- price*quant.change*size.change*unit.change
    } else {
      quant.change[is.na(quant.change)] <- 1
      unit.change[is.na(unit.change)] <- 1
      size.change[is.na(size.change)] <- 1
      proposed.price <- price*quant.change*size.change*unit.change
    }
  }
  
  # good idx = those observations for which all units were transformable:
  good.idx <- which(!is.na(proposed.price))
  
  
  if(safety.first==TRUE){
    # only accept proposed changes if it brings you closer to the most.common mean:
    mu <- mean(price[paste(quantity, size, unit)==most.common])
    #   sd <- sd(price[paste(quantity, size, unit)==most.common])
    makes.it.better <- abs(mu-price) > abs(mu-proposed.price)
    makes.it.better <- which(makes.it.better)
  } else {
    makes.it.better <- 1:length(price)
  }
  
  makes.it.better <- intersect(good.idx, makes.it.better)
  
  if(verbose==TRUE){
    print(paste("Altering", sum(price!=proposed.price, na.rm=T),
                "out of", length(price), "sizes and respective prices."))
  }
  
  price[makes.it.better] <- proposed.price[makes.it.better]
  
  unit.idx <- makes.it.better[which((unit.change!=1)[makes.it.better])]
  unit[unit.idx] <- unit.mode
  if(coerce.unit.only){
    # units have been changed so size should chnage to reflect this
    size[unit.idx] <- size[unit.idx]/unit.change[unit.idx]
  } else {
    size[makes.it.better] <- size.mode
    quantity[makes.it.better] <- quantity.mode
    unit[makes.it.better] <- unit.mode
  }
  
  spec.df$price <- price
  spec.df$size <- size
  spec.df$quantity <- quantity
  spec.df$size_unit <- unit
  
  if(only.return.most.common.unit==TRUE){
    if(!is.na(unit.mode)){
      idx.good.size_unit <- spec.df$size_unit == unit.mode
      idx.good.size_unit[is.na(idx.good.size_unit)] <- F
    } else {
      #       idx.good.size_unit <- is.na(spec.df$size_unit)
      idx.good.size_unit <- rep(TRUE, nrow(spec.df))
    }
    
    if(verbose==T) print(paste("Removing", sum(!idx.good.size_unit), "obs whose units cannot be coerced."))
    spec.df <- spec.df[idx.good.size_unit, ] 
  }
  
  # hmm. One to one increases in price due to size changes doesn't represent how the real
  # world sells stuff...
  return(spec.df)
 }




	######################
	# outliers:
	# supah simple outlier killer. Also kills right tails.
	##' @param trim trim what % of the data off each tail before calcuating the mean and sd.
	##' @param z how many standard deviations will you allow observations to be away from the
	##' mean before you consider it an outlier?
	##' @param verbose do you like talkative functions?
	##' @param flag.only logical (TRUE or FALSE).
	##' Only flag the outliers as outliers, don't remove them.
	trimmed.normal <- function(spec.df, trim=.05, z=4, verbose=TRUE,
	flag.only=F){

	price <- spec.df$price
	# get quantiles:
	q <- quantile(price, probs=c(trim, 1-trim))

	# get trimmed mean and standard deviation:
	m <- mean(subset(price, subset=(price > q[1] & price < q[2])))
	std.dev <- sd(subset(price, subset=(price >= q[1] & price <= q[2])))



	if(std.dev>0){
	remove.idx <- price > m+zstd.dev \| price < m-zstd.dev
	if(verbose==TRUE){
	print(paste("removed", sum(remove.idx), "out of", nrow(spec.df), "observations",
	"from", spec.df$spec_name[nrow(spec.df)], ",", toupper(spec.df$country[1])))
	}
	if(flag.only==F){
	spec.df <- spec.df[!remove.idx, ]
	} else {
	spec.df$outlier <- remove.idx
	}
	}

	return(spec.df)
	}









	library(stringr) # could not use this if neccessary, but it is handy
	##' unit.standardizer
	##' not be obvious as to why unless the size units were being prominently shown.
	##' @param spec.df price observation data.frame
	##' @param verbose feeling talkative?
	##' @param safety.first Only modify prices by size if the change brings the price
	##' closer to the mean price?
	##' @param coerce.unit.only
	##' @param prep.only prep the size metadata, but do nothing else? Logical.
	##' @param require.all.three only try to coerce to something if the most common types
	##' of size, size_unit and quantity are non-NA. logical
	##' @param require.all.for.conversion for each observation, only coerce if size,
	##' size_unit, and quantity are non-NA.
	##' @param country Only matters for China; they treat define certain units differently.
	##' If china, let country = 'cn'.
	unit.standardizer <- function(spec.df, verbose=FALSE, safety.first=FALSE,
	coerce.unit.only=F, allow.crappy.common.units=TRUE,
	require.all.three=F, require.all.for.conversion=FALSE,
	only.return.most.common.unit=F, prep.only=FALSE,
	country="notchina"){

	# so that stuff can be easily modified:
	size <- as.character(spec.df$size)
	unit <- as.character(spec.df$size_unit)
	quantity <- as.character(spec.df$quantity)
	price <- spec.df$price

	spec.df$price.raw <- spec.df$price

	# translate chinese
	unit = sub("克", "g", unit)
	unit = sub("公斤", "kg", unit)
	unit = sub("毫升", "ml", unit)
	unit = sub("升", "ltr", unit)
	unit = sub("盎司", "oz", unit)
	unit = sub("磅", "lb", unit)
	unit = sub("斤", "catty", unit)
	unit = sub("两", "tael", unit)
	unit = sub("盒", "box", unit)
	unit = sub("个", "pcs.", unit)
	unit = sub("袋", "pack", unit)

	size[size=="0"] <- NA
	unit[unit=="0"] <- NA
	quantity[quantity=="0"] <- NA

	size[size==""] <- NA
	unit[unit==""] <- NA
	quantity[quantity==""] <- NA

	size[size=="NULL"] <- NA
	unit[unit=="NULL"] <- NA
	quantity[quantity=="NULL"] <- NA

	unit[grepl(unit, pattern="[/?]+")] <- NA
	quantity[grepl(quantity, pattern="[/?]+")] <- NA
	size[grepl(size, pattern="[/?]+")] <- NA

	# throw away any character fluff (may want to conserve and compare instead in the future)
	size <- gsub("[a-z]+", "", x=size)
	size <- gsub(" ", "", x=size)
	size <- gsub(",", ".", x=size)
	size <- as.numeric(size)
	size[size==0] <- NA

	quantity <- gsub("[a-z]+", "", x=quantity)
	quantity <- gsub(" ", "", x=quantity)
	quantity <- as.numeric(quantity)
	quantity[quantity==0] <- NA
	# now that they are cleaner, replace spec.df size and quantity w/ these
	spec.df$size <- size
	spec.df$quantity <- quantity
	spec.df$size_unit <- unit

	if(prep.only==TRUE) return(spec.df)

	if(allow.crappy.common.units==F){
	unit.idx <- ! unit %in% c("box", "pcs.", "pack")
	} else { unit.idx <- 1:length(unit) }

	most.common <- names(sort(table((paste(quantity, size, unit[unit.idx])), useNA='ifany'), decreasing=TRUE))[1]
	most.common. <- str_split(most.common, " ")[[1]]

	most.common.[most.common.=="NA"] <- NA

	quantity.mode <- as.numeric(most.common.[1])
	size.mode <- as.numeric(most.common.[2])
	unit.mode <- most.common.[3]

	# only work with workable data, yo.
	if(require.all.three==TRUE){
	if(any(is.na(c(size.mode, unit.mode, quantity.mode)))) return(spec.df)
	}


	# NOTE THAT THESE MOST COMMON VALUES ARE POTENTIALLY UNSTABLE.
	# FIXED IN PRODUCTIONIZED VERSION.

	names <- c("catty", "g", "kg", "lb", "oz", "ltr", "ml", "l", "tael")
	# if COUNTRY = CHINA, then catty is 500 g, not 600, and a tael is 10 catties, not 16.
	if(country=="cn"){
	conversion_matrix <- matrix(
	#"catty" "g" "kg" "lb" "oz" "ltr" "ml" "l" "tael"
	c(1, 500, .5, 1.1025, 17.64, .5 , 500, .5, 10, #catty
	1/500, 1, .001, 0.002205, 0.03527, .001, 1, .001, 10/500, #g
	1/.5, 1000, 1, 2.205, 35.27, 1, 1000, 1, 10/.5, #kg
	1/1.1025, 0, 1/2.205, 1, 16, .4536, 453.6, .4536, 10/1.1025, #lb
	1/17.64, 1/0.03527, 1/35.27, 1/16, 1, 0.02957, 29.57, .02957, 10/17.64, #oz
	1/.5, 1000, 1, 1/.4536, 1/0.02957, 1, 1000, 1, 10/.5, #ltr
	1/500, 1, .001, 1/453.6, 1/29.57, .001, 1, .001, 10/500, #ml
	1/.5, 1000, 1, 1/.4536, 1/0.02957, 1, 1000, 1, 10/.5, #l
	1/10, 500/10, .5/10, 1.1025/10, 17.64/10, .5/10, 500/10, .5/10, 1), # tael
	ncol=9, dimnames=list(names, names))
	} else {
	conversion_matrix <- matrix(c(
	#"catty" "g" "kg" "lb" "oz" "ltr" "ml" "l" "tael"
	1, 600, .6, 1.323, 21.168, .6 , 600, .6, 16, #catty
	1/600, 1, .001, 0.002205, 0.03527, .001, 1, .001, 16/600, #g
	1/.6, 1000, 1, 2.205, 35.27, 1, 1000, 1, 16/.6, #kg
	1/1.323, 0, 1/2.205, 1, 16, .4536, 453.6, .4536, 16/1.323, #lb
	1/21.168, 1/0.03527, 1/35.27, 1/16, 1, 0.02957, 29.57, .02957, 16/21.168, #oz
	1/.6, 1000, 1, 1/.4536, 1/0.02957, 1, 1000, 1, 16/.6, #ltr
	1/600, 1, .001, 1/453.6, 1/29.57, .001, 1, .001, 16/600, #ml
	1/.6, 1000, 1, 1/.4536, 1/0.02957, 1, 1000, 1, 16/.6, #l
	1/16, 600/16, .6/16, 1.323/16, 21.168/16, .6/16, 600/16, .6/16, 1), # tael
	ncol=9, dimnames=list(names, names))
	}


	quant.change <- quantity.mode / quantity
	size.change <- size.mode / size
	# unit change is a bit more complicated:
	col <- which(names == unit.mode)
	rows <- sapply(unit, FUN=function(x) which(names == x))

	unit.change <- sapply(rows, FUN=function(x){
	change <- conversion_matrix[x, col]
	if(length(change)==1) return(change) else return(NA)
	})


	if(coerce.unit.only){
	proposed.price <- price
	} else {
	if(require.all.for.conversion==TRUE){
	proposed.price <- pricequant.changesize.change*unit.change
	} else {
	quant.change[is.na(quant.change)] <- 1
	unit.change[is.na(unit.change)] <- 1
	size.change[is.na(size.change)] <- 1
	proposed.price <- pricequant.changesize.change*unit.change
	}
	}

	# good idx = those observations for which all units were transformable:
	good.idx <- which(!is.na(proposed.price))


	if(safety.first==TRUE){
	# only accept proposed changes if it brings you closer to the most.common mean:
	mu <- mean(price[paste(quantity, size, unit)==most.common])
	# sd <- sd(price[paste(quantity, size, unit)==most.common])
	makes.it.better <- abs(mu-price) > abs(mu-proposed.price)
	makes.it.better <- which(makes.it.better)
	} else {
	makes.it.better <- 1:length(price)
	}

	makes.it.better <- intersect(good.idx, makes.it.better)

	if(verbose==TRUE){
	print(paste("Altering", sum(price!=proposed.price, na.rm=T),
	"out of", length(price), "sizes and respective prices."))
	}

	price[makes.it.better] <- proposed.price[makes.it.better]

	unit.idx <- makes.it.better[which((unit.change!=1)[makes.it.better])]
	unit[unit.idx] <- unit.mode
	if(coerce.unit.only){
	# units have been changed so size should chnage to reflect this
	size[unit.idx] <- size[unit.idx]/unit.change[unit.idx]
	} else {
	size[makes.it.better] <- size.mode
	quantity[makes.it.better] <- quantity.mode
	unit[makes.it.better] <- unit.mode
	}

	spec.df$price <- price
	spec.df$size <- size
	spec.df$quantity <- quantity
	spec.df$size_unit <- unit

	if(only.return.most.common.unit==TRUE){
	if(!is.na(unit.mode)){
	idx.good.size_unit <- spec.df$size_unit == unit.mode
	idx.good.size_unit[is.na(idx.good.size_unit)] <- F
	} else {
	# idx.good.size_unit <- is.na(spec.df$size_unit)
	idx.good.size_unit <- rep(TRUE, nrow(spec.df))
	}

	if(verbose==T) print(paste("Removing", sum(!idx.good.size_unit), "obs whose units cannot be coerced."))
	spec.df <- spec.df[idx.good.size_unit, ]
	}

	# hmm. One to one increases in price due to size changes doesn't represent how the real
	# world sells stuff...
	return(spec.df)
	}