hillarysanders · December 28, 2015 03:59
diff --git a/hackbright_play_with_data b/hackbright_play_with_data
 ######################################################################
 # hackbright stuffs 
 # Premise
 ######################################################################
 # Author: Hillary Sanders
 ######################################################################
 ######################################################################

 # get data here: https://data.premise.com/


 # There are a few main types of objects in R:
 # vectors (e.g. c(1,2,3)), matrices, lists, and dataframes.
 # Dataframes are very common, and are mostly what we're going to
 # be playing with below. A dataframe is like a matrix, in that
 # it has two dimensions, but each column can hold its own type
 # of value: e.g. numeric or character.

 ########################################################
 ########################################################
 # super basic intro to R
 # Two parallel vectors: one numeric, one character.
 x <- c(1, 2, 3)
 y <- c("a", "b", "c")

 # Bind them column-wise into a dataframe; the columns are named x and y.
 df <- data.frame(x = x, y = y)
 print(df)

 # class() reports the type of the whole object and of individual columns.
 class(df)
 class(df[, 1])
 class(df[, 2])
 class(as.character(df[, 2]))

 # you can access values of a dataframe in multiple ways:
 df[1, 1]     # a single cell: row 1, column 1
 df[, 1]      # a whole column, selected by position
 df[, "y"]    # a whole column, selected by name
 df$y         # the same column through the $ shortcut
 df$x[1]      # first element of column x

 # periods "." in R mean little. A period inside a name is just another
 # character; it does not mean you're accessing a member or a function. e.g.:
 i.am.an.object <- seq_len(10)
 print(i.am.an.object)
 sum(i.am.an.object)

 # a function is written like this
 foo <- function(x = 4) {
   # c() coerces x to character so it can sit next to 'bar'; print() shows
   # the vector and invisibly hands it back as the function's value.
   labelled <- c('bar', x)
   print(labelled)
 }
 foo()            # falls back to the default, x = 4
 foo(x = 1000)    # overrides the default

 # plotting in R is fun:
 # 100 very large, nearly transparent discs along y = x^3, one rainbow colour each.
 plot(1:100, (1:100)^3, col=rainbow(100, alpha=.1), pch=19, cex=50, xlab='', ylab='')
 # Overlay the mirrored curve with discs that shrink from cex 50 down to 1.
 # `length.out` is written in full instead of relying on partial matching of
 # `length`. NOTE(review): only 80 sizes are supplied for 100 points, so R
 # recycles the sizes for the last 20 points -- confirm this is intended.
 points(1:100, (100:1)^3, col=rainbow(100, alpha=.05), pch=19,
        cex=seq(50, 1, length.out=80))
 # Caption placed at x = 50, y = 75^3; the label is passed by name.
 text(x=50, y=75^3, labels='This can get addictive', cex=3)
 ########################################################
 ########################################################

 ################################
 # Size quantity normalization:

 # NOTE(review): setwd() ties every relative path below to one machine-specific
 # directory. It is kept because the source()/read.csv() paths depend on it.
 setwd('~/Desktop/PREMISE/')
 # Presumably defines the project helper functions used further down
 # (plot.effect2, cluster.metadata, ...) -- TODO confirm.
 source('Hillary_Premise/utils/env.R')

 # read in some data:
 br <- read.csv('Hillary_Premise/data/br_raw.csv')
 # Keep only the 'Cream' rows. which() discards positions where spec_name is
 # NA; a bare logical index would turn each NA into an all-NA row.
 spec.df <- br[which(br$spec_name == 'Cream'), ]


 # what do the prices look like?
 # Histogram of the raw prices (50 requested breaks, translucent fill),
 # titled with the spec name taken from the first row.
 hist(
   spec.df$price,
   main = spec.df$spec_name[1], cex.main = 4,
   xlab = '',
   breaks = 50,
   col = '#3090aa40'
 )

 # Dashed red vertical marker at price = 6: observations to the right of it
 # are likely outliers and are dealt with later.
 # v = x position of a vertical line, lty = line type (2 = dashed),
 # lwd = line thickness, col = colour.
 abline(v = 6, lty = 2, lwd = 2, col = 'red')

 # inspect some metadata:
 # Frequency of each distinct value in the three packaging columns,
 # tabulated on their character representation.
 table(as.character(spec.df$size))
 table(as.character(spec.df$size_unit))
 table(as.character(spec.df$quantity))

 # Glue size and unit into one label per row (e.g. "<size> <unit>"),
 # store it on the dataframe, and count the combinations.
 size_and_unit <- paste(spec.df$size, spec.df$size_unit, sep = ' ')
 spec.df$size_and_unit <- size_and_unit
 table(size_and_unit)

 # Bar chart of those counts, smallest to largest.
 barplot(
   sort(table(size_and_unit), decreasing = FALSE),
   main = spec.df$spec_name[1], cex.main = 4,
   cex.names = .6,
   col = rainbow(10, s = .5, v = .8)
 )

 # this is a function that I made, might put it up on a gist if people want to check it out.
 # plot.effect2() and cluster.metadata() are not defined in this file;
 # presumably they come from the env.R sourced earlier -- TODO confirm.
 # The value returned for the size/unit variable is kept in `sizes`.
 sizes <- plot.effect2(spec.df, varname='size_and_unit')

 # placename inspection
 # spec.df is replaced by cluster.metadata()'s result before plotting, so the
 # order of these two calls matters.
 spec.df <- cluster.metadata(spec.df, var='placename')
 places <- plot.effect2(spec.df, varname='placename')

 # product name inspection
 # Same two steps for product_name; cex.labels=.8 is passed through to
 # plot.effect2 (presumably a label size -- confirm against its definition).
 spec.df <- cluster.metadata(spec.df, var='product_name')
 products <- plot.effect2(spec.df, varname='product_name', cex.labels=.8)


 # unit.standardizer() is a project helper (not defined in this file); its
 # result is kept separately so the raw spec.df can be compared against it.
 # TRUE is spelled out: T is an ordinary variable that can be reassigned.
 spec.df.standardized <- unit.standardizer(spec.df, verbose=TRUE)
 table(spec.df.standardized$quantity)
 table(spec.df.standardized$size_unit)

 # Price distribution before standardization (bi-modal) ...
 hist(spec.df$price, breaks=50, col='#4090aa')
 # ... and after (unimodal - better!)
 hist(spec.df.standardized$price, breaks=50, col='#4090aa')

 # was standard deviation decreased? Compare the two printed values.
 print(sd(spec.df$price))
 print(sd(spec.df.standardized$price))
 # yes! That's good.

 ################################################################
 ################################################################
 # outlier detection:
 # a simple way to clean out crazay observations

 # trimmed.normal() is a project helper (not defined in this file); its result
 # is treated below as the cleaned set of observations.
 spec.df.clean <- trimmed.normal(spec.df.standardized)
 hist(spec.df.clean$price, breaks=50, col='#40aa6080', xlab='price', main=spec.df$spec_name[1])

 # make and plot the time series:
 # FALSE is spelled out: F is an ordinary variable that can be reassigned.
 # NOTE(review): `ts` and `plot.ts` are also names of functions in the stats
 # package; presumably env.R supplies its own plot.ts taking these
 # arguments -- confirm.
 ts <- get.simple.ts(spec.df.clean, demean=FALSE, obs.window.median=10)
 plot.ts(ts, spec.df.clean, ylim=c(1,3))

 # Now plot it with the median prices plotted instead: the price column is
 # overwritten with get.window.median()'s output (n=10) before re-plotting.
 spec.df.clean$price <- get.window.median(spec.df.clean$price, n=10)
 plot.ts(ts, spec.df.clean, ylim=c(1,3))


 ################################################################
 ################################################################
 # here's an example of what you can eventually build up to:
 # a food staples index for an entire country.
 uuids <- read.csv('~/Desktop/uuids.csv')
 # get.offline.cpi() is a project helper (not defined in this file); the
 # tuning values are passed through unchanged. TRUE/FALSE are spelled out
 # because T and F are ordinary variables that can be reassigned.
 inflation <- get.offline.cpi(all.data=br, max=2, min=.5, uuids=uuids,
                              min.obs.smooth=100, carry.forward=FALSE,
                              clean=TRUE, trim1=.1, trim2=.05, z1=5, z2=4,
                              obs.window.median=3,
                              country='br')
 # Presumably copies the elements of `inflation` into the global environment
 # so the plotting calls below can find them -- TODO confirm.
 objects.to.global.env(inflation)
 plot.offline.cpi(plot.elements=TRUE, zoom=.2)
 plot.offline.cpi(plot.elements=TRUE, zoom=4)
 # Dashed black horizontal reference line at y = 100 on the last plot.
 abline(h=100, col='black', lty=2)
 ################################################################
 ################################################################


 ################################################################
 ################################################################
 # premise places - kmeans algorithm

 usa.places <- read.csv('~/Desktop/premise_places/us_places_extra_refined3.csv')
 head(usa.places)
 # Keep the rows whose center1 equals '1'. which() discards positions where
 # center1 is NA; a bare logical index would turn each NA into an all-NA row.
 foobar <- usa.places[which(usa.places$center1 == '1'), ]
 hist(foobar$loc_accuracy, breaks=90, col='cornflowerblue')

 # cluster.by.geo() is a project helper (not defined in this file); the
 # tuning values are passed through unchanged. TRUE/FALSE are spelled out
 # because T and F are ordinary variables that can be reassigned.
 geo <- cluster.by.geo(foobar, guess.k.by.placenames=FALSE, placename.tune=TRUE,
                       mainplaces.proportion=.5,
                       get.k.by='user', zoom=.1, loc.accuracy.cutoff=1500)
 ################################################################
 ################################################################
	######################################################################
	# hackbright stuffs
	# Premise
	######################################################################
	# Author: Hillary Sanders
	######################################################################
	######################################################################

	# get data here: https://data.premise.com/


	# There are a few main types of objects in R:
	# vectors (e.g. c(1,2,3)), matrices, lists, and dataframes.
	# Dataframes are very common, and are mostly what we're going to
	# be playing with below. A dataframe is like a matrix, in that
	# it has two dimensions, but each column can hold its own type
	# of value: e.g. numeric or character.

	########################################################
	########################################################
	# super basic intro to R
	# Two parallel vectors: one numeric, one character.
	x <- c(1, 2, 3)
	y <- c("a", "b", "c")

	# Bind them column-wise into a dataframe; the columns are named x and y.
	df <- data.frame(x = x, y = y)
	print(df)

	# class() reports the type of the whole object and of individual columns.
	class(df)
	class(df[, 1])
	class(df[, 2])
	class(as.character(df[, 2]))

	# you can access values of a dataframe in multiple ways:
	df[1, 1]     # a single cell: row 1, column 1
	df[, 1]      # a whole column, selected by position
	df[, "y"]    # a whole column, selected by name
	df$y         # the same column through the $ shortcut
	df$x[1]      # first element of column x

	# periods "." in R mean little. A period inside a name is just another
	# character; it does not mean you're accessing a member or a function. e.g.:
	i.am.an.object <- seq_len(10)
	print(i.am.an.object)
	sum(i.am.an.object)

	# a function is written like this
	foo <- function(x = 4) {
	  # c() coerces x to character so it can sit next to 'bar'; print() shows
	  # the vector and invisibly hands it back as the function's value.
	  labelled <- c('bar', x)
	  print(labelled)
	}
	foo()            # falls back to the default, x = 4
	foo(x = 1000)    # overrides the default

	# plotting in R is fun:
	# 100 very large, nearly transparent discs along y = x^3, one rainbow colour each.
	plot(1:100, (1:100)^3, col=rainbow(100, alpha=.1), pch=19, cex=50, xlab='', ylab='')
	# Overlay the mirrored curve with discs that shrink from cex 50 down to 1.
	# `length.out` is written in full instead of relying on partial matching of
	# `length`. NOTE(review): only 80 sizes are supplied for 100 points, so R
	# recycles the sizes for the last 20 points -- confirm this is intended.
	points(1:100, (100:1)^3, col=rainbow(100, alpha=.05), pch=19,
	       cex=seq(50, 1, length.out=80))
	# Caption placed at x = 50, y = 75^3; the label is passed by name.
	text(x=50, y=75^3, labels='This can get addictive', cex=3)
	########################################################
	########################################################

	################################
	# Size quantity normalization:

	# NOTE(review): setwd() ties every relative path below to one machine-specific
	# directory. It is kept because the source()/read.csv() paths depend on it.
	setwd('~/Desktop/PREMISE/')
	# Presumably defines the project helper functions used further down
	# (plot.effect2, cluster.metadata, ...) -- TODO confirm.
	source('Hillary_Premise/utils/env.R')

	# read in some data:
	br <- read.csv('Hillary_Premise/data/br_raw.csv')
	# Keep only the 'Cream' rows. which() discards positions where spec_name is
	# NA; a bare logical index would turn each NA into an all-NA row.
	spec.df <- br[which(br$spec_name == 'Cream'), ]


	# what do the prices look like?
	# Histogram of the raw prices (50 requested breaks, translucent fill),
	# titled with the spec name taken from the first row.
	hist(
	  spec.df$price,
	  main = spec.df$spec_name[1], cex.main = 4,
	  xlab = '',
	  breaks = 50,
	  col = '#3090aa40'
	)

	# Dashed red vertical marker at price = 6: observations to the right of it
	# are likely outliers and are dealt with later.
	# v = x position of a vertical line, lty = line type (2 = dashed),
	# lwd = line thickness, col = colour.
	abline(v = 6, lty = 2, lwd = 2, col = 'red')

	# inspect some metadata:
	# Frequency of each distinct value in the three packaging columns,
	# tabulated on their character representation.
	table(as.character(spec.df$size))
	table(as.character(spec.df$size_unit))
	table(as.character(spec.df$quantity))

	# Glue size and unit into one label per row (e.g. "<size> <unit>"),
	# store it on the dataframe, and count the combinations.
	size_and_unit <- paste(spec.df$size, spec.df$size_unit, sep = ' ')
	spec.df$size_and_unit <- size_and_unit
	table(size_and_unit)

	# Bar chart of those counts, smallest to largest.
	barplot(
	  sort(table(size_and_unit), decreasing = FALSE),
	  main = spec.df$spec_name[1], cex.main = 4,
	  cex.names = .6,
	  col = rainbow(10, s = .5, v = .8)
	)

	# this is a function that I made, might put it up on a gist if people want to check it out.
	# plot.effect2() and cluster.metadata() are not defined in this file;
	# presumably they come from the env.R sourced earlier -- TODO confirm.
	# The value returned for the size/unit variable is kept in `sizes`.
	sizes <- plot.effect2(spec.df, varname='size_and_unit')

	# placename inspection
	# spec.df is replaced by cluster.metadata()'s result before plotting, so the
	# order of these two calls matters.
	spec.df <- cluster.metadata(spec.df, var='placename')
	places <- plot.effect2(spec.df, varname='placename')

	# product name inspection
	# Same two steps for product_name; cex.labels=.8 is passed through to
	# plot.effect2 (presumably a label size -- confirm against its definition).
	spec.df <- cluster.metadata(spec.df, var='product_name')
	products <- plot.effect2(spec.df, varname='product_name', cex.labels=.8)


	# unit.standardizer() is a project helper (not defined in this file); its
	# result is kept separately so the raw spec.df can be compared against it.
	# TRUE is spelled out: T is an ordinary variable that can be reassigned.
	spec.df.standardized <- unit.standardizer(spec.df, verbose=TRUE)
	table(spec.df.standardized$quantity)
	table(spec.df.standardized$size_unit)

	# Price distribution before standardization (bi-modal) ...
	hist(spec.df$price, breaks=50, col='#4090aa')
	# ... and after (unimodal - better!)
	hist(spec.df.standardized$price, breaks=50, col='#4090aa')

	# was standard deviation decreased? Compare the two printed values.
	print(sd(spec.df$price))
	print(sd(spec.df.standardized$price))
	# yes! That's good.

	################################################################
	################################################################
	# outlier detection:
	# a simple way to clean out crazay observations

	# trimmed.normal() is a project helper (not defined in this file); its result
	# is treated below as the cleaned set of observations.
	spec.df.clean <- trimmed.normal(spec.df.standardized)
	hist(spec.df.clean$price, breaks=50, col='#40aa6080', xlab='price', main=spec.df$spec_name[1])

	# make and plot the time series:
	# FALSE is spelled out: F is an ordinary variable that can be reassigned.
	# NOTE(review): `ts` and `plot.ts` are also names of functions in the stats
	# package; presumably env.R supplies its own plot.ts taking these
	# arguments -- confirm.
	ts <- get.simple.ts(spec.df.clean, demean=FALSE, obs.window.median=10)
	plot.ts(ts, spec.df.clean, ylim=c(1,3))

	# Now plot it with the median prices plotted instead: the price column is
	# overwritten with get.window.median()'s output (n=10) before re-plotting.
	spec.df.clean$price <- get.window.median(spec.df.clean$price, n=10)
	plot.ts(ts, spec.df.clean, ylim=c(1,3))


	################################################################
	################################################################
	# here's an example of what you can eventually build up to:
	# a food staples index for an entire country.
	uuids <- read.csv('~/Desktop/uuids.csv')
	# get.offline.cpi() is a project helper (not defined in this file); the
	# tuning values are passed through unchanged. TRUE/FALSE are spelled out
	# because T and F are ordinary variables that can be reassigned.
	inflation <- get.offline.cpi(all.data=br, max=2, min=.5, uuids=uuids,
	                             min.obs.smooth=100, carry.forward=FALSE,
	                             clean=TRUE, trim1=.1, trim2=.05, z1=5, z2=4,
	                             obs.window.median=3,
	                             country='br')
	# Presumably copies the elements of `inflation` into the global environment
	# so the plotting calls below can find them -- TODO confirm.
	objects.to.global.env(inflation)
	plot.offline.cpi(plot.elements=TRUE, zoom=.2)
	plot.offline.cpi(plot.elements=TRUE, zoom=4)
	# Dashed black horizontal reference line at y = 100 on the last plot.
	abline(h=100, col='black', lty=2)
	################################################################
	################################################################


	################################################################
	################################################################
	# premise places - kmeans algorithm

	usa.places <- read.csv('~/Desktop/premise_places/us_places_extra_refined3.csv')
	head(usa.places)
	# Keep the rows whose center1 equals '1'. which() discards positions where
	# center1 is NA; a bare logical index would turn each NA into an all-NA row.
	foobar <- usa.places[which(usa.places$center1 == '1'), ]
	hist(foobar$loc_accuracy, breaks=90, col='cornflowerblue')

	# cluster.by.geo() is a project helper (not defined in this file); the
	# tuning values are passed through unchanged. TRUE/FALSE are spelled out
	# because T and F are ordinary variables that can be reassigned.
	geo <- cluster.by.geo(foobar, guess.k.by.placenames=FALSE, placename.tune=TRUE,
	                      mainplaces.proportion=.5,
	                      get.k.by='user', zoom=.1, loc.accuracy.cutoff=1500)
	################################################################
	################################################################