Last active
December 28, 2015 03:59
-
-
Save hillarysanders/7439589 to your computer and use it in GitHub Desktop.
hackbright - play with some data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
###################################################################### | |
# hackbright stuffs | |
# Premise | |
###################################################################### | |
# Author: Hillary Sanders | |
###################################################################### | |
###################################################################### | |
# get data here: https://data.premise.com/ | |
# There are a few main types of objects in R: | |
# vectors (e.g. c(1,2,3)), matrices, lists, and dataframes. | |
# dataframes are very common, and are mostly what we're going to
# be playing with below. A dataframe is like a matrix, in that
# it has two dimensions, but each column can be a specific type
# of value: e.g. numeric or character. | |
######################################################## | |
######################################################## | |
# super basic intro to R | |
# Two parallel vectors of equal length: one numeric, one character.
x <- c(1, 2, 3)
y <- c('a', 'b', 'c')

# Bind them column-wise into a dataframe; each column keeps its own type.
df <- data.frame(x, y)
print(df)

# Inspect the class of the whole object and of each column.
class(df)
class(df[, 1])
# NOTE: since R 4.0 data.frame() keeps strings as character; on older R
# versions this column is a factor, which is why the explicit
# as.character() conversion below is shown.
class(df[, 2])
class(as.character(df[, 2]))

# you can access values of a dataframe in multiple ways:
df[1, 1]     # a single cell: row 1, column 1
df[, 1]      # the whole first column, selected by position
df[, 'y']    # a whole column, selected by name
df$y         # the same column via `$`
df$x[1]      # first element of column x

# A period "." inside a name has no special meaning in R: it is just another
# character of the identifier, not member or method access. e.g.:
i.am.an.object <- 1:10
print(i.am.an.object)
sum(i.am.an.object)
# a function is written like this | |
# `foo` prints the vector c('bar', x) and, because print() hands its argument
# back invisibly, also returns it.
#   x: value appended after 'bar'; defaults to 4 when the caller omits it.
#      c() coerces a numeric `x` to character so both elements share a type.
foo <- function(x = 4) {
  print(c('bar', x))
}
foo()          # default argument:  "bar" "4"
foo(x = 1000)  # explicit argument: "bar" "1000"
# plotting in R is fun: | |
# 100 very large, nearly transparent points along a cubic curve; axis labels
# are blanked out.
plot(1:100, (1:100)^3, col = rainbow(100, alpha = .1), pch = 19, cex = 50,
     xlab = '', ylab = '')
# Overlay the mirrored curve with shrinking point sizes.
# NOTE(review): `cex` holds only 80 sizes for 100 points, so it is recycled and
# the last 20 points start again from the largest size -- confirm this is the
# intended look before changing it to 100.
points(1:100, (100:1)^3, col = rainbow(100, alpha = .05), pch = 19,
       cex = seq(50, 1, length.out = 80))
# Label placed at x = 50, y = 75^3 in data coordinates.
text(x = 50, y = 75^3, labels = 'This can get addictive', cex = 3)
######################################################## | |
######################################################## | |
################################ | |
# Size quantity normalization: | |
# NOTE(review): machine-specific working directory; the relative paths below
# are resolved against it, so the script only runs where this folder exists.
setwd('~/Desktop/PREMISE/')
# env.R is not visible here; presumably it defines the project helpers used
# further down in this script -- confirm.
source('Hillary_Premise/utils/env.R')

# read in some data:
br <- read.csv('Hillary_Premise/data/br_raw.csv')
# Keep only the 'Cream' observations. which() turns the logical mask into row
# indices so that rows with a missing spec_name are dropped instead of coming
# back as all-NA rows.
spec.df <- br[which(br$spec_name == 'Cream'), ]

# what do the prices look like?
hist(spec.df$price, breaks = 50, col = '#3090aa40', xlab = '',
     main = spec.df$spec_name[1], cex.main = 4)
# everything past this red line is likely an outlier, we'll deal with this later.
# lwd = line thickness, lty = line type (2 = dashed), v := 'vertical', col = color.
abline(v = 6, col = 'red', lty = 2, lwd = 2)

# inspect some metadata (as.character() so values are tabulated as labels):
table(as.character(spec.df$size))
table(as.character(spec.df$size_unit))
table(as.character(spec.df$quantity))

# paste size and size_unit together, and analyze that:
size_and_unit <- paste(spec.df$size, spec.df$size_unit)
table(size_and_unit)
spec.df$size_and_unit <- size_and_unit

# visualize this: bars sorted by count, ascending.
barplot(sort(table(size_and_unit)),
        cex.names = .6, col = rainbow(10, s = .5, v = .8),
        main = spec.df$spec_name[1], cex.main = 4)
# this is a function that I made, might put it up on a gist if people want to check it out. | |
# plot.effect2(), cluster.metadata() and unit.standardizer() are not defined in
# this file; presumably they come from the sourced env.R -- confirm.
sizes <- plot.effect2(spec.df, varname = 'size_and_unit')

# placename inspection
spec.df <- cluster.metadata(spec.df, var = 'placename')
places <- plot.effect2(spec.df, varname = 'placename')

# product name inspection
spec.df <- cluster.metadata(spec.df, var = 'product_name')
products <- plot.effect2(spec.df, varname = 'product_name', cex.labels = .8)

# Standardize units; TRUE is spelled out because `T` is an ordinary variable
# that can be reassigned.
spec.df.standardized <- unit.standardizer(spec.df, verbose = TRUE)
table(spec.df.standardized$quantity)
table(spec.df.standardized$size_unit)

# bi-modal
hist(spec.df$price, breaks = 50, col = '#4090aa')
# unimodal - better!
hist(spec.df.standardized$price, breaks = 50, col = '#4090aa')

# was standard deviation decreased?
# NOTE(review): sd() returns NA if any price is NA -- confirm prices are complete.
print(sd(spec.df$price))
print(sd(spec.df.standardized$price))
# yes! That's good.
################################################################ | |
################################################################ | |
# outlier detection: | |
# a simple way to clean out crazay observations | |
# trimmed.normal(), get.simple.ts(), get.window.median() and the plot.ts() used
# here are not defined in this file; presumably they come from the sourced
# env.R -- confirm.
spec.df.clean <- trimmed.normal(spec.df.standardized)
hist(spec.df.clean$price, breaks = 50, col = '#40aa6080', xlab = 'price',
     main = spec.df$spec_name[1])

# make and plot the time series:
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
ts <- get.simple.ts(spec.df.clean, demean = FALSE, obs.window.median = 10)
plot.ts(ts, spec.df.clean, ylim = c(1, 3))

# Now plot it with the median prices plotted instead.
# This overwrites the price column in place; `ts` itself is not recomputed.
spec.df.clean$price <- get.window.median(spec.df.clean$price, n = 10)
plot.ts(ts, spec.df.clean, ylim = c(1, 3))
################################################################ | |
################################################################ | |
# here's an example of what you can eventually build up to:
# a food staples index for an entire country. | |
# NOTE(review): machine-specific path; the file must exist on the Desktop.
uuids <- read.csv('~/Desktop/uuids.csv')

# get.offline.cpi(), objects.to.global.env() and plot.offline.cpi() are not
# defined in this file; presumably they come from the sourced env.R -- confirm.
# `br` is the raw dataframe read earlier in this script.
# TRUE/FALSE are spelled out because `T`/`F` can be reassigned.
inflation <- get.offline.cpi(all.data = br, max = 2, min = .5, uuids = uuids,
                             min.obs.smooth = 100, carry.forward = FALSE,
                             clean = TRUE, trim1 = .1, trim2 = .05,
                             z1 = 5, z2 = 4,
                             obs.window.median = 3,
                             country = 'br')

# NOTE(review): judging by its name this exports the elements of `inflation`
# into the global environment, which plot.offline.cpi() then presumably reads
# -- confirm against env.R.
objects.to.global.env(inflation)
plot.offline.cpi(plot.elements = TRUE, zoom = .2)
plot.offline.cpi(plot.elements = TRUE, zoom = 4)
# Dashed horizontal reference line at 100 on the second plot.
abline(h = 100, col = 'black', lty = 2)
################################################################ | |
################################################################ | |
################################################################ | |
################################################################ | |
# premise places - kmeans algorithm | |
# NOTE(review): machine-specific path; the file must exist on the Desktop.
usa.places <- read.csv('~/Desktop/premise_places/us_places_extra_refined3.csv')
head(usa.places)

# Keep the rows whose center1 equals '1'. which() converts the logical mask to
# row indices so rows with a missing center1 are dropped rather than returned
# as all-NA rows.
foobar <- usa.places[which(usa.places$center1 == '1'), ]
hist(foobar$loc_accuracy, breaks = 90, col = 'cornflowerblue')

# cluster.by.geo() is not defined in this file; presumably it comes from the
# sourced env.R -- confirm. TRUE/FALSE are spelled out because `T`/`F` can be
# reassigned.
geo <- cluster.by.geo(foobar, guess.k.by.placenames = FALSE,
                      placename.tune = TRUE, mainplaces.proportion = .5,
                      get.k.by = 'user', zoom = .1, loc.accuracy.cutoff = 1500)
################################################################ | |
################################################################ | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment