Created
January 13, 2016 07:09
-
-
Save marcovivero/228f29d7325140f8cd69 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 1. Load packages. Install any packages if necessary. | |
library(ggplot2) | |
library(MCMCpack) | |
library(rootSolve) | |
# 2. Extract data. | |
cv = read.csv("~/customerValue.csv", header = F)[, 1] | |
pcv = cv[which(cv > 0)] | |
pcvDF = data.frame(pcv) | |
names(pcvDF) = c("pcv") | |
# 3. Compute kernel density estimate. | |
### Select number of nodes to use for functional plotting. | |
numNodes = 100000 | |
### Compute density data frame. | |
pcvDensity = density(pcv, from = 0.0001, to = numNodes, n = numNodes) | |
pcvDensityDF = data.frame(pcvDensity$x, pcvDensity$y) | |
names(pcvDensityDF) = c("x", "y") | |
# 4. Compute proportion of users with positive CV scores: 0.03038587. | |
nPCV = length(pcv) | |
propPCV = nPCV / length(cv) | |
# 5. Create first plot. | |
pcvPlot <- ggplot(data = NULL) + | |
labs( | |
title = "Empirical Positive CV Score Distribution", | |
x ="Positive CV Score (Dollars)", | |
y = NULL) + | |
theme(plot.title = element_text(face = "bold", size = 25)) + | |
theme_bw() + | |
geom_histogram( | |
aes(x = pcv, y = ..density..), | |
data = pcvDF, | |
fill = "orange", | |
alpha = 1, | |
colour = "grey", | |
lwd = 0.2, | |
binwidth = 25) + | |
geom_histogram( | |
aes(x = pcv, y = ..density..), | |
data = pcvDF, | |
fill = "light blue", | |
alpha = 0.6, | |
colour = "grey", | |
lwd = 0.2, | |
binwidth = 50) + | |
geom_histogram( | |
aes(x = pcv, y = ..density..), | |
data = pcvDF, | |
fill = "grey", | |
alpha = 0.2, | |
colour = "grey", | |
lwd = 0.2, | |
binwidth = 100) + | |
geom_path( | |
aes(x = x, y = y), | |
data = pcvDensityDF, | |
linetype = 2, | |
colour = "black") + | |
scale_x_continuous(limits = c(0, numNodes)) + | |
coord_cartesian(xlim = c(0, 2500), ylim = c(0, 0.0125)) | |
pcvPlot | |
# 6. Compute MLE estimates for alpha and beta. | |
### Initialize MLE equation as a function. | |
igConstant = -(nPCV * log(mean(1 / pcv)) + sum(log(pcv))) | |
igMLE = Vectorize(function(x) { | |
igConstant + nPCV * (log(x) + digamma(x)) | |
}) | |
### Set up function for checking if parameter estimates are indeed maxima. | |
maximaCheck = function(alpha, beta) { | |
D = (nPCV^2 / beta^2) * (alpha * trigamma(alpha) - 1) | |
secDer = -nPCV * trigamma(alpha) | |
return(D > 0 & secDer < 0) | |
} | |
### Solve MLE equations. | |
alphaMLE = uniroot.all(igMLE, c(0.00001, 100)) | |
betaMLE = alphaMLE / mean(1 / pcv) | |
### Make sure we have found a maximum, should return TRUE. | |
maximaCheck(alphaMLE, betaMLE) | |
# 7. Compute EMV estimates for alpha and beta. | |
### Estimate expectation, mode, and variance from data. | |
e = mean(pcv) | |
m = pcvDensity$x[which.max(pcvDensity$y)] | |
v = var(pcv) | |
### Intialize EMV equation as a function. | |
igEMV = Vectorize(function(x) { | |
out = ((e - m)^2 / 4) * x^4 + | |
(-v) * x^3 + | |
((8 * v - (e - m)^2) / 2) * x^2 + | |
(-5 * v) * x + | |
((8 * v + (e - m)^2) / 4) | |
}) | |
### Solve EMV equations. | |
alphaEMV = uniroot.all(igEMV, c(2, 100)) | |
betaEMV = ((e - m) / 2) * (alphaEMV ^ 2 - 1) | |
# 8. Get densities. | |
### Helper function for obtaining inverse-gamma density given a pair of parameters. | |
getInvGammaDensity = function(alpha, beta) { | |
d = Vectorize(function(x) { | |
dinvgamma(x, shape = alpha, scale = beta) | |
}) | |
return(d) | |
} | |
fittedInvGammaMLE = getInvGammaDensity(alphaMLE, betaMLE) | |
fittedInvGammaEMVOne = getInvGammaDensity(alphaEMV[1], betaEMV[1]) | |
fittedInvGammaEMVTwo = getInvGammaDensity(alphaEMV[2], betaEMV[2]) | |
# 9. Plot densities. | |
### Helper function for obtaining data frame plots. | |
getFunctionDF = function(func) { | |
x = seq(0.00001, numNodes, by = 1) | |
df = data.frame(x, func(x)) | |
names(df) = c("x", "y") | |
return(df) | |
} | |
mleDF = getFunctionDF(fittedInvGammaMLE) | |
emvOneDF = getFunctionDF(fittedInvGammaEMVOne) | |
emvTwoDF = getFunctionDF(fittedInvGammaEMVTwo) | |
dPlot <- ggplot(data = NULL) + | |
labs( | |
title = "Fitted Densities for Positive CV Score Distribution", | |
x ="Positive CV Score (Dollars)", | |
y = NULL) + | |
theme(plot.title = element_text(face = "bold", size = 25)) + | |
theme_bw() + | |
geom_histogram( | |
aes(x = pcv, y = ..density..), | |
data = pcvDF, | |
fill = "light blue", | |
alpha = 0.5, | |
colour = "grey", | |
lwd = 0.2, | |
binwidth = 25) + | |
geom_path( | |
aes(x = x, y = y, colour = "1"), | |
data = pcvDensityDF, | |
lwd = 0.65, | |
linetype = 2) + | |
geom_path( | |
aes(x = x, y = y, colour = "2"), | |
data = mleDF, | |
lwd = 0.65, | |
linetype = 2) + | |
geom_path( | |
aes(x = x, y = y, colour = "3"), | |
data = emvOneDF, | |
lwd = 0.65, | |
linetype = 2) + | |
geom_path( | |
aes(x = x, y = y, colour = "4"), | |
data = emvTwoDF, | |
lwd = 0.65, | |
linetype = 2) + | |
scale_x_continuous(limits = c(0, numNodes)) + | |
coord_cartesian(xlim = c(0, 5000), ylim = c(0, 0.0125)) + | |
scale_colour_manual( | |
values=c("black", "red", "blue", "green"), | |
name="Density Curves", | |
breaks = c("1", "2", "3", "4"), | |
labels=c( | |
"Kernel Density", | |
"Inverse-Gamma MLE Density", | |
"Inverse-Gamma EMV 1 Density", | |
"Inverse-Gamma EMV 2 Density")) | |
dPlot | |
# 10. Fit mixture model. | |
### Helper function for extracting mixtures. | |
getMixture = function(pi) { | |
mixture = Vectorize(function(x) { | |
pi[1] * fittedInvGammaMLE(x) + | |
pi[2] * fittedInvGammaEMVOne(x) + | |
pi[3] * fittedInvGammaEMVTwo(x) | |
}) | |
return(mixture) | |
} | |
### Set up Dirichlet prior and sampler. | |
dirichletSampler = function(concentrations) {as.numeric(rdirichlet(1, concentrations))} | |
### Helper function for computing transition probability. | |
computeLogProbability = function(pi) { | |
mixture = getMixture(pi) | |
return(sum(log(mixture(pcv)))) | |
} | |
### Function for running Metropolis-Hastings algorithm. | |
runMetropolisHastings = function( | |
sampleSize, | |
initialWeights, | |
concentrations = rep(1, 3)) { | |
### Initialize variables. | |
piOld = initialWeights | |
logProbOld = computeLogProbability(piOld) | |
sample = vector("list", length = sampleSize) | |
for (i in 1:sampleSize) { | |
piCand = dirichletSampler(concentrations) | |
logProbCand = computeLogProbability(piCand) | |
r = min(exp(logProbCand - logProbOld), 1) | |
if (runif(1) < r) { | |
sample[[i]] = piCand | |
piOld = piCand | |
logProbOld = logProbCand | |
} else { | |
sample[[i]] = piOld | |
} | |
print(paste0("Sample ", i, ", transition probability: ", r, | |
", sample: ", sample[[i]][1], ", ", sample[[i]][2], | |
", candidates: ", piCand[1], ", ", piCand[2])) | |
} | |
return(t(sapply(sample, as.numeric))) | |
} | |
### Start Markov Chain at different simplex vertices and [1/3, 1/3, 1/3]. | |
### I recommed you just use printed report to inspect chain at first, as there is very | |
### high auto-correlation, and restart chain after modifying concentrations | |
### to change the sampling envelope. | |
piStart = c(0.801250452048145, 0.188660534125485, 0.01008901) | |
concentrations = c(0.80, 0.19, 0.01)* 1500 | |
weightSamples = runMetropolisHastings(150, piStart, concentrations) | |
sampleMatrix = t(sapply(weightSamples, as.numeric)) | |
weights = colMeans(sampleMatrix) # These are roughly the mixing proportions. | |
# 11. Plot mixture density. | |
mixtureDF = getFunctionDF(getMixture(weights)) | |
mPlot <- ggplot(data = NULL) + | |
labs( | |
title = "Mixture Density for Positive CV Score Distribution", | |
x ="Positive CV Score (Dollars)", | |
y = NULL) + | |
theme(plot.title = element_text(face = "bold", size = 25)) + | |
theme_bw() + | |
geom_histogram( | |
aes(x = pcv, y = ..density..), | |
data = pcvDF, | |
fill = "light blue", | |
alpha = 0.5, | |
colour = "grey", | |
lwd = 0.2, | |
binwidth = 25) + | |
geom_path( | |
aes(x = x, y = y, colour = "1"), | |
data = pcvDensityDF, | |
lwd = 0.65, | |
linetype = 2) + | |
geom_path( | |
aes(x = x, y = y, colour = "2"), | |
data = mixtureDF, | |
lwd = 0.65, | |
linetype = 2) + | |
scale_x_continuous(limits = c(0, numNodes)) + | |
coord_cartesian(xlim = c(0, 2500), ylim = c(0, 0.0125)) + | |
scale_colour_manual( | |
values=c("black", "red"), | |
name="Density Curves", | |
breaks = c("1", "2"), | |
labels=c( | |
"Kernel Density", | |
"Inverse-Gamma Mixture Density")) | |
mPlot | |
# 12. Compute CV cutoff. | |
quantileFunc = function(p, func) { | |
output = Vectorize(function(x) { | |
integrate(func, 0, x)$value - p | |
}) | |
return(output) | |
} | |
cvCutoff = uniroot.all(quantileFunc(0.95, getMixture(weights)), c(0.0001, 1000)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment