marcovivero · January 13, 2016 07:09
diff --git a/cvCutoff.R b/cvCutoff.R
 # 1. Load packages. Install any packages if necessary.
 library(ggplot2)
 library(MCMCpack)
 library(rootSolve)

 # 2. Extract data.
 cv = read.csv("~/customerValue.csv", header = F)[, 1]
 pcv = cv[which(cv > 0)]
 pcvDF = data.frame(pcv)
 names(pcvDF) = c("pcv")

 # 3. Compute kernel density estimate.

 ### Select number of nodes to use for functional plotting.
 numNodes = 100000

 ### Compute density data frame.
 pcvDensity = density(pcv, from = 0.0001, to = numNodes, n = numNodes)
 pcvDensityDF = data.frame(pcvDensity$x, pcvDensity$y)
 names(pcvDensityDF) = c("x", "y")


 # 4. Compute proportion of users with positive CV scores: 0.03038587.
 nPCV = length(pcv)
 propPCV = nPCV / length(cv)

 # 5. Create first plot.

 pcvPlot <- ggplot(data = NULL) + 
    labs(
      title = "Empirical Positive CV Score Distribution", 
      x ="Positive CV Score (Dollars)", 
      y = NULL) +
    theme(plot.title = element_text(face = "bold", size = 25)) +
    theme_bw() + 
    geom_histogram(
      aes(x = pcv, y = ..density..),  
      data = pcvDF,
      fill = "orange", 
      alpha = 1, 
      colour = "grey",
      lwd = 0.2,
      binwidth = 25) +
    geom_histogram(
      aes(x = pcv, y = ..density..),  
      data = pcvDF, 
      fill = "light blue", 
      alpha = 0.6, 
      colour = "grey",
      lwd = 0.2,
      binwidth = 50) +
    geom_histogram(
      aes(x = pcv, y = ..density..),  
      data = pcvDF, 
      fill = "grey", 
      alpha = 0.2, 
      colour = "grey",
      lwd = 0.2,
      binwidth = 100) +
    geom_path(
      aes(x = x, y = y),
      data = pcvDensityDF,
      linetype = 2, 
      colour = "black") + 
    scale_x_continuous(limits = c(0, numNodes)) + 
    coord_cartesian(xlim = c(0, 2500), ylim = c(0, 0.0125))
 pcvPlot

 # 6. Compute MLE estimates for alpha and beta.

 ### Initialize MLE equation as a function.
 igConstant = -(nPCV * log(mean(1 / pcv)) + sum(log(pcv)))
 igMLE = Vectorize(function(x) {
  igConstant + nPCV * (log(x) + digamma(x))
 })

 ### Set up function for checking if parameter estimates are indeed maxima.
 maximaCheck = function(alpha, beta) {
  D = (nPCV^2 / beta^2) * (alpha * trigamma(alpha) - 1)
  secDer = -nPCV * trigamma(alpha)
  
  return(D > 0 & secDer < 0)
 }

 ### Solve MLE equations.
 alphaMLE = uniroot.all(igMLE, c(0.00001, 100))
 betaMLE = alphaMLE / mean(1 / pcv)

 ### Make sure we have found a maximum, should return TRUE.
 maximaCheck(alphaMLE, betaMLE)

 # 7. Compute EMV estimates for alpha and beta.

 ### Estimate expectation, mode, and variance from data.
 e = mean(pcv)
 m = pcvDensity$x[which.max(pcvDensity$y)]
 v = var(pcv)

 ### Intialize EMV equation as a function.
 igEMV = Vectorize(function(x) {
  out = ((e - m)^2 / 4) * x^4 +
    (-v) * x^3 + 
    ((8 * v - (e - m)^2) / 2) * x^2 + 
    (-5 * v) * x +
    ((8 * v + (e - m)^2) / 4) 
 })

 ### Solve EMV equations.
 alphaEMV = uniroot.all(igEMV, c(2, 100))
 betaEMV = ((e - m) / 2) * (alphaEMV ^ 2 - 1)

 # 8. Get densities.

 ### Helper function for obtaining inverse-gamma density given a pair of parameters.
 getInvGammaDensity = function(alpha, beta) {
  d = Vectorize(function(x) {
    dinvgamma(x, shape = alpha, scale = beta)
  })
  return(d)
 }

 fittedInvGammaMLE = getInvGammaDensity(alphaMLE, betaMLE)
 fittedInvGammaEMVOne = getInvGammaDensity(alphaEMV[1], betaEMV[1])
 fittedInvGammaEMVTwo = getInvGammaDensity(alphaEMV[2], betaEMV[2])

 # 9. Plot densities.

 ### Helper function for obtaining data frame plots.
 getFunctionDF = function(func) {
  x = seq(0.00001, numNodes, by = 1)
  df = data.frame(x, func(x))
  names(df) = c("x", "y")
  
  return(df)
 }

 mleDF = getFunctionDF(fittedInvGammaMLE)
 emvOneDF = getFunctionDF(fittedInvGammaEMVOne)
 emvTwoDF = getFunctionDF(fittedInvGammaEMVTwo)


 dPlot <- ggplot(data = NULL) + 
  labs(
    title = "Fitted Densities for Positive CV Score Distribution", 
    x ="Positive CV Score (Dollars)", 
    y = NULL) +
  theme(plot.title = element_text(face = "bold", size = 25)) +
  theme_bw() + 
  geom_histogram(
    aes(x = pcv, y = ..density..),  
    data = pcvDF,
    fill = "light blue", 
    alpha = 0.5, 
    colour = "grey",
    lwd = 0.2,
    binwidth = 25) +
  geom_path(
    aes(x = x, y = y, colour = "1"),
    data = pcvDensityDF,
    lwd = 0.65,
    linetype = 2) + 
  geom_path(
    aes(x = x, y = y, colour = "2"),
    data = mleDF,
    lwd = 0.65,
    linetype = 2) + 
  geom_path(
    aes(x = x, y = y, colour = "3"),
    data = emvOneDF,
    lwd = 0.65,
    linetype = 2) + 
  geom_path(
    aes(x = x, y = y, colour = "4"),
    data = emvTwoDF,
    lwd = 0.65,
    linetype = 2) + 
  scale_x_continuous(limits = c(0, numNodes)) + 
  coord_cartesian(xlim = c(0, 5000), ylim = c(0, 0.0125)) +
  scale_colour_manual(
    values=c("black", "red", "blue", "green"),
    name="Density Curves",
    breaks = c("1", "2", "3", "4"),
    labels=c(
      "Kernel Density",
      "Inverse-Gamma  MLE Density",
      "Inverse-Gamma EMV 1 Density",
      "Inverse-Gamma EMV 2 Density"))
 dPlot

 # 10. Fit mixture model.

 ### Helper function for extracting mixtures.
 getMixture = function(pi) {
  mixture = Vectorize(function(x) {
    pi[1] * fittedInvGammaMLE(x) + 
    pi[2] * fittedInvGammaEMVOne(x) + 
    pi[3] * fittedInvGammaEMVTwo(x)
  })
  
  return(mixture)
 }

 ### Set up Dirichlet prior and sampler.
 dirichletSampler = function(concentrations) {as.numeric(rdirichlet(1, concentrations))}

 ### Helper function for computing transition probability.
 computeLogProbability = function(pi) {
  mixture = getMixture(pi)
  
  return(sum(log(mixture(pcv))))
 }

 ### Function for running Metropolis-Hastings algorithm.
 runMetropolisHastings = function(
  sampleSize, 
  initialWeights,
  concentrations = rep(1, 3)) {
  ### Initialize variables.
  piOld = initialWeights
  logProbOld = computeLogProbability(piOld)
  sample = vector("list", length = sampleSize)
  
  for (i in 1:sampleSize) {
    piCand = dirichletSampler(concentrations)
    logProbCand = computeLogProbability(piCand)
    r = min(exp(logProbCand - logProbOld), 1)
    if (runif(1) < r) {
      sample[[i]] = piCand
      piOld = piCand
      logProbOld = logProbCand
    } else {
      sample[[i]] = piOld
    }
    
    print(paste0("Sample ", i, ", transition probability: ", r, 
                 ", sample: ", sample[[i]][1], ", ", sample[[i]][2],
                 ", candidates: ", piCand[1], ", ", piCand[2]))
  }
  
  return(t(sapply(sample, as.numeric)))
 }

 ### Start Markov Chain at different simplex vertices and [1/3, 1/3, 1/3].
 ### I recommed you just use printed report to inspect chain at first, as there is very
 ### high auto-correlation, and restart chain after modifying concentrations
 ### to change the sampling envelope.


 piStart = c(0.801250452048145, 0.188660534125485, 0.01008901) 
 concentrations = c(0.80, 0.19, 0.01)* 1500
 weightSamples = runMetropolisHastings(150, piStart, concentrations)
 sampleMatrix = t(sapply(weightSamples, as.numeric))
 weights = colMeans(sampleMatrix) # These are roughly the mixing proportions.

 # 11. Plot mixture density.

 mixtureDF = getFunctionDF(getMixture(weights))

 mPlot <- ggplot(data = NULL) + 
  labs(
    title = "Mixture Density for Positive CV Score Distribution", 
    x ="Positive CV Score (Dollars)", 
    y = NULL) +
  theme(plot.title = element_text(face = "bold", size = 25)) +
  theme_bw() + 
  geom_histogram(
    aes(x = pcv, y = ..density..),  
    data = pcvDF,
    fill = "light blue", 
    alpha = 0.5, 
    colour = "grey",
    lwd = 0.2,
    binwidth = 25) +
  geom_path(
    aes(x = x, y = y, colour = "1"),
    data = pcvDensityDF,
    lwd = 0.65,
    linetype = 2) + 
  geom_path(
    aes(x = x, y = y, colour = "2"),
    data = mixtureDF,
    lwd = 0.65,
    linetype = 2) + 
  scale_x_continuous(limits = c(0, numNodes)) + 
  coord_cartesian(xlim = c(0, 2500), ylim = c(0, 0.0125)) +
  scale_colour_manual(
    values=c("black", "red"),
    name="Density Curves",
    breaks = c("1", "2"),
    labels=c(
      "Kernel Density",
      "Inverse-Gamma Mixture Density"))
 mPlot

 # 12. Compute CV cutoff.

 quantileFunc = function(p, func) {
  output = Vectorize(function(x) {
    integrate(func, 0, x)$value - p
  })
  
  return(output)
 }

 cvCutoff = uniroot.all(quantileFunc(0.95, getMixture(weights)), c(0.0001, 1000))
	# 1. Load packages. Install any packages if necessary.
	library(ggplot2)
	library(MCMCpack)
	library(rootSolve)

	# 2. Extract data.
	cv = read.csv("~/customerValue.csv", header = F)[, 1]
	pcv = cv[which(cv > 0)]
	pcvDF = data.frame(pcv)
	names(pcvDF) = c("pcv")

	# 3. Compute kernel density estimate.

	### Select number of nodes to use for functional plotting.
	numNodes = 100000

	### Compute density data frame.
	pcvDensity = density(pcv, from = 0.0001, to = numNodes, n = numNodes)
	pcvDensityDF = data.frame(pcvDensity$x, pcvDensity$y)
	names(pcvDensityDF) = c("x", "y")


	# 4. Compute proportion of users with positive CV scores: 0.03038587.
	nPCV = length(pcv)
	propPCV = nPCV / length(cv)

	# 5. Create first plot.

	pcvPlot <- ggplot(data = NULL) +
	labs(
	title = "Empirical Positive CV Score Distribution",
	x ="Positive CV Score (Dollars)",
	y = NULL) +
	theme(plot.title = element_text(face = "bold", size = 25)) +
	theme_bw() +
	geom_histogram(
	aes(x = pcv, y = ..density..),
	data = pcvDF,
	fill = "orange",
	alpha = 1,
	colour = "grey",
	lwd = 0.2,
	binwidth = 25) +
	geom_histogram(
	aes(x = pcv, y = ..density..),
	data = pcvDF,
	fill = "light blue",
	alpha = 0.6,
	colour = "grey",
	lwd = 0.2,
	binwidth = 50) +
	geom_histogram(
	aes(x = pcv, y = ..density..),
	data = pcvDF,
	fill = "grey",
	alpha = 0.2,
	colour = "grey",
	lwd = 0.2,
	binwidth = 100) +
	geom_path(
	aes(x = x, y = y),
	data = pcvDensityDF,
	linetype = 2,
	colour = "black") +
	scale_x_continuous(limits = c(0, numNodes)) +
	coord_cartesian(xlim = c(0, 2500), ylim = c(0, 0.0125))
	pcvPlot

	# 6. Compute MLE estimates for alpha and beta.

	### Initialize MLE equation as a function.
	igConstant = -(nPCV * log(mean(1 / pcv)) + sum(log(pcv)))
	igMLE = Vectorize(function(x) {
	igConstant + nPCV * (log(x) + digamma(x))
	})

	### Set up function for checking if parameter estimates are indeed maxima.
	maximaCheck = function(alpha, beta) {
	D = (nPCV^2 / beta^2) * (alpha * trigamma(alpha) - 1)
	secDer = -nPCV * trigamma(alpha)

	return(D > 0 & secDer < 0)
	}

	### Solve MLE equations.
	alphaMLE = uniroot.all(igMLE, c(0.00001, 100))
	betaMLE = alphaMLE / mean(1 / pcv)

	### Make sure we have found a maximum, should return TRUE.
	maximaCheck(alphaMLE, betaMLE)

	# 7. Compute EMV estimates for alpha and beta.

	### Estimate expectation, mode, and variance from data.
	e = mean(pcv)
	m = pcvDensity$x[which.max(pcvDensity$y)]
	v = var(pcv)

	### Intialize EMV equation as a function.
	igEMV = Vectorize(function(x) {
	out = ((e - m)^2 / 4) * x^4 +
	(-v) * x^3 +
	((8 * v - (e - m)^2) / 2) * x^2 +
	(-5 * v) * x +
	((8 * v + (e - m)^2) / 4)
	})

	### Solve EMV equations.
	alphaEMV = uniroot.all(igEMV, c(2, 100))
	betaEMV = ((e - m) / 2) * (alphaEMV ^ 2 - 1)

	# 8. Get densities.

	### Helper function for obtaining inverse-gamma density given a pair of parameters.
	getInvGammaDensity = function(alpha, beta) {
	d = Vectorize(function(x) {
	dinvgamma(x, shape = alpha, scale = beta)
	})
	return(d)
	}

	fittedInvGammaMLE = getInvGammaDensity(alphaMLE, betaMLE)
	fittedInvGammaEMVOne = getInvGammaDensity(alphaEMV[1], betaEMV[1])
	fittedInvGammaEMVTwo = getInvGammaDensity(alphaEMV[2], betaEMV[2])

	# 9. Plot densities.

	### Helper function for obtaining data frame plots.
	getFunctionDF = function(func) {
	x = seq(0.00001, numNodes, by = 1)
	df = data.frame(x, func(x))
	names(df) = c("x", "y")

	return(df)
	}

	mleDF = getFunctionDF(fittedInvGammaMLE)
	emvOneDF = getFunctionDF(fittedInvGammaEMVOne)
	emvTwoDF = getFunctionDF(fittedInvGammaEMVTwo)


	dPlot <- ggplot(data = NULL) +
	labs(
	title = "Fitted Densities for Positive CV Score Distribution",
	x ="Positive CV Score (Dollars)",
	y = NULL) +
	theme(plot.title = element_text(face = "bold", size = 25)) +
	theme_bw() +
	geom_histogram(
	aes(x = pcv, y = ..density..),
	data = pcvDF,
	fill = "light blue",
	alpha = 0.5,
	colour = "grey",
	lwd = 0.2,
	binwidth = 25) +
	geom_path(
	aes(x = x, y = y, colour = "1"),
	data = pcvDensityDF,
	lwd = 0.65,
	linetype = 2) +
	geom_path(
	aes(x = x, y = y, colour = "2"),
	data = mleDF,
	lwd = 0.65,
	linetype = 2) +
	geom_path(
	aes(x = x, y = y, colour = "3"),
	data = emvOneDF,
	lwd = 0.65,
	linetype = 2) +
	geom_path(
	aes(x = x, y = y, colour = "4"),
	data = emvTwoDF,
	lwd = 0.65,
	linetype = 2) +
	scale_x_continuous(limits = c(0, numNodes)) +
	coord_cartesian(xlim = c(0, 5000), ylim = c(0, 0.0125)) +
	scale_colour_manual(
	values=c("black", "red", "blue", "green"),
	name="Density Curves",
	breaks = c("1", "2", "3", "4"),
	labels=c(
	"Kernel Density",
	"Inverse-Gamma MLE Density",
	"Inverse-Gamma EMV 1 Density",
	"Inverse-Gamma EMV 2 Density"))
	dPlot

	# 10. Fit mixture model.

	### Helper function for extracting mixtures.
	getMixture = function(pi) {
	mixture = Vectorize(function(x) {
	pi[1] * fittedInvGammaMLE(x) +
	pi[2] * fittedInvGammaEMVOne(x) +
	pi[3] * fittedInvGammaEMVTwo(x)
	})

	return(mixture)
	}

	### Set up Dirichlet prior and sampler.
	dirichletSampler = function(concentrations) {as.numeric(rdirichlet(1, concentrations))}

	### Helper function for computing transition probability.
	computeLogProbability = function(pi) {
	mixture = getMixture(pi)

	return(sum(log(mixture(pcv))))
	}

	### Function for running Metropolis-Hastings algorithm.
	runMetropolisHastings = function(
	sampleSize,
	initialWeights,
	concentrations = rep(1, 3)) {
	### Initialize variables.
	piOld = initialWeights
	logProbOld = computeLogProbability(piOld)
	sample = vector("list", length = sampleSize)

	for (i in 1:sampleSize) {
	piCand = dirichletSampler(concentrations)
	logProbCand = computeLogProbability(piCand)
	r = min(exp(logProbCand - logProbOld), 1)
	if (runif(1) < r) {
	sample[[i]] = piCand
	piOld = piCand
	logProbOld = logProbCand
	} else {
	sample[[i]] = piOld
	}

	print(paste0("Sample ", i, ", transition probability: ", r,
	", sample: ", sample[[i]][1], ", ", sample[[i]][2],
	", candidates: ", piCand[1], ", ", piCand[2]))
	}

	return(t(sapply(sample, as.numeric)))
	}

	### Start Markov Chain at different simplex vertices and [1/3, 1/3, 1/3].
	### I recommed you just use printed report to inspect chain at first, as there is very
	### high auto-correlation, and restart chain after modifying concentrations
	### to change the sampling envelope.


	piStart = c(0.801250452048145, 0.188660534125485, 0.01008901)
	concentrations = c(0.80, 0.19, 0.01)* 1500
	weightSamples = runMetropolisHastings(150, piStart, concentrations)
	sampleMatrix = t(sapply(weightSamples, as.numeric))
	weights = colMeans(sampleMatrix) # These are roughly the mixing proportions.

	# 11. Plot mixture density.

	mixtureDF = getFunctionDF(getMixture(weights))

	mPlot <- ggplot(data = NULL) +
	labs(
	title = "Mixture Density for Positive CV Score Distribution",
	x ="Positive CV Score (Dollars)",
	y = NULL) +
	theme(plot.title = element_text(face = "bold", size = 25)) +
	theme_bw() +
	geom_histogram(
	aes(x = pcv, y = ..density..),
	data = pcvDF,
	fill = "light blue",
	alpha = 0.5,
	colour = "grey",
	lwd = 0.2,
	binwidth = 25) +
	geom_path(
	aes(x = x, y = y, colour = "1"),
	data = pcvDensityDF,
	lwd = 0.65,
	linetype = 2) +
	geom_path(
	aes(x = x, y = y, colour = "2"),
	data = mixtureDF,
	lwd = 0.65,
	linetype = 2) +
	scale_x_continuous(limits = c(0, numNodes)) +
	coord_cartesian(xlim = c(0, 2500), ylim = c(0, 0.0125)) +
	scale_colour_manual(
	values=c("black", "red"),
	name="Density Curves",
	breaks = c("1", "2"),
	labels=c(
	"Kernel Density",
	"Inverse-Gamma Mixture Density"))
	mPlot

	# 12. Compute CV cutoff.

	quantileFunc = function(p, func) {
	output = Vectorize(function(x) {
	integrate(func, 0, x)$value - p
	})

	return(output)
	}

	cvCutoff = uniroot.all(quantileFunc(0.95, getMixture(weights)), c(0.0001, 1000))