levithatcher · January 25, 2019 21:15 · Jan 9, 2017 · Jan 9, 2017
diff --git a/CountyPublicHealthChoropleth.R b/CountyPublicHealthChoropleth.R
@@ -56,10 +56,10 @@ createPercentiles <- function(x) {
 }
 
 
-
 library(healthcareai)
 
 # Start analysis -- read in data
+# Data comes from here: http://www.countyhealthrankings.org/rankings/data
 df <- read.csv('2015 CHR Analytic Data.csv')
 
 # Remove state-summary rows

diff --git a/CountyPublicHealthChoropleth.R b/CountyPublicHealthChoropleth.R
@@ -0,0 +1,120 @@
+removeCommasInNumber <- function(column) {
+  column <- as.numeric(gsub(",", "", column))
+  column
+}
+
+createChoropleth <- function(df, 
+                             colToPlot, 
+                             title,
+                             legend,
+                             numColors=1, 
+                             NAReplace=NULL) {
+
+  library(choroplethr)
+  library(choroplethrMaps)
+  library(ggplot2)
+
+  # Remove commas from column of interest
+  if (!is.numeric(df[[colToPlot]])) {
+    df[[colToPlot]] <- removeCommasInNumber(df[[colToPlot]])
+  }
+
+  # Remove state rows from dataset
+  df <- subset(df, COUNTYCODE != 0)
+
+  df$STATECODE <- as.integer(df$STATECODE)
+  df$COUNTYCODE <- as.integer(df$COUNTYCODE)
+
+  # Pad county digits
+  df$COUNTYCODE <- sprintf("%03d", df$COUNTYCODE)
+
+  # Concatenate and create FIPS
+  df$FIPSCODE <- as.numeric(paste0(df$STATECODE,df$COUNTYCODE))
+
+  # Reduce dataset and rename cols for county_choropleth func
+  df <- subset(df, select = c('FIPSCODE', colToPlot))
+  colnames(df) <- c("region","value")
+
+  print("NA count:")
+  print(count(is.na(df["value"])))
+
+  # Fill NA cells with something (so choropleth works)
+  if (!is.null(NAReplace)) {
+    df["value"][is.na(df["value"])] <- NAReplace
+  }
+
+  str(df)
+
+  # Plot data on a US map
+  county_choropleth(df, num_colors = numColors, legend = legend) +
+    ggtitle(title) +
+    theme(plot.title = element_text(hjust = 0.5))
+}
+
+createPercentiles <- function(x) {
+  (x - min(x, na.rm = TRUE))/(max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
+}
+
+
+
+library(healthcareai)
+
+# Start analysis -- read in data
+df <- read.csv('2015 CHR Analytic Data.csv')
+
+# Remove state-summary rows
+df <- subset(df, COUNTYCODE != 0)
+
+# Prepare columns
+# Convert to percentage when plotting its choropleth
+#df$Low.birthweight.Value <- df$Low.birthweight.Value * 100
+
+df$Median.household.income.Value <- removeCommasInNumber(df$Median.household.income.Value)
+df$Premature.death.Value <- removeCommasInNumber(df$Premature.death.Value)
+
+# Change from factors to numerics
+#df$Median.household.income.Value <- as.numeric(df$Median.household.income.Value)
+#df$Premature.death.Value <- as.numeric(df$Premature.death.Value)
+
+# Change to percentiles
+df$Median.household.income.Percentile <- round(createPercentiles(df$Median.household.income.Value), 2)
+
+# Change to percentiles
+# We subtract 1 from income and do abs, since we want 100th percentile to be desirable
+df$Low.birthweight.Percentile <- round(abs(createPercentiles(df$Low.birthweight.Value) - 1), 2)
+df$Premature.death.Percentile <- round(abs(createPercentiles(df$Premature.death.Value) - 1), 2)
+
+# Calculate diff between county income and health outcomes (leads to -1 to 1)
+df$LBW.PATWI <- round(df$Low.birthweight.Percentile - 
+                                       df$Median.household.income.Percentile, 2)
+
+df$Early.Death.PATWI <- round(df$Premature.death.Percentile - 
+                                            df$Median.household.income.Percentile, 2)
+
+df <- subset(df, select = c('County','State','Median.household.income.Value','Median.household.income.Percentile','Low.birthweight.Percentile','LBW.PATWI','Premature.death.Percentile','Early.Death.PATWI'))
+
+str(df)
+
+# Plot choropleth
+createChoropleth(df,
+                 #colToPlot = 'Median.household.income.Value',
+                 #colToPlot = 'Premature.death.Value', 
+                 #colToPlot = 'Low.birthweight.Value',
+                 #colToPlot = 'LBWPcntlMinusIncomePcntl',
+                 colToPlot = 'EarlyDeathPctlMinusIncomePctl',
+                 title = 'Prime Years Lost Compared to Income by County',
+                 legend = 'Percentile Diff',
+                 numColors = 7,
+                 NAReplace = 0)
+
+# Find statistical relationships
+df$Low.birthweight.Value <- imputeColumn(df$Low.birthweight.Value)
+
+calculateTargetedCorrelations(df[,c('Median.household.income.Value',
+                                    'Premature.death.Value',
+                                    'Low.birthweight.Value')],
+                              targetCol = 'Median.household.income.Value')
+
+lm(Premature.death.Value ~ Median.household.income.Value, data = df)
+
+lm(Low.birthweight.Value ~ Median.household.income.Value, data = df)