Last active
January 25, 2019 21:15
Revisions
-
levithatcher revised this gist
Jan 9, 2017 . 1 changed file with 1 addition and 1 deletion.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -56,10 +56,10 @@ createPercentiles <- function(x) { } library(healthcareai) # Start analysis -- read in data # Data comes from here: http://www.countyhealthrankings.org/rankings/data df <- read.csv('2015 CHR Analytic Data.csv') # Remove state-summary rows -
levithatcher created this gist
Jan 9, 2017 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,120 @@ removeCommasInNumber <- function(column) { column <- as.numeric(gsub(",", "", column)) column } createChoropleth <- function(df, colToPlot, title, legend, numColors=1, NAReplace=NULL) { library(choroplethr) library(choroplethrMaps) library(ggplot2) # Remove commas from column of interest if (!is.numeric(df[[colToPlot]])) { df[[colToPlot]] <- removeCommasInNumber(df[[colToPlot]]) } # Remove state rows from dataset df <- subset(df, COUNTYCODE != 0) df$STATECODE <- as.integer(df$STATECODE) df$COUNTYCODE <- as.integer(df$COUNTYCODE) # Pad county digits df$COUNTYCODE <- sprintf("%03d", df$COUNTYCODE) # Concatenate and create FIPS df$FIPSCODE <- as.numeric(paste0(df$STATECODE,df$COUNTYCODE)) # Reduce dataset and rename cols for county_choropleth func df <- subset(df, select = c('FIPSCODE', colToPlot)) colnames(df) <- c("region","value") print("NA count:") print(count(is.na(df["value"]))) # Fill NA cells with something (so choropleth works) if (!is.null(NAReplace)) { df["value"][is.na(df["value"])] <- NAReplace } str(df) # Plot data on a US map county_choropleth(df, num_colors = numColors, legend = legend) + ggtitle(title) + theme(plot.title = element_text(hjust = 0.5)) } createPercentiles <- function(x) { (x - min(x, na.rm = TRUE))/(max(x, na.rm = TRUE) - min(x, na.rm = TRUE)) } library(healthcareai) # Start analysis -- read in data df <- read.csv('2015 CHR Analytic Data.csv') # Remove state-summary rows df <- subset(df, COUNTYCODE != 0) # Prepare columns # Convert to percentage when plotting its choropleth #df$Low.birthweight.Value <- df$Low.birthweight.Value * 100 df$Median.household.income.Value <- removeCommasInNumber(df$Median.household.income.Value) df$Premature.death.Value <- removeCommasInNumber(df$Premature.death.Value) # Change from factors to numerics #df$Median.household.income.Value <- as.numeric(df$Median.household.income.Value) #df$Premature.death.Value <- as.numeric(df$Premature.death.Value) # Change to percentiles df$Median.household.income.Percentile <- round(createPercentiles(df$Median.household.income.Value), 2) # Change to percentiles # We subtract 1 from income and do abs, since we want 100th percentile to be desirable df$Low.birthweight.Percentile <- round(abs(createPercentiles(df$Low.birthweight.Value) - 1), 2) df$Premature.death.Percentile <- round(abs(createPercentiles(df$Premature.death.Value) - 1), 2) # Calculate diff between county income and health outcomes (leads to -1 to 1) df$LBW.PATWI <- round(df$Low.birthweight.Percentile - df$Median.household.income.Percentile, 2) df$Early.Death.PATWI <- round(df$Premature.death.Percentile - df$Median.household.income.Percentile, 2) df <- subset(df, select = c('County','State','Median.household.income.Value','Median.household.income.Percentile','Low.birthweight.Percentile','LBW.PATWI','Premature.death.Percentile','Early.Death.PATWI')) str(df) # Plot choropleth createChoropleth(df, #colToPlot = 'Median.household.income.Value', #colToPlot = 'Premature.death.Value', #colToPlot = 'Low.birthweight.Value', #colToPlot = 'LBWPcntlMinusIncomePcntl', colToPlot = 'EarlyDeathPctlMinusIncomePctl', title = 'Prime Years Lost Compared to Income by County', legend = 'Percentile Diff', numColors = 7, NAReplace = 0) # Find statistical relationships df$Low.birthweight.Value <- imputeColumn(df$Low.birthweight.Value) calculateTargetedCorrelations(df[,c('Median.household.income.Value', 'Premature.death.Value', 'Low.birthweight.Value')], targetCol = 'Median.household.income.Value') lm(Premature.death.Value ~ Median.household.income.Value, data = df) lm(Low.birthweight.Value ~ Median.household.income.Value, data = df)