Skip to content

Instantly share code, notes, and snippets.

@levithatcher
Last active January 25, 2019 21:15

Revisions

  1. levithatcher revised this gist Jan 9, 2017. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion CountyPublicHealthChoropleth.R
    Original file line number Diff line number Diff line change
    @@ -56,10 +56,10 @@ createPercentiles <- function(x) {
    }



    library(healthcareai)

    # Start analysis -- read in data
    # Data comes from here: http://www.countyhealthrankings.org/rankings/data
    df <- read.csv('2015 CHR Analytic Data.csv')

    # Remove state-summary rows
  2. levithatcher created this gist Jan 9, 2017.
    120 changes: 120 additions & 0 deletions CountyPublicHealthChoropleth.R
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,120 @@
    removeCommasInNumber <- function(column) {
    column <- as.numeric(gsub(",", "", column))
    column
    }

    createChoropleth <- function(df,
    colToPlot,
    title,
    legend,
    numColors=1,
    NAReplace=NULL) {

    library(choroplethr)
    library(choroplethrMaps)
    library(ggplot2)

    # Remove commas from column of interest
    if (!is.numeric(df[[colToPlot]])) {
    df[[colToPlot]] <- removeCommasInNumber(df[[colToPlot]])
    }

    # Remove state rows from dataset
    df <- subset(df, COUNTYCODE != 0)

    df$STATECODE <- as.integer(df$STATECODE)
    df$COUNTYCODE <- as.integer(df$COUNTYCODE)

    # Pad county digits
    df$COUNTYCODE <- sprintf("%03d", df$COUNTYCODE)

    # Concatenate and create FIPS
    df$FIPSCODE <- as.numeric(paste0(df$STATECODE,df$COUNTYCODE))

    # Reduce dataset and rename cols for county_choropleth func
    df <- subset(df, select = c('FIPSCODE', colToPlot))
    colnames(df) <- c("region","value")

    print("NA count:")
    print(count(is.na(df["value"])))

    # Fill NA cells with something (so choropleth works)
    if (!is.null(NAReplace)) {
    df["value"][is.na(df["value"])] <- NAReplace
    }

    str(df)

    # Plot data on a US map
    county_choropleth(df, num_colors = numColors, legend = legend) +
    ggtitle(title) +
    theme(plot.title = element_text(hjust = 0.5))
    }

    createPercentiles <- function(x) {
    (x - min(x, na.rm = TRUE))/(max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
    }



    library(healthcareai)

    # Start analysis -- read in data
    df <- read.csv('2015 CHR Analytic Data.csv')

    # Remove state-summary rows
    df <- subset(df, COUNTYCODE != 0)

    # Prepare columns
    # Convert to percentage when plotting its choropleth
    #df$Low.birthweight.Value <- df$Low.birthweight.Value * 100

    df$Median.household.income.Value <- removeCommasInNumber(df$Median.household.income.Value)
    df$Premature.death.Value <- removeCommasInNumber(df$Premature.death.Value)

    # Change from factors to numerics
    #df$Median.household.income.Value <- as.numeric(df$Median.household.income.Value)
    #df$Premature.death.Value <- as.numeric(df$Premature.death.Value)

    # Change to percentiles
    df$Median.household.income.Percentile <- round(createPercentiles(df$Median.household.income.Value), 2)

    # Change to percentiles
    # We subtract 1 from income and do abs, since we want 100th percentile to be desirable
    df$Low.birthweight.Percentile <- round(abs(createPercentiles(df$Low.birthweight.Value) - 1), 2)
    df$Premature.death.Percentile <- round(abs(createPercentiles(df$Premature.death.Value) - 1), 2)

    # Calculate diff between county income and health outcomes (leads to -1 to 1)
    df$LBW.PATWI <- round(df$Low.birthweight.Percentile -
    df$Median.household.income.Percentile, 2)

    df$Early.Death.PATWI <- round(df$Premature.death.Percentile -
    df$Median.household.income.Percentile, 2)

    df <- subset(df, select = c('County','State','Median.household.income.Value','Median.household.income.Percentile','Low.birthweight.Percentile','LBW.PATWI','Premature.death.Percentile','Early.Death.PATWI'))

    str(df)

    # Plot choropleth
    createChoropleth(df,
    #colToPlot = 'Median.household.income.Value',
    #colToPlot = 'Premature.death.Value',
    #colToPlot = 'Low.birthweight.Value',
    #colToPlot = 'LBWPcntlMinusIncomePcntl',
    colToPlot = 'EarlyDeathPctlMinusIncomePctl',
    title = 'Prime Years Lost Compared to Income by County',
    legend = 'Percentile Diff',
    numColors = 7,
    NAReplace = 0)

    # Find statistical relationships
    df$Low.birthweight.Value <- imputeColumn(df$Low.birthweight.Value)

    calculateTargetedCorrelations(df[,c('Median.household.income.Value',
    'Premature.death.Value',
    'Low.birthweight.Value')],
    targetCol = 'Median.household.income.Value')

    lm(Premature.death.Value ~ Median.household.income.Value, data = df)

    lm(Low.birthweight.Value ~ Median.household.income.Value, data = df)