Created
December 13, 2023 14:51
-
-
Save benzipperer/23280a6390592839a92c56c29b463929 to your computer and use it in GitHub Desktop.
weighted percentiles and reshaping
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(tidyverse)
library(MetricsWeighted)

# here's how to calculate multiple weighted percentiles by year
# and reshape them so data is long in year but wide in percentiles
# below I explain this step by step

# first grab some data: request 1979-2022 with the year, orgwgt, and wage
# variables, then keep only rows with a positive wage
# (filter() also drops rows where wage is NA)
cps_data <- epiextractr::load_org(1979:2022, year, orgwgt, wage) %>%
  filter(wage > 0)

# provide a vector of percentiles of interest, on a 0-100 scale
p <- c(10, 50, 90)

# calculate percentiles and reshape data
# reframe() returns one row per percentile within each year;
# pivot_wider() then turns those rows into one column per percentile
cps_data %>%
  reframe(
    percentile = p,
    # probs is on a 0-1 scale, hence p / 100
    value = weighted_quantile(wage, w = orgwgt, probs = p / 100),
    .by = year
  ) %>%
  pivot_wider(id_cols = year, names_from = percentile, values_from = value)

#############################################
### INCREMENTAL STEP BY STEP INSTRUCTIONS ###
#############################################

# simple example: calculate the (weighted) median wage in 2022
# summarize() is enough here because the result is a single value
cps_data %>%
  filter(year == 2022) %>%
  summarize(p50 = weighted_median(wage, w = orgwgt))

# use weighted_quantile() to calculate a different percentile,
# like the 10th percentile
# note that the probs argument to weighted_quantile() is between 0 and 1
cps_data %>%
  filter(year == 2022) %>%
  summarize(p10 = weighted_quantile(wage, w = orgwgt, probs = 0.10))

# to calculate multiple percentiles, provide a vector of percentiles
# and also switch from summarize() to reframe()
# reframe() allows for multiple rows of results, as opposed to a single
# summary row
# note that the probs argument to weighted_quantile() is still between
# 0 and 1, so the 0-100 values in p are divided by 100
p <- c(10, 50, 90)
cps_data %>%
  filter(year == 2022) %>%
  reframe(
    # label each result row with the percentile it corresponds to
    percentile = p,
    value = weighted_quantile(wage, w = orgwgt, probs = p / 100)
  )

# now we can use the .by argument of reframe() to do this by year
# the result has one row per year-percentile combination
cps_data %>%
  reframe(
    percentile = p,
    value = weighted_quantile(wage, w = orgwgt, probs = p / 100),
    .by = year
  )

# finally reshape the data so that it is long in years and wide in percentiles
# the new column names come from the percentile values (10, 50, 90), so they
# are non-syntactic and must be referred to with backticks, e.g. `10`
cps_data %>%
  reframe(
    percentile = p,
    value = weighted_quantile(wage, w = orgwgt, probs = p / 100),
    .by = year
  ) %>%
  pivot_wider(id_cols = year, names_from = percentile, values_from = value)

# extra credit: add "th" to the percentile names
# the labels are built with paste0() before reshaping, so the wide columns
# are named 10th, 50th, and 90th
cps_data %>%
  reframe(
    percentile = paste0(p, "th"),
    value = weighted_quantile(wage, w = orgwgt, probs = p / 100),
    .by = year
  ) %>%
  pivot_wider(id_cols = year, names_from = percentile, values_from = value)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment