Created
July 20, 2018 14:49
-
-
Save btskinner/c8654a4369900d7914310fbb77103c14 to your computer and use it in GitHub Desktop.
Add variable / value labels to IPEDS data in R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
################################################################################ | |
## | |
## <PROJ> Add variable / value labels to IPEDS data in R | |
## <FILE> label_ipeds.r | |
## <AUTH> Benjamin Skinner @btskinner | |
## <INIT> 12 July 2018 | |
## | |
################################################################################ | |
## USAGE ----------------------------------------------------------------------- | |
## | |
## (1) download relevant Stata data and label files from IPEDS (leave zipped) | |
## | |
## - Stata data: *_Data_Stata.zip | |
## - Stata labels: *_Stata.zip | |
## | |
## (2) change input/output directories below if desired | |
## | |
## (3) run | |
## | |
## NB: You can download zipped IPEDS files using < downloadipeds.r > script @ | |
## https://github.com/btskinner/downloadipeds | |
## ----------------------------------------------------------------------------- | |
## ----------------------------------------------------------------------------- | |
## SET I/O DIRECTORIES (DEFAULT = everything in the current directory) | |
## ----------------------------------------------------------------------------- | |
## If directory structure like this EXAMPLE: | |
## | |
## ./ | |
## |__/r_data | |
## | | |
## |__/stata_data | |
## | |-- ADM2014_Data_Stata.zip | |
## | |-- ADM2015_Data_Stata.zip | |
## | | |
## |__/stata_labels | |
## | |-- ADM2014_Stata.zip | |
## | |-- ADM2015_Stata.zip | |
## | | |
## |-- label_ipeds.r | |
## | |
## Then: | |
## | |
## labs_ddir <- file.path('.', 'stata_labels') | |
## stata_ddir <- file.path('.', 'stata_data') | |
## r_ddir <- file.path('.', 'r_data') | |
labs_ddir <- file.path('.') # path to folder w/ zipped label files | |
stata_ddir <- file.path('.') # path to folder w/ zipped Stata data | |
r_ddir <- file.path('.') # path to output folder for Rdata files | |
## ----------------------------------------------------------------------------- | |
## WANT NOISIER OUTPUT? (DEFAULT = FALSE) | |
## ----------------------------------------------------------------------------- | |
## allow readr::read_csv() messages? | |
noisy <- FALSE | |
## ----------------------------------------------------------------------------- | |
## LIBRARIES & FUNCTIONS | |
## ----------------------------------------------------------------------------- | |
## libraries | |
libs <- c('tidyverse','labelled') | |
lapply(libs, require, character.only = TRUE) | |
read_zip <- function(zipfile, type, noisy) { | |
## create a name for the dir where we'll unzip | |
zipdir <- tempfile() | |
## create the dir using that name | |
dir.create(zipdir) | |
## unzip the file into the dir | |
unzip(zipfile, exdir = zipdir) | |
## get the files into the dir | |
files <- list.files(zipdir, recursive = TRUE) | |
## chose rv file if more than two b/c IPEDS likes revisions | |
if (length(files) > 1) { | |
file <- grep('*_rv_*', tolower(files), value = TRUE) | |
if (length(file) == 0) { | |
file <- grep('*\\.csv', files, value = TRUE) | |
} | |
} else { | |
file <- files[1] | |
} | |
## get the full name of the file | |
file <- file.path(zipdir, file) | |
## read the file | |
if (type == 'csv') { | |
if (noisy) { | |
out <- read_csv(file) | |
} else { | |
out <- suppressMessages(suppressWarnings(read_csv(file, | |
progress = FALSE))) | |
} | |
} else { | |
out <- readLines(file, encoding = 'latin1') | |
} | |
## remove tmp | |
unlink(zipdir, recursive = TRUE) | |
## return | |
return(out) | |
} | |
read_labels <- function(zipfile) { | |
## read in label file | |
labs <- read_zip(zipfile, 'do') | |
## get insheet line and add one to get next line | |
line_no <- grep('insheet', labs) + 1 | |
## drop header | |
labs <- labs[line_no:length(labs)] | |
## drop first asterisk | |
labs <- gsub('^\\*(.+)$', '\\1', labs) | |
## return | |
return(labs) | |
} | |
assign_var_labels <- function(df, label_vec) { | |
## get variable label lines | |
varlabs <- grep('^label variable', label_vec, value = TRUE) | |
## if no labels, exit | |
if (length(varlabs) == 0) { return(df) } | |
## get variables that have labels | |
vars <- unlist(lapply(varlabs, function(x) { strsplit(x, ' ')[[1]][[3]] })) | |
## get the labels belonging to those variables | |
labs <- gsub('label variable .+"(.+)"', '\\1', varlabs) | |
## create list | |
varlabs <- setNames(as.list(labs), vars) | |
## assign to variables | |
var_label(df) <- varlabs | |
## return new data frame | |
return(df) | |
} | |
assign_val_labels <- function(df, label_vec) { | |
## get value label lines | |
vallabs <- grep('^label define', label_vec, value = TRUE) | |
## if no labels, exit | |
if (length(vallabs) == 0) { return(df) } | |
## get unique defined labels | |
labdefs <- unique(gsub('^label define (\\w+).+', '\\1', vallabs)) | |
## get label value lines | |
vars <- grep('^label values', label_vec, value = TRUE) | |
## make list of variable plus its value definition | |
vardef <- setNames(as.list(gsub('^label values (\\w+).+', '\\1', vars)), | |
gsub('^label values \\w+ (\\w+)\\*?.*', '\\1', vars)) | |
## make unique b/c of some double labels | |
vardef <- vardef[!duplicated(vardef)] | |
## loop through each variable | |
for (i in 1:length(labdefs)) { | |
## get label | |
labdef <- labdefs[i] | |
## skip if missing | |
if (!is.null(vardef[[labdef]])) { | |
## subset lines with this definition | |
pattern <- paste0('\\b', labdef, '\\b') | |
vallab <- grep(pattern, vallabs, value = TRUE) | |
## get values | |
pattern <- paste0('label define ', labdef, ' +(-?\\w+).+') | |
values <- gsub(pattern, '\\1', vallab) | |
## convert values to class of variable...hacky fix here | |
suppressWarnings(class(values) <- class(df[[vardef[[labdef]]]])) | |
## get labels | |
pattern <- paste0('label define ', labdef, ' .+"(.+)" ?(, ?add ?)?') | |
labels <- gsub(pattern, '\\1', vallab) | |
## make list | |
labels <- setNames(values, labels) | |
## label values | |
df[[vardef[[labdef]]]] <- labelled(df[[vardef[[labdef]]]], labels) | |
} | |
} | |
## return dataframe | |
return(df) | |
} | |
assign_imp_labels <- function(df, label_vec) { | |
## find line numbers surrounding imputation values | |
line_no_start <- grep('imputation.*variable(s)?', label_vec) + 1 | |
## if no imputation labels, exit | |
if (length(line_no_start) == 0) { return(df) } | |
line_no_stop <- grep('^tab\\b', label_vec)[[1]] - 1 | |
labs <- label_vec[line_no_start:line_no_stop] | |
## get variables starting with 'x' | |
vars <- df %>% select(starts_with('x')) %>% names(.) | |
## make list of each impute value and label | |
values <- gsub('(\\w\\b).+', '\\1', labs) | |
labels <- gsub('\\w\\b (.+)', '\\1', labs) | |
labels <- setNames(values, labels) | |
## loop through each imputed variable | |
for (v in vars) { | |
if (class(df[[v]]) == class(values)) { | |
df[[v]] <- labelled(df[[v]], labels) | |
} | |
} | |
## return dataframe | |
return(df) | |
} | |
## ----------------------------------------------------------------------------- | |
## RUN BY LOOPING THROUGH FILES | |
## ----------------------------------------------------------------------------- | |
## get list of zip files | |
stata_zip <- grep('*_Data_Stata\\.zip', list.files(stata_ddir), value = TRUE) | |
stata_lab <- grep('_Stata\\.zip', list.files(labs_ddir), value = TRUE) | |
## if stata_ddir and labs_ddir are the same, subset | |
if (identical(stata_ddir, labs_ddir)) { | |
stata_lab <- stata_lab[!(stata_lab %in% stata_zip)] | |
} | |
## loop | |
for (i in 1:length(stata_zip)) { | |
f <- stata_zip[i] | |
## message | |
message(paste0('Working with: ', f)) | |
## get basename | |
fname <- gsub('(^.+)_Data_Stata.zip', '\\1', f) | |
## get label file | |
lab_file <- grep(paste0('^', fname, '_Stata'), stata_lab, value = TRUE) | |
## skip if missing label file | |
if (length(lab_file) == 0) { | |
message(paste0(' NO LABEL FILE FOR: ', fname, ', skipping')) | |
next | |
} | |
## read in data | |
df <- read_zip(file.path(stata_ddir, f), 'csv', noisy) %>% | |
rename_all(tolower) | |
## get labels | |
labs <- read_labels(file.path(labs_ddir, lab_file)) | |
## assign variable labels | |
df <- assign_var_labels(df, labs) | |
## assign value labels | |
df <- assign_val_labels(df, labs) | |
## assign imputation labels | |
df <- assign_imp_labels(df, labs) | |
## rename data frame to match file name | |
assign(tolower(fname), df) | |
## save | |
save(list = tolower(fname), | |
file = file.path(r_ddir, paste0(fname, '.Rdata'))) | |
## garbage collect every 10 loops...may help...idk | |
if (i %% 10 == 0) { gc() } | |
} | |
## ============================================================================= | |
## END SCRIPT | |
################################################################################ |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment