Skip to content

Instantly share code, notes, and snippets.

@KBlansit
Created November 26, 2016 22:14
Show Gist options
  • Save KBlansit/278a979237cc129922b3f511017d1580 to your computer and use it in GitHub Desktop.
Save KBlansit/278a979237cc129922b3f511017d1580 to your computer and use it in GitHub Desktop.
# import libraries
require(data.table)
require(dplyr)
# define system variables
PROCEDURE <- '^proc_p$|^oproc[0-9]{1,2}'
DIAGNOSIS <- '^diag_p$|^odiag[0-9]{1,2}'
ROBOTIC_PROCEDURE <- '^174[^1]{1}'
MIN_VOL <- 25
# diagnosis strings
CCY_DIAGNOSIS <- '^5770|^574[0-9]{0,1}'
APPY_DIAGNOSIS <- '^54[1-2]{1}|^540[0,1,9]{1}'
GBP_DIAGNOSIS <- '^2780[0,1]{0,1}|^278[8,1]{1}'
HERNIA_DIAGNOSIS <- '^5522[0,1,9]{0,1}|^5512[0,1,9]{0,1}|^5532[0,1,9]{0,1}'
COLEC_DIAGNOSIS <- '^153[0-9]{0,1}'
PROST_DIAGNOSIS <- '^185|^2334'
HYSTR_DIAGNOSIS <- '^6170|^5680|^6146|^22[0,1]{1}|620[1,2]{1}|^621|^614[1,2,3,4,8]{0,1}|^625[8,9]{1}|^179|180[0,1,8,9]{0,1}'
MYOM_DIAGNOSIS <- '^218[0,1,2,9]{0,1}|^6541'
# procedure strings
CCY_PROCEDURE <- '^512[1,2,3,4]{0,1}'
APPY_PROCEDURE <- '^470[1,9]{0,1}|^54[5,2]{1}1'
GBP_PROCEDURE <- '^443[1,9,8]{1}|^445|449[5,6,7,8,9]{1}|446[8,9]{1}'
HERNIA_PROCEDURE <- '^535[1,9]{1}|536[1,2,9]{0,1}'
COLEC_PROCEDURE <- '^457[1,2,3,4,5,6,9]|^458|^485|^173[1,9]{0,1}'
PROST_PROCEDURE <- '^602[1,9]{0,1}|^609[6,7,9]{1}|^60[3,4,5]{1}|^606[1,2,9]{1}'
HYSTR_PROCEDURE <- '^68[0-9]{0,2}'
MYOM_PROCEDURE <- '^68[0-9]{0,2}|^6919'
# names for procedures
CCY_NAME = 'CCY'
APPY_NAME = 'APPY'
GBP_NAME = 'GPB'
HERNIA_NAME = 'HERNIA'
COLEC_NAME = 'COLEC'
PROST_NAME = 'PROST'
HYSTR_NAME = 'HYST'
MYOM_NAME = 'MYOM'
regex_table <- rbind(
c(CCY_NAME, CCY_PROCEDURE, CCY_DIAGNOSIS),
c(APPY_NAME, APPY_PROCEDURE, APPY_DIAGNOSIS),
c(GBP_NAME, GBP_PROCEDURE, GBP_DIAGNOSIS),
c(HERNIA_NAME, HERNIA_PROCEDURE, HERNIA_DIAGNOSIS),
c(COLEC_NAME, COLEC_PROCEDURE, COLEC_DIAGNOSIS),
c(PROST_NAME, PROST_PROCEDURE, PROST_DIAGNOSIS),
c(HYSTR_NAME, HYSTR_PROCEDURE, HYSTR_DIAGNOSIS),
c(MYOM_NAME, MYOM_PROCEDURE, MYOM_DIAGNOSIS)
)
colnames(regex_table) <- c('Name', 'Procedure', 'Diagnosis')
regex_table <- as.data.frame(regex_table)
regex_table$Name <- as.character(regex_table$Name)
regex_table$Procedure <- as.character(regex_table$Procedure)
regex_table$Diagnosis <- as.character(regex_table$Diagnosis)
# read in data
oshpd_08 <- read.csv('OSHPD_08.csv', header = T)
oshpd_09 <- read.csv('OSHPD_09.csv', header = T)
oshpd_10 <- read.csv('OSHPD_10.csv', header = T)
oshpd_11 <- read.csv('OSHPD_11.csv', header = T)
oshpd_12 <- read.csv('OSHPD_12.csv', header = T)
oshpd_13 <- read.csv('OSHPD_13.csv', header = T)
processOshpd <- function(year, dt, regex_table, DIAGNOSIS, PROCEDURE, robot_query) {
processRegexRow <- function(regex_row, mtx_diag, mtx_proc, robot_query) {
queryData <- function(mtx, regexQuery) {
# mtx: matrix to query
# regexQuery: regex to query
loc <- grepl(regexQuery, mtx)
dim(loc) <- dim(mtx)
sums <- as.logical(rowSums(loc) > 0)
}
if(dim(mtx_diag)[1] != dim(mtx_proc)[1]) {
stop('mtx_diag and mtx_proc require similiar rows')
}
rslt <- rep(NA, dim(mtx_diag)[1])
proc_rows <- queryData(mtx_diag, regex_row['Diagnosis']) * queryData(mtx_proc, regex_row['Procedure'])
rob_rows <- queryData(mtx_diag, regex_row['Diagnosis']) * queryData(mtx_proc, robot_query)
proc_rows <- as.logical(proc_rows)
rob_rows <- as.logical(rob_rows)
rslt[proc_rows] <- 'non-robotic'
rslt[rob_rows] <- 'robotic'
rtn_dt = as.data.frame(rslt)
colnames(rtn_dt) <- as.character(regex_row['Name'])
return(rtn_dt)
}
mtx_diag <- as.matrix(dt[, grepl(DIAGNOSIS, names(dt)), with = FALSE])
mtx_proc <- as.matrix(dt[, grepl(PROCEDURE, names(dt)), with = FALSE])
lsted_dts <- apply(regex_table, 1, FUN = processRegexRow,
mtx_diag = mtx_diag, mtx_proc = mtx_proc, robot_query = ROBOTIC_PROCEDURE)
procs <- do.call(cbind, lsted_dts)
vld_rows <- rowSums(is.na(procs)) < nrow(regex_table)
multi_rows <- rowSums(is.na(procs)) < nrow(regex_table) - 1
# hard code vars
poor_insur <- c(2, 5, 7)
other_cols <- c('los', 'charge', 'adm_src', 'pay_cat', 'oshpd_id')
# deal with single diagnosis
singular_dt <- cbind(procs[vld_rows, ], dt[vld_rows, other_cols, with = FALSE])
singular_dt$year <- year
return(singular_dt)
}
# process OSHPD data
df_08 <- processOshpd('08', data.table(oshpd_08), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
df_09 <- processOshpd('09', data.table(oshpd_09), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
df_10 <- processOshpd('10', data.table(oshpd_10), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
df_11 <- processOshpd('11', data.table(oshpd_11), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
df_12 <- processOshpd('12', data.table(oshpd_12), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
df_13 <- processOshpd('13', data.table(oshpd_13), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
# bind data
df <- rbind(df_08, df_09, df_10, df_11, df_12, df_13)
# aggegrate
aggegrateHosps <- function(regex_table, df) {
aggegrateTotal <- function(df, regex_table) {
robRows <- rowSums(df[regex_table$Name] == 'robotic', na.rm = TRUE) > 0
df$TOTAL <- 'non-robotic'
df$TOTAL[robRows] <- 'robotic'
temp_df <- count(df, TOTAL, oshpd_id, year)
temp_df<- temp_df[!is.na(temp_df[1]),]
wide_df <- dcast(temp_df, oshpd_id + year ~ TOTAL, value.var = 'n')
wide_df$robotic[is.na(wide_df$robotic)] <- 0 # this is okay
colnames(wide_df)[grepl('robotic$', names(wide_df))] <-
paste("TOTAL", names(wide_df)[grepl('robotic$', names(wide_df))], sep = '_')
wide_df$TOTAL_VOL <- wide_df$`TOTAL_non-robotic` + wide_df$TOTAL_robotic
wide_df$TOTAL_PROP <- wide_df$TOTAL_robotic / wide_df$TOTAL_VOL
rtn_cols <- c("oshpd_id", "year", "TOTAL_VOL", "TOTAL_robotic", "TOTAL_PROP")
return(wide_df[rtn_cols])
}
aggegrateType <- function(type, df) {
temp_df <- eval(parse(text = paste('count(df,', type, ', oshpd_id, year)')))
colnames(temp_df)[1] <- type
# remove NAs
temp_df<- temp_df[!is.na(temp_df[1]),]
typeFormula <- as.formula(paste('oshpd_id + year ~ ', type, sep = ''))
wide_df <- dcast(temp_df, typeFormula, value.var = 'n')
wide_df$robotic[is.na(wide_df$robotic)] <- 0 # this is okay
colnames(wide_df)[grepl('robotic$', names(wide_df))] <-
paste(type, names(wide_df)[grepl('robotic$', names(wide_df))], sep = '_')
typeTotal <- cbind(wide_df[paste(type, "_non-robotic", sep = "")] +
wide_df[paste(type, "_robotic", sep = "")])
typeProp <- cbind(wide_df[paste(type, "_robotic", sep = "")] / typeTotal)
newCols <- as.data.frame(cbind(typeTotal, typeProp))
colnames(newCols) <- paste(type, c("_VOL", "_PROP"), sep = "")
wide_df <- cbind(wide_df, newCols)
rtn_cols <- c("oshpd_id", "year", paste(type, c("_VOL", "_robotic", "_PROP"), sep = ""))
return(wide_df[rtn_cols])
}
mainDf <- aggegrateTotal(df, regex_table)
dfLst <- lapply(regex_table$Name, aggegrateType, df = df)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment