KBlansit · November 26, 2016 22:14
diff --git a/OSHPD data pipeline b/OSHPD data pipeline
 # import libraries
 require(data.table)
 require(dplyr)

 # define system variables
 PROCEDURE <- '^proc_p$|^oproc[0-9]{1,2}'
 DIAGNOSIS <- '^diag_p$|^odiag[0-9]{1,2}'

 ROBOTIC_PROCEDURE <- '^174[^1]{1}'
 MIN_VOL <- 25

 # diagnosis strings
 CCY_DIAGNOSIS <- '^5770|^574[0-9]{0,1}'
 APPY_DIAGNOSIS <- '^54[1-2]{1}|^540[0,1,9]{1}'
 GBP_DIAGNOSIS <- '^2780[0,1]{0,1}|^278[8,1]{1}'
 HERNIA_DIAGNOSIS <- '^5522[0,1,9]{0,1}|^5512[0,1,9]{0,1}|^5532[0,1,9]{0,1}'
 COLEC_DIAGNOSIS <- '^153[0-9]{0,1}'
 PROST_DIAGNOSIS <- '^185|^2334'
 HYSTR_DIAGNOSIS <- '^6170|^5680|^6146|^22[0,1]{1}|620[1,2]{1}|^621|^614[1,2,3,4,8]{0,1}|^625[8,9]{1}|^179|180[0,1,8,9]{0,1}'
 MYOM_DIAGNOSIS <- '^218[0,1,2,9]{0,1}|^6541'

 # procedure strings
 CCY_PROCEDURE <- '^512[1,2,3,4]{0,1}'
 APPY_PROCEDURE <- '^470[1,9]{0,1}|^54[5,2]{1}1'
 GBP_PROCEDURE <- '^443[1,9,8]{1}|^445|449[5,6,7,8,9]{1}|446[8,9]{1}'
 HERNIA_PROCEDURE <- '^535[1,9]{1}|536[1,2,9]{0,1}'
 COLEC_PROCEDURE <- '^457[1,2,3,4,5,6,9]|^458|^485|^173[1,9]{0,1}'
 PROST_PROCEDURE <- '^602[1,9]{0,1}|^609[6,7,9]{1}|^60[3,4,5]{1}|^606[1,2,9]{1}'
 HYSTR_PROCEDURE <- '^68[0-9]{0,2}'
 MYOM_PROCEDURE <- '^68[0-9]{0,2}|^6919'

 # names for procedures
 CCY_NAME = 'CCY'
 APPY_NAME = 'APPY'
 GBP_NAME = 'GPB'
 HERNIA_NAME = 'HERNIA'
 COLEC_NAME = 'COLEC'
 PROST_NAME = 'PROST'
 HYSTR_NAME = 'HYST'
 MYOM_NAME = 'MYOM'

 regex_table <- rbind(
    c(CCY_NAME, CCY_PROCEDURE, CCY_DIAGNOSIS),
    c(APPY_NAME, APPY_PROCEDURE, APPY_DIAGNOSIS),
    c(GBP_NAME, GBP_PROCEDURE, GBP_DIAGNOSIS),
    c(HERNIA_NAME, HERNIA_PROCEDURE, HERNIA_DIAGNOSIS),
    c(COLEC_NAME, COLEC_PROCEDURE, COLEC_DIAGNOSIS),
    c(PROST_NAME, PROST_PROCEDURE, PROST_DIAGNOSIS),
    c(HYSTR_NAME, HYSTR_PROCEDURE, HYSTR_DIAGNOSIS),
    c(MYOM_NAME, MYOM_PROCEDURE, MYOM_DIAGNOSIS)
 )

 colnames(regex_table) <- c('Name', 'Procedure', 'Diagnosis')
 regex_table <- as.data.frame(regex_table)

 regex_table$Name <- as.character(regex_table$Name)
 regex_table$Procedure <- as.character(regex_table$Procedure)
 regex_table$Diagnosis <- as.character(regex_table$Diagnosis)

 # read in data
 oshpd_08 <- read.csv('OSHPD_08.csv', header = T)
 oshpd_09 <- read.csv('OSHPD_09.csv', header = T)
 oshpd_10 <- read.csv('OSHPD_10.csv', header = T)
 oshpd_11 <- read.csv('OSHPD_11.csv', header = T)
 oshpd_12 <- read.csv('OSHPD_12.csv', header = T)
 oshpd_13 <- read.csv('OSHPD_13.csv', header = T)

 processOshpd <- function(year, dt, regex_table, DIAGNOSIS, PROCEDURE, robot_query) {
    processRegexRow <- function(regex_row, mtx_diag, mtx_proc, robot_query) {
        queryData <- function(mtx, regexQuery) {
            # mtx: matrix to query
            # regexQuery: regex to query
            
            loc <- grepl(regexQuery, mtx)
            dim(loc) <- dim(mtx)
            sums <- as.logical(rowSums(loc) > 0)
        }
        
        if(dim(mtx_diag)[1] != dim(mtx_proc)[1]) {
            stop('mtx_diag and mtx_proc require similiar rows')
        }
        
        rslt <- rep(NA, dim(mtx_diag)[1])
        
        proc_rows <- queryData(mtx_diag, regex_row['Diagnosis']) * queryData(mtx_proc, regex_row['Procedure'])
        rob_rows <- queryData(mtx_diag, regex_row['Diagnosis']) * queryData(mtx_proc, robot_query)
        
        proc_rows <- as.logical(proc_rows)
        rob_rows <- as.logical(rob_rows)
        
        rslt[proc_rows] <- 'non-robotic'
        rslt[rob_rows] <- 'robotic'
        
        rtn_dt = as.data.frame(rslt)
        colnames(rtn_dt) <- as.character(regex_row['Name'])
        return(rtn_dt)
    }
    
    mtx_diag <- as.matrix(dt[, grepl(DIAGNOSIS, names(dt)), with = FALSE])
    mtx_proc <- as.matrix(dt[, grepl(PROCEDURE, names(dt)), with = FALSE])
    
    lsted_dts <- apply(regex_table, 1, FUN = processRegexRow,
             mtx_diag = mtx_diag, mtx_proc = mtx_proc, robot_query = ROBOTIC_PROCEDURE)
    procs <- do.call(cbind, lsted_dts)
    
    vld_rows <- rowSums(is.na(procs)) < nrow(regex_table)
    multi_rows <- rowSums(is.na(procs)) < nrow(regex_table) - 1
    
    # hard code vars
    poor_insur <- c(2, 5, 7)
    other_cols <- c('los', 'charge', 'adm_src', 'pay_cat', 'oshpd_id')
    
    # deal with single diagnosis 
    singular_dt <- cbind(procs[vld_rows, ], dt[vld_rows, other_cols, with = FALSE])
    singular_dt$year <- year
    
    return(singular_dt)
 }

 # process OSHPD data
 df_08 <- processOshpd('08', data.table(oshpd_08), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
 df_09 <- processOshpd('09', data.table(oshpd_09), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
 df_10 <- processOshpd('10', data.table(oshpd_10), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
 df_11 <- processOshpd('11', data.table(oshpd_11), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
 df_12 <- processOshpd('12', data.table(oshpd_12), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
 df_13 <- processOshpd('13', data.table(oshpd_13), regex_table, DIAGNOSIS, PROCEDURE, robot_query)

 # bind data
 df <- rbind(df_08, df_09, df_10, df_11, df_12, df_13)

 # aggegrate
 aggegrateHosps <- function(regex_table, df) {
    aggegrateTotal <- function(df, regex_table) {
        robRows <- rowSums(df[regex_table$Name] == 'robotic', na.rm = TRUE) > 0
        df$TOTAL <- 'non-robotic'
        df$TOTAL[robRows] <- 'robotic'
        
        temp_df <- count(df, TOTAL, oshpd_id, year)
        temp_df<- temp_df[!is.na(temp_df[1]),]
        
        wide_df <- dcast(temp_df, oshpd_id + year ~ TOTAL, value.var = 'n')
        wide_df$robotic[is.na(wide_df$robotic)] <- 0 # this is okay
        colnames(wide_df)[grepl('robotic$', names(wide_df))] <- 
            paste("TOTAL", names(wide_df)[grepl('robotic$', names(wide_df))], sep = '_')
            
        wide_df$TOTAL_VOL <- wide_df$`TOTAL_non-robotic` + wide_df$TOTAL_robotic
        wide_df$TOTAL_PROP <- wide_df$TOTAL_robotic / wide_df$TOTAL_VOL
        
        rtn_cols <- c("oshpd_id", "year", "TOTAL_VOL", "TOTAL_robotic", "TOTAL_PROP")
        return(wide_df[rtn_cols])
    }
    aggegrateType <- function(type, df) {
        temp_df <- eval(parse(text = paste('count(df,', type, ', oshpd_id, year)')))
        colnames(temp_df)[1] <- type
        
        # remove NAs
        temp_df<- temp_df[!is.na(temp_df[1]),]
        
        typeFormula <- as.formula(paste('oshpd_id + year ~ ', type, sep = ''))
        wide_df <- dcast(temp_df, typeFormula, value.var = 'n')
        wide_df$robotic[is.na(wide_df$robotic)] <- 0 # this is okay
        colnames(wide_df)[grepl('robotic$', names(wide_df))] <- 
            paste(type, names(wide_df)[grepl('robotic$', names(wide_df))], sep = '_')
        
        typeTotal <- cbind(wide_df[paste(type, "_non-robotic", sep = "")] +
                wide_df[paste(type, "_robotic", sep = "")])
        typeProp <- cbind(wide_df[paste(type, "_robotic", sep = "")] / typeTotal)
        newCols <- as.data.frame(cbind(typeTotal, typeProp))
        colnames(newCols) <- paste(type, c("_VOL", "_PROP"), sep = "")
        wide_df <- cbind(wide_df, newCols)
        
        rtn_cols <- c("oshpd_id", "year", paste(type, c("_VOL", "_robotic", "_PROP"), sep = ""))
        return(wide_df[rtn_cols])
    }
    
    mainDf <- aggegrateTotal(df, regex_table)
    dfLst <- lapply(regex_table$Name, aggegrateType, df = df)
    
 }
	# import libraries
	require(data.table)
	require(dplyr)

	# define system variables
	PROCEDURE <- '^proc_p$\|^oproc[0-9]{1,2}'
	DIAGNOSIS <- '^diag_p$\|^odiag[0-9]{1,2}'

	ROBOTIC_PROCEDURE <- '^174[^1]{1}'
	MIN_VOL <- 25

	# diagnosis strings
	CCY_DIAGNOSIS <- '^5770\|^574[0-9]{0,1}'
	APPY_DIAGNOSIS <- '^54[1-2]{1}\|^540[0,1,9]{1}'
	GBP_DIAGNOSIS <- '^2780[0,1]{0,1}\|^278[8,1]{1}'
	HERNIA_DIAGNOSIS <- '^5522[0,1,9]{0,1}\|^5512[0,1,9]{0,1}\|^5532[0,1,9]{0,1}'
	COLEC_DIAGNOSIS <- '^153[0-9]{0,1}'
	PROST_DIAGNOSIS <- '^185\|^2334'
	HYSTR_DIAGNOSIS <- '^6170\|^5680\|^6146\|^22[0,1]{1}\|620[1,2]{1}\|^621\|^614[1,2,3,4,8]{0,1}\|^625[8,9]{1}\|^179\|180[0,1,8,9]{0,1}'
	MYOM_DIAGNOSIS <- '^218[0,1,2,9]{0,1}\|^6541'

	# procedure strings
	CCY_PROCEDURE <- '^512[1,2,3,4]{0,1}'
	APPY_PROCEDURE <- '^470[1,9]{0,1}\|^54[5,2]{1}1'
	GBP_PROCEDURE <- '^443[1,9,8]{1}\|^445\|449[5,6,7,8,9]{1}\|446[8,9]{1}'
	HERNIA_PROCEDURE <- '^535[1,9]{1}\|536[1,2,9]{0,1}'
	COLEC_PROCEDURE <- '^457[1,2,3,4,5,6,9]\|^458\|^485\|^173[1,9]{0,1}'
	PROST_PROCEDURE <- '^602[1,9]{0,1}\|^609[6,7,9]{1}\|^60[3,4,5]{1}\|^606[1,2,9]{1}'
	HYSTR_PROCEDURE <- '^68[0-9]{0,2}'
	MYOM_PROCEDURE <- '^68[0-9]{0,2}\|^6919'

	# names for procedures
	CCY_NAME = 'CCY'
	APPY_NAME = 'APPY'
	GBP_NAME = 'GPB'
	HERNIA_NAME = 'HERNIA'
	COLEC_NAME = 'COLEC'
	PROST_NAME = 'PROST'
	HYSTR_NAME = 'HYST'
	MYOM_NAME = 'MYOM'

	regex_table <- rbind(
	c(CCY_NAME, CCY_PROCEDURE, CCY_DIAGNOSIS),
	c(APPY_NAME, APPY_PROCEDURE, APPY_DIAGNOSIS),
	c(GBP_NAME, GBP_PROCEDURE, GBP_DIAGNOSIS),
	c(HERNIA_NAME, HERNIA_PROCEDURE, HERNIA_DIAGNOSIS),
	c(COLEC_NAME, COLEC_PROCEDURE, COLEC_DIAGNOSIS),
	c(PROST_NAME, PROST_PROCEDURE, PROST_DIAGNOSIS),
	c(HYSTR_NAME, HYSTR_PROCEDURE, HYSTR_DIAGNOSIS),
	c(MYOM_NAME, MYOM_PROCEDURE, MYOM_DIAGNOSIS)
	)

	colnames(regex_table) <- c('Name', 'Procedure', 'Diagnosis')
	regex_table <- as.data.frame(regex_table)

	regex_table$Name <- as.character(regex_table$Name)
	regex_table$Procedure <- as.character(regex_table$Procedure)
	regex_table$Diagnosis <- as.character(regex_table$Diagnosis)

	# read in data
	oshpd_08 <- read.csv('OSHPD_08.csv', header = T)
	oshpd_09 <- read.csv('OSHPD_09.csv', header = T)
	oshpd_10 <- read.csv('OSHPD_10.csv', header = T)
	oshpd_11 <- read.csv('OSHPD_11.csv', header = T)
	oshpd_12 <- read.csv('OSHPD_12.csv', header = T)
	oshpd_13 <- read.csv('OSHPD_13.csv', header = T)

	processOshpd <- function(year, dt, regex_table, DIAGNOSIS, PROCEDURE, robot_query) {
	processRegexRow <- function(regex_row, mtx_diag, mtx_proc, robot_query) {
	queryData <- function(mtx, regexQuery) {
	# mtx: matrix to query
	# regexQuery: regex to query

	loc <- grepl(regexQuery, mtx)
	dim(loc) <- dim(mtx)
	sums <- as.logical(rowSums(loc) > 0)
	}

	if(dim(mtx_diag)[1] != dim(mtx_proc)[1]) {
	stop('mtx_diag and mtx_proc require similiar rows')
	}

	rslt <- rep(NA, dim(mtx_diag)[1])

	proc_rows <- queryData(mtx_diag, regex_row['Diagnosis']) * queryData(mtx_proc, regex_row['Procedure'])
	rob_rows <- queryData(mtx_diag, regex_row['Diagnosis']) * queryData(mtx_proc, robot_query)

	proc_rows <- as.logical(proc_rows)
	rob_rows <- as.logical(rob_rows)

	rslt[proc_rows] <- 'non-robotic'
	rslt[rob_rows] <- 'robotic'

	rtn_dt = as.data.frame(rslt)
	colnames(rtn_dt) <- as.character(regex_row['Name'])
	return(rtn_dt)
	}

	mtx_diag <- as.matrix(dt[, grepl(DIAGNOSIS, names(dt)), with = FALSE])
	mtx_proc <- as.matrix(dt[, grepl(PROCEDURE, names(dt)), with = FALSE])

	lsted_dts <- apply(regex_table, 1, FUN = processRegexRow,
	mtx_diag = mtx_diag, mtx_proc = mtx_proc, robot_query = ROBOTIC_PROCEDURE)
	procs <- do.call(cbind, lsted_dts)

	vld_rows <- rowSums(is.na(procs)) < nrow(regex_table)
	multi_rows <- rowSums(is.na(procs)) < nrow(regex_table) - 1

	# hard code vars
	poor_insur <- c(2, 5, 7)
	other_cols <- c('los', 'charge', 'adm_src', 'pay_cat', 'oshpd_id')

	# deal with single diagnosis
	singular_dt <- cbind(procs[vld_rows, ], dt[vld_rows, other_cols, with = FALSE])
	singular_dt$year <- year

	return(singular_dt)
	}

	# process OSHPD data
	df_08 <- processOshpd('08', data.table(oshpd_08), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
	df_09 <- processOshpd('09', data.table(oshpd_09), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
	df_10 <- processOshpd('10', data.table(oshpd_10), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
	df_11 <- processOshpd('11', data.table(oshpd_11), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
	df_12 <- processOshpd('12', data.table(oshpd_12), regex_table, DIAGNOSIS, PROCEDURE, robot_query)
	df_13 <- processOshpd('13', data.table(oshpd_13), regex_table, DIAGNOSIS, PROCEDURE, robot_query)

	# bind data
	df <- rbind(df_08, df_09, df_10, df_11, df_12, df_13)

	# aggegrate
	aggegrateHosps <- function(regex_table, df) {
	aggegrateTotal <- function(df, regex_table) {
	robRows <- rowSums(df[regex_table$Name] == 'robotic', na.rm = TRUE) > 0
	df$TOTAL <- 'non-robotic'
	df$TOTAL[robRows] <- 'robotic'

	temp_df <- count(df, TOTAL, oshpd_id, year)
	temp_df<- temp_df[!is.na(temp_df[1]),]

	wide_df <- dcast(temp_df, oshpd_id + year ~ TOTAL, value.var = 'n')
	wide_df$robotic[is.na(wide_df$robotic)] <- 0 # this is okay
	colnames(wide_df)[grepl('robotic$', names(wide_df))] <-
	paste("TOTAL", names(wide_df)[grepl('robotic$', names(wide_df))], sep = '_')

	wide_df$TOTAL_VOL <- wide_df$`TOTAL_non-robotic` + wide_df$TOTAL_robotic
	wide_df$TOTAL_PROP <- wide_df$TOTAL_robotic / wide_df$TOTAL_VOL

	rtn_cols <- c("oshpd_id", "year", "TOTAL_VOL", "TOTAL_robotic", "TOTAL_PROP")
	return(wide_df[rtn_cols])
	}
	aggegrateType <- function(type, df) {
	temp_df <- eval(parse(text = paste('count(df,', type, ', oshpd_id, year)')))
	colnames(temp_df)[1] <- type

	# remove NAs
	temp_df<- temp_df[!is.na(temp_df[1]),]

	typeFormula <- as.formula(paste('oshpd_id + year ~ ', type, sep = ''))
	wide_df <- dcast(temp_df, typeFormula, value.var = 'n')
	wide_df$robotic[is.na(wide_df$robotic)] <- 0 # this is okay
	colnames(wide_df)[grepl('robotic$', names(wide_df))] <-
	paste(type, names(wide_df)[grepl('robotic$', names(wide_df))], sep = '_')

	typeTotal <- cbind(wide_df[paste(type, "_non-robotic", sep = "")] +
	wide_df[paste(type, "_robotic", sep = "")])
	typeProp <- cbind(wide_df[paste(type, "_robotic", sep = "")] / typeTotal)
	newCols <- as.data.frame(cbind(typeTotal, typeProp))
	colnames(newCols) <- paste(type, c("_VOL", "_PROP"), sep = "")
	wide_df <- cbind(wide_df, newCols)

	rtn_cols <- c("oshpd_id", "year", paste(type, c("_VOL", "_robotic", "_PROP"), sep = ""))
	return(wide_df[rtn_cols])
	}

	mainDf <- aggegrateTotal(df, regex_table)
	dfLst <- lapply(regex_table$Name, aggegrateType, df = df)

	}