Created
December 5, 2015 21:33
-
-
Save laurakwiley/4bfc2a657d57280b0924 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Packages used by this script: magrittr for the %<>% assignment pipe,
# tidyr for separate/gather/unite/spread, dplyr for the data verbs and
# ggplot2 for the final histogram.
library(magrittr)
library(tidyr)
library(dplyr)
library(ggplot2)

# Small example data set: two patients (PT_ID 23 and 100), two visit
# timestamps each. Every visit appears on two rows: one row carries the MoCA
# result as free text ("MHSMOCA:MOCATOTAL = <n>"), the other carries the word
# list result ("MHSWLM:CORRECT = <n>"); the column that does not apply is NA.
# stringsAsFactors = FALSE keeps the text columns as character on every R
# version (before R 4.0 data.frame() turned strings into factors by default).
data <- data.frame(
  PT_ID = c(23, 23, 23, 23, 100, 100, 100, 100),
  DATETIME = c(
    "1/1/2013 9:38", "8/9/2014 15:42",
    "1/1/2013 9:38", "8/9/2014 15:42",
    "12/21/2011 12:10", "8/9/2013 11:18",
    "12/21/2011 12:10", "8/9/2013 11:18"
  ),
  MoCA_total = c(
    "MHSMOCA:MOCATOTAL = 25", "MHSMOCA:MOCATOTAL = 22", NA, NA,
    "MHSMOCA:MOCATOTAL = 21", "MHSMOCA:MOCATOTAL = 25", NA, NA
  ),
  Total_Correct = c(
    NA, NA, "MHSWLM:CORRECT = 15", "MHSWLM:CORRECT = 20",
    NA, NA, "MHSWLM:CORRECT = 18", "MHSWLM:CORRECT = 24"
  ),
  stringsAsFactors = FALSE
) %>%
  tbl_df()
# Clean up datatypes
# DATETIME holds text such as "1/1/2013 9:38" (month/day/year hour:minute).
# It is parsed with base R rather than mdy_hm(): mdy_hm() comes from
# lubridate, and no library() call visible in this script attaches that
# package, so the original call could not be resolved. tz = "UTC" is given
# explicitly so the result does not depend on the machine's local time zone.
# The as.character() calls make sure the columns are plain character vectors
# even if they arrived as factors.
data %<>%
  mutate(
    DATETIME = as.POSIXct(
      as.character(DATETIME),
      format = "%m/%d/%Y %H:%M",
      tz = "UTC"
    ),
    MoCA_total = as.character(MoCA_total),
    Total_Correct = as.character(Total_Correct)
  )
# Pull the numeric score out of the MoCA and MHSWLM text fields.
# Each value looks like "<test name> = <score>". Splitting on "=" gives a
# label piece (text1 / text2) and a score piece; convert = TRUE asks tidyr to
# turn the score piece into a number. The label pieces are dropped right away
# because the test name is already carried by the column name.
data <- data %>%
  separate(
    col = MoCA_total,
    into = c("text1", "MoCA_total"),
    sep = "=",
    convert = TRUE
  ) %>%
  select(-text1) %>%
  separate(
    col = Total_Correct,
    into = c("text2", "Total_Correct"),
    sep = "=",
    convert = TRUE
  ) %>%
  select(-text2)
# Now make this a tidy data frame (i.e. one row per observation).
# gather() builds a key column ("test") holding the names of the selected
# columns and a value column ("score") holding the values from those columns.
# Here the missing data is uninformative (each row only ever has one of the
# two tests filled in), so na.rm = TRUE is used to drop the empty rows.
data <- data %>%
  gather(
    key = "test",
    value = "score",
    MoCA_total:Total_Correct,
    na.rm = TRUE
  )
# Now assign visit numbers for each person and test.
# Rows are ordered by DATETIME and numbered within each (PT_ID, test) group,
# so visit_id runs from 1 to the number of visits with that test (e.g. if
# patient 23 had 4 MoCA_total tests, visit_id would go 1, 2, 3, 4).
# row_number() is used instead of 1:length(PT_ID): it yields the same
# within-group sequence without relying on the `1:n` idiom.
# If you really want to create pairs of tests you can replace "row_number()"
# with "rep(c(1, 2), times = n() / 2)". This would result in visit_id being
# 1, 2, 1, 2 -- fair warning, this will cause problems down the analysis
# pipeline built in the rest of this script.
# ungroup() at the end stops the grouping from leaking into later steps.
data %<>%
  group_by(PT_ID, test) %>%
  arrange(DATETIME) %>%
  mutate(visit_id = row_number()) %>%
  ungroup()
# Now let's also make a value for the difference between visits. There are a
# few ways to do this; this is the conceptually easiest, if least elegant, one.
# Note that the entire pipeline is built in one go and is *not* assigned back
# to `data` -- this leaves the underlying data untouched.
data %>%
  # Drop any grouping left over from earlier steps (no-op when `data` is
  # already ungrouped) so that `test` can be merged into another column freely.
  ungroup() %>%
  # For this pipeline the different DATETIMEs will cause problems.
  select(-DATETIME) %>%
  # Join the test and visit columns so each test/visit pair gets its own label
  # per patient, e.g. "MoCA_total.1".
  unite(col = test_visit, sep = ".", test, visit_id) %>%
  # Turn the long data wide: one column per test/visit label.
  spread(key = test_visit, value = score) %>%
  # Calculate the differences between the second and first visit.
  mutate(
    MoCA_total.diff = MoCA_total.2 - MoCA_total.1,
    Total_Correct.diff = Total_Correct.2 - Total_Correct.1
  ) %>%
  # Bring the wide data long again. Everything except PT_ID is a score column,
  # so select by exclusion rather than by a position-dependent column range.
  gather(key = test_visit, value = score, -PT_ID) %>%
  # Separate the test and visit labels again ("visit" is 1, 2 or "diff").
  separate(col = test_visit, into = c("test", "visit"), sep = "\\.") %>%
  # Plot the histograms: score on the x axis, split by test and by
  # visit/difference. If you want these separate, filter before plotting.
  ggplot() +
  geom_histogram(aes(x = score)) +
  facet_grid(visit ~ test)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment