Created
December 3, 2015 04:14
-
-
Save laurakwiley/68bac9a736ccd7955b35 to your computer and use it in GitHub Desktop.
A simple way to group timestamped data by an arbitrary maximum amount of time between events. Inspired by this (http://stackoverflow.com/a/32248462) Stack Overflow answer.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load helpful packages
library(lubridate)
library(dplyr)
# Sample data ----
# 77 web-server hits from two hosts. `X1` is the requesting host and `X2` is
# the request timestamp (POSIXct, UTC, stored as seconds since the epoch).
# Rows are in ascending timestamp order, so the two hosts are interleaved.
data <- structure(
  list(
    # Hosts alternate kenneth / atl1 / kenneth / ...; `times` gives the length
    # of each consecutive run (12 + 1 + 17 + 1 + 10 + 24 + 12 = 77 rows).
    X1 = rep(
      rep_len(c("kenneth.lerc.nasa.gov", "atl1.america.net"), 7L),
      times = c(12L, 1L, 17L, 1L, 10L, 24L, 12L)
    ),
    X2 = structure(
      c(
        807272161, 807892754, 807892821, 807892978, 807893004, 807956939,
        807957024, 807957075, 807958536, 807958583, 807958612, 807958613,
        808017688, 808151728, 808151755, 808151885, 808151897, 808151902,
        808151914, 808151930, 808151934, 808151942, 808151950, 808151966,
        808151987, 808152076, 808152077, 808152211, 808152240, 808152278,
        808443246, 808647297, 808647305, 808648062, 808648062, 808648067,
        808648291, 809002676, 809002681, 809002692, 809002794, 809518045,
        809518054, 809518076, 809518084, 809518165, 809518178, 809518190,
        809518198, 809518935, 809519059, 809519138, 809519152, 809519192,
        809519202, 809519310, 809519321, 809519331, 809519383, 809519400,
        809519623, 809519677, 809519774, 809519784, 809519837, 809606867,
        809606878, 809606884, 809618574, 809618579, 809618584, 809693098,
        809770817, 809770822, 809770827, 809770868, 809771523
      ),
      tzone = "UTC",
      class = c("POSIXct", "POSIXt")
    )
  ),
  class = c("tbl_df", "tbl", "data.frame"),
  row.names = c(NA, -77L),
  names = c("X1", "X2")
)
# Assign session IDs ----
# Groups timestamped events into sessions: within each group, a new session
# starts whenever the gap since the previous event exceeds `max_gap_secs`.
#
# Steps:
# 1. Sort by the grouping variable (here the host, X1) and then by timestamp
#    (X2). The sort is required: cumsum() below accumulates in row order, so
#    rows must already be in time order within each group or the session IDs
#    come out wrong.
# 2. Group by the grouping variable so gaps never span two different groups.
# 3. diff_time: time since the previous event in the same group. The units
#    are pinned to seconds instead of letting R pick secs/mins/hours per
#    group. The first event of each group has no predecessor, so it is NA.
# 4. flag: 1 where a new session starts (first event of the group, or a gap
#    strictly larger than `max_gap_secs`), otherwise 0.
# 5. session_id: running total of the flags within the group, so every new
#    session start bumps the ID by one. IDs restart at 1 for each group.
# 6. Ungroup so later steps do not silently inherit the grouping.

# Maximum allowed gap, in seconds, between two events of the same session.
max_gap_secs <- 120

data %>%
  arrange(X1, X2) %>%
  group_by(X1) %>%
  mutate(
    diff_time = difftime(X2, lag(X2), units = "secs"),
    flag = as.integer(
      is.na(diff_time) | as.numeric(diff_time, units = "secs") > max_gap_secs
    ),
    session_id = cumsum(flag)
  ) %>%
  ungroup()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Ok in a different version of dplyr this code works:
NOTE: You must include the arrange() step (sorting by timestamp within each group) for cumsum() to assign session IDs correctly, because cumsum() accumulates in row order.