Last active
March 4, 2019 17:05
-
-
Save sinarueeger/02f31d41337e0ab77806d89f1e24ffdf to your computer and use it in GitHub Desktop.
satRday-paris notes https://paris2019.satrdays.org/
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## ///////////////////////////////// | |
## satRday Paris notes | |
## (untidy) | |
## ///////////////////////////////// | |
## Lionel Henry: Programming in the tidyverse | |
## tidyverse (data analysis, data manipulation, data cleaning) <----> r-lib (production programming) | |
## dplyr, ggplot2, tidyr <----> vctrs, rlang, devtools | |
## reproducibility by few users <----> reusability by many users | |
## TIDYVERSE | |
## https://principles.tidyverse.org/ | |
## data (data masking), domain oriented (e.g. data cleaning), language like (verbs like mutate, arrange) | |
## rlang | |
## =tidyeval | |
## !! and !!! and enquo() | |
## requires new concepts | |
## delayed computation: loops or functions | |
## datamasking is not transitive! | |
## options | |
## - fixed columns > no problem | |
## - mapping with purrr or *_all, *_if, *_at | |
## - tidyeval | |
## tidyeval: | |
## 1) pass the dots | |
## magrittr provides the %>% pipe; the original spelling `margittr` is a typo
## and cannot be loaded.
library(magrittr)
library(dplyr)
## ggplot2 is used by the facet/plot examples that come before its own
## library() call further down, so attach it here as well.
library(ggplot2)
## group_by_at | |
## Count rows per group.
##   data: a data frame.
##   ...:  a column selection forwarded to vars(), e.g. a character vector
##         of column names.
## Returns the result of summarising the grouped data into a count column `n`.
my_count_by <- function(data, ...) {
  grouped <- group_by_at(data, vars(...))
  summarize(grouped, n = n())
}
## Count the rows of the bechdel data by `year` and `binary`.
my_count_by(
  data = fivethirtyeight::bechdel,
  c("year", "binary")
)
## Average every column whose name contains a given pattern.
##   data: a data frame.
##   ...:  the pattern to match against column names; when nothing is
##         supplied the pattern defaults to "alcohol" (the value that used
##         to be hard-coded, so existing calls keep their result).
## Returns a one-row summary with the mean of each matching column.
mean_freq <- function(data, ...) {
  pattern <- c(...)
  if (length(pattern) == 0) {
    pattern <- "alcohol"
  }
  ## `!!` inlines the value so the selection does not depend on where the
  ## quosure created by vars() is later evaluated.
  data %>% summarize_at(vars(contains(!!pattern)), mean)
}
## Average every column of drug_use whose name contains "alcohol".
mean_freq(
  data = fivethirtyeight::drug_use,
  "alcohol"
)
## ! select is the only verb that does a selection and not an action (adding a new column) | |
## selections have special properties: c(), - and : | |
## group_by also creates actions | |
## if you want selections at group_by, use group_by_at | |
## Facet helper: forwards `...` to vars() and wraps the panels, labelling
## each one with both the variable name and its value (label_both).
my_wrap <- function(...) {
  facets <- vars(...)
  facet_wrap(facets, labeller = label_both)
}
## Scatter plot of budget_2013 against year, coloured by `binary` and
## facetted by intgross_2013 cut into three bins via cut_number().
ggplot(data = fivethirtyeight::bechdel) +
  geom_point(mapping = aes(x = year, y = budget_2013, colour = binary)) +
  my_wrap(Budget = cut_number(intgross_2013, 3)) +
  hrbrthemes::theme_ipsum()
## 2) subsetting .data | |
## use .data as a pronoun , e.g. %>% group_by(.data$gender) | |
## Group by one column and average another, both given as strings.
##   .data: a data frame.
##   var1:  name (string) of the grouping column.
##   var2:  name (string) of the column to average (NAs are dropped).
## Inside the dplyr verbs, `.data[[...]]` subsets the `.data` pronoun.
group_by_summarise <- function(.data, var1, var2) {
  grouped <- group_by(.data, .data[[var1]])
  summarise(grouped, mean = mean(.data[[var2]], na.rm = TRUE))
}
## fivethirtyeight is never attached in this script, so the dataset has to be
## namespace-qualified; a bare `bechdel` is not found.
group_by_summarise(fivethirtyeight::bechdel, "binary", "domgross")
fivethirtyeight::bechdel %>% group_by_summarise("binary", "domgross")
## 3) interpolation = tidyeval | |
## delay a blueprint with enquo() and insert it back with !! | |
## e.g. group_by(!!enquo(var1)) %>% summarise(!!enquo(var2)) | |
## example | |
# | |
## Group by one column and average another, both given as bare column names.
##   .data: a data frame.
##   var1:  grouping column (unquoted).
##   var2:  column to average (unquoted; NAs are dropped).
## The arguments are captured with enquo() and spliced back in with `!!`.
group_by_summarise <- function(.data, var1, var2) {
  group_quo <- enquo(var1)
  value_quo <- enquo(var2)
  .data %>%
    group_by(!!group_quo) %>%
    summarise(mean = mean(!!value_quo, na.rm = TRUE))
}
## fivethirtyeight is never attached in this script, so the dataset has to be
## namespace-qualified; a bare `bechdel` is not found.
group_by_summarise(fivethirtyeight::bechdel, binary, domgross)
fivethirtyeight::bechdel %>% group_by_summarise(binary, domgross)
## | |
## Long format: keep `age` plus every column whose name contains "use", then
## stack those columns into key/value pairs (drug, use).
drug <- fivethirtyeight::drug_use %>%
  select_at(vars("age", contains("use"))) %>%
  tidyr::gather(drug, use, -age)

## Per drug: does its maximum `use` value exceed 20?
heavy <- drug %>%
  group_by(drug) %>%
  summarize(max_larger_20 = max(use, na.rm = TRUE) > 20)

## Join on the only shared column; stated explicitly so the join key does not
## depend on which column names happen to overlap.
drug <- drug %>% full_join(heavy, by = "drug")

## One panel per value of max_larger_20.
ggplot(data = drug) +
  geom_path(aes(age, use, color = drug, group = drug)) +
  facet_wrap(~max_larger_20, labeller = label_both)

## One panel per drug, each with its own scales.
## theme_ipsum() lives in hrbrthemes, which is never attached in this script,
## so it is namespace-qualified here.
ggplot(data = drug) +
  geom_path(aes(age, use, color = drug, group = drug)) +
  facet_wrap(~drug, scales = "free", labeller = label_both) +
  hrbrthemes::theme_ipsum()
## https://twitter.com/dreamRs_fr | |
## etienne sanchez | |
## @fanny and @victor from dreamRs show the | |
## shiny application making job search more fun with | |
## - text mining | |
## - hierarchical clustering | |
## - shiny application | |
## bea: football and graph | |
## graph | |
## cypher is a syntax for pattern recognition , not like SQL | |
## Henrik Bengtsson | |
## ------------------------------------- | |
## lapply | |
## parallel : not working for windows | |
## foreach: not perfect either | |
## ??? | |
## large data | |
## actions done | |
library(future)
## multisession replaces the deprecated multiprocess backend.
plan(multisession)
## Explicit futures: create with future(), collect the result with value().
fa <- future(sum(1:50))
fa
fb <- future(sum(51:100))
value(fb)
## Implicit futures: %<-% is the future-assignment operator.
fa %<-% prod(1:1e2)
fa
## microbenchmarking
plan(sequential)
# system.time(value(future(sum(1:1e2))))
system.time(x %<-% sum(1:1e6))
plan(multisession)
system.time(x %<-% sum(1:1e6))
## custom function | |
## Deliberately slow sum, used to demonstrate futures.
##   x:     vector of numbers to add up.
##   delay: seconds to sleep after each element (default 0.5, the value that
##          used to be hard-coded); pass 0 to skip the waiting.
## Returns the running total as a double; 0 for empty input.
slow_sum <- function(x, delay = 0.5) {
  total <- 0
  for (value in x) {
    total <- total + value
    Sys.sleep(delay)
  }
  total
}
library(future)
## NOTE(review): multicore presumably relies on forking and may not be
## available on every platform -- confirm before relying on it.
plan(multicore)
fa <- future(slow_sum(1:50))
## value() waits for the result, so resolved() afterwards reports on a
## future that has already finished.
value(fa)
resolved(fa)
pryr::ast
## resolved()  # note only: calling it without a future is an error
## other options
## future_lapply  # note only: not defined by the packages attached here;
##                # TODO confirm it now lives in the future.apply package
furrr::future_map
#- how to use bash tools within R: wrapper | |
#- distributing data to nodes | |
#- using scheduler without script | |
## Suzanne Baert's theme --------------------------------------------------------------- | |
# points semitransparent | |
library(ggplot2) | |
## nice: theme_linedraw, theme_minimal | |
## Draw the same scatter plot once per hrbrthemes theme.
x <- c("theme_ipsum", "theme_ipsum_rc")
for (THEME in x) {
  ## Look the theme function up in the hrbrthemes namespace by name; the
  ## package is never attached here, so eval(call(THEME)) cannot find it.
  theme_fun <- getExportedValue("hrbrthemes", THEME)
  qp <- ggplot(data = fivethirtyeight::bechdel) +
    geom_point(aes(domgross, intgross, color = binary)) +
    ## Pass the facet as a named argument (as in the earlier my_wrap() call)
    ## instead of wrapping it in c().
    my_wrap(budget = cut_number(budget, 3)) +
    theme_fun() +
    labs(
      title = "Seminal ggplot2 scatterplot example",
      subtitle = "A plot that is only useful for demonstration purposes",
      caption = "Brought to you by the letter 'g'"
    )
  ## Inside a loop nothing is auto-printed, so print explicitly.
  print(qp)
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment