sinarueeger · March 4, 2019 17:05
diff --git a/satrday-paris.R b/satrday-paris.R
 ## /////////////////////////////////
 ##       satRday Paris notes 
 ##            (untidy)
 ## /////////////////////////////////

 ## Lionel Henry: Programming in the tidyverse
 ## tidyverse (data analysis, data manipulation, data cleaning) <----> r-lib (production programming)
 ##                                       dplyr, ggplot2, tidyr <----> vctrs, rlang, devtools
 ## reproducibility by few users                                <----> reusability by many users

 ## TIDYVERSE
 ## https://principles.tidyverse.org/
 ## data (data masking), domain oriented (e.g. data cleaning), language like (verbs like mutate, arrange)

 ## rlang
 ## =tidyeval
 ## !! and !!! and enquo()
 ## requires new concepts
 ## delayed computation: loops or functions
 ## datamasking is not transitive! 

 ## options
 ## - fixed columns > no problem
 ## - mapping with purrr or *_all, *_if, *_at
 ## - tidyeval

 ## tidyeval:
 ## 1) pass the dots

 ## magrittr provides the %>% pipe (the package name was misspelled "margittr")
 library(magrittr)
 library(dplyr)
 ## ggplot2 is used by the plots further down, before its later library() call
 library(ggplot2)

 ## group_by_at
 ## Count rows per group. The grouping columns are forwarded through `...`
 ## into vars(), so they can be given as a selection (e.g. a character vector).
 my_count_by <- function(data, ...) {
   grouping <- vars(...)
   grouped <- group_by_at(data, grouping)
   summarize(grouped, n = n())
 }

 my_count_by(
   data = fivethirtyeight::bechdel,
   c("year", "binary")
 )


 ## Average every column whose name contains one of the patterns in `...`.
 ## data: data frame to summarise.
 ## ...:  character pattern(s) handed to contains(); when none are given the
 ##       function falls back to "alcohol", the value that used to be hard-coded
 ##       (previously `...` was accepted but silently ignored).
 mean_freq <- function(data, ...) {
   patterns <- c(...)
   if (is.null(patterns)) {
     patterns <- "alcohol"
   }
   data %>% summarize_at(vars(contains(patterns)), mean)
 }

 mean_freq(data = fivethirtyeight::drug_use, c("alcohol"))


 ## ! select is the only verb that does a selection and not an action (adding a new column)
 ## selections have special properties: c(), - and :
 ## group_by also creates actions
 ## if you want selections at group_by, use group_by_at

 ## Facet helper: forwards its arguments to vars(), so a facet can be given as
 ## a (possibly named) expression; facet_wrap() is called with label_both.
 my_wrap <- function(...) {
   facet_spec <- vars(...)
   facet_wrap(facet_spec, labeller = label_both)
 }


 ## Budget against year, faceted by international gross cut into 3 bins
 ggplot(data = fivethirtyeight::bechdel) +
   geom_point(mapping = aes(x = year, y = budget_2013, color = binary)) +
   my_wrap(Budget = cut_number(intgross_2013, 3)) +
   hrbrthemes::theme_ipsum()
  
 ## 2) subsetting .data
 ## use .data as a pronoun , e.g. %>% group_by(.data$gender)

 ## Mean of one column within the groups of another, with both columns named
 ## by strings and subset from .data.
 ## .data: data frame
 ## var1:  name of the grouping column (string)
 ## var2:  name of the column to average (string); NAs are dropped
 group_by_summarise <- function(.data, var1, var2) {
   .data %>%
     group_by(.data[[var1]]) %>%
     summarise(mean = mean(.data[[var2]], na.rm = TRUE))
 }

 ## fivethirtyeight is not attached in this script as shown, so the dataset
 ## has to be namespace-qualified (bare `bechdel` would not be found)
 group_by_summarise(fivethirtyeight::bechdel, "binary", "domgross")

 fivethirtyeight::bechdel %>% group_by_summarise("binary", "domgross")


 ## 3) interpolation = tidyeval
 ## delay a blueprint with enquo() and insert it back with !! 
 ## e.g.   group_by(!!enquo(var1)) %>% summarise(!!enquo(var2))

 ## example

 #
 ## Mean of one column within the groups of another, with both columns passed
 ## as bare names: each is captured with enquo() and re-inserted with !!.
 ## .data: data frame
 ## var1:  grouping column (unquoted)
 ## var2:  column to average (unquoted); NAs are dropped
 group_by_summarise <- function(.data, var1, var2) {
   .data %>%
     group_by(!!enquo(var1)) %>%
     summarise(mean = mean(!!enquo(var2), na.rm = TRUE))
 }

 ## fivethirtyeight is not attached in this script as shown, so the dataset
 ## has to be namespace-qualified (bare `bechdel` would not be found)
 group_by_summarise(fivethirtyeight::bechdel, binary, domgross)

 fivethirtyeight::bechdel %>% group_by_summarise(binary, domgross)


 ## 
 ## Long-format table: age plus every "use" column, gathered into drug/use pairs
 drug <- fivethirtyeight::drug_use %>%
   select_at(vars("age", contains("use"))) %>%
   tidyr::gather(drug, use, -age)
 ## Flag each drug whose maximum use exceeds 20
 heavy <- drug %>%
   group_by(drug) %>%
   summarize(max_larger_20 = max(use, na.rm = TRUE) > 20)
 ## `drug` is the only column the two tables share; naming the key makes the
 ## join explicit instead of relying on the implicit natural join
 drug <- drug %>% full_join(heavy, by = "drug")

 ggplot(data = drug) +
   geom_path(aes(age, use, color = drug, group = drug)) +
   facet_wrap(~max_larger_20, labeller = label_both)

 ## hrbrthemes is not attached in this script as shown, so theme_ipsum() is
 ## namespace-qualified
 ggplot(data = drug) +
   geom_path(aes(age, use, color = drug, group = drug)) +
   facet_wrap(~drug, scales = "free", labeller = label_both) +
   hrbrthemes::theme_ipsum()


 ## https://twitter.com/dreamRs_fr
 ## etienne sanchez
 ## @fanny and @victor from dreamRs show the 
 ## shiny application making job search more fun with
 ## - text mining
 ## - hierarchical clustering
 ## - shiny application


 ## bea: football and graph

 ## graph
 ## cypher is a syntax for pattern recognition , not like SQL




 ## Henrik Bengtsson
 ## -------------------------------------
 ## lapply
 ## parallel : not working for windows
 ## foreach: not perfect either

 ## ???
 ## large data
 ## actions done

 library(future)
 ## multisession replaces the deprecated multiprocess strategy
 plan(multisession)
 ## explicit future: create it, then print the future object itself
 fa <- future(sum(1:50))
 fa

 ## value() retrieves the result of a future
 fb <- future(sum(51:100))
 value(fb)

 fa %<-% prod(1:1e2) ##%<-% ## future syntax
 fa

 ## microbenchmarking

 ## time the same future assignment under a sequential plan ...
 plan(sequential)
 #system.time(value(future(sum(1:1e2))))
 system.time(x %<-% sum(1:1e6))

 ## ... and under a parallel plan
 plan(multisession)
 system.time(x %<-% sum(1:1e6))


 ## custom function

 ## Sum the elements of x one at a time, pausing 0.5 seconds after each
 ## addition. Returns 0 for an empty x.
 slow_sum <- function(x) {
   total <- 0
   for (element in x) {
     total <- total + element
     Sys.sleep(0.5)
   }
   total
 }

 library(future)
 ## switch to the multicore strategy
 plan(multicore)
 ## launch slow_sum(1:50) as a future
 fa <- future(slow_sum(1:50))
 ## value() is requested first; resolved() is then queried on the same future
 value(fa)
 resolved(fa)








 ## functions to look into (referenced here, not called)
 pryr::ast
 ## resolved() cannot be called without a future, so only the function is
 ## referenced, like its neighbours
 resolved

 ## other options
 ## NOTE(review): future_lapply is not provided by any package attached in
 ## this script as shown - presumably future.apply::future_lapply; confirm
 future_lapply
 furrr::future_map


 #- how to use bash tools within R: wrapper
 #- distributing data to nodes 
 #- using scheduler without script



 ## Suzan Baert's theme ---------------------------------------------------------------
 # points semitransparent
 library(ggplot2)
 ## nice: theme_linedraw, theme_minimal

 x <- c("theme_ipsum", "theme_ipsum_rc")

 ## Draw the same scatterplot once per theme named in x
 for (THEME in x) {
   ## hrbrthemes is not attached in this script as shown, so the theme function
   ## is fetched from its namespace by name instead of via eval(call(THEME))
   theme_fun <- getExportedValue("hrbrthemes", THEME)

   qp <- ggplot(data = fivethirtyeight::bechdel) +
     geom_point(aes(domgross, intgross, color = binary)) +
     my_wrap(c(budget = cut_number(budget, 3))) +
     theme_fun() +
     labs(
       title = "Seminal ggplot2 scatterplot example",
       subtitle = "A plot that is only useful for demonstration purposes",
       caption = "Brought to you by the letter 'g'"
     )
   print(qp)
 }
	## /////////////////////////////////
	## satRday Paris notes
	## (untidy)
	## /////////////////////////////////

	## Lionel Henry: Programming in the tidyverse
	## tidyverse (data analysis, data manipulation, data cleaning) <----> r-lib (production programming)
	## dplyr, ggplot2, tidyr <----> vctrs, rlang, devtools
	## reproducibility by few users <----> reusability by many users

	## TIDYVERSE
	## https://principles.tidyverse.org/
	## data (data masking), domain oriented (e.g. data cleaning), language like (verbs like mutate, arrange)

	## rlang
	## =tidyeval
	## !! and !!! and enquo()
	## requires new concepts
	## delayed computation: loops or functions
	## datamasking is not transitive!

	## options
	## - fixed columns > no problem
	## - mapping with purrr or *_all, *_if, *_at
	## - tidyeval

	## tidyeval:
	## 1) pass the dots

	## magrittr provides the %>% pipe (the package name was misspelled "margittr")
	library(magrittr)
	library(dplyr)
	## ggplot2 is used by the plots further down, before its later library() call
	library(ggplot2)

	## group_by_at
	## Count rows per group. The grouping columns are forwarded through `...`
	## into vars(), so they can be given as a selection (e.g. a character vector).
	my_count_by <- function(data, ...) {
	  grouping <- vars(...)
	  grouped <- group_by_at(data, grouping)
	  summarize(grouped, n = n())
	}

	my_count_by(
	  data = fivethirtyeight::bechdel,
	  c("year", "binary")
	)


	## Average every column whose name contains one of the patterns in `...`.
	## data: data frame to summarise.
	## ...:  character pattern(s) handed to contains(); when none are given the
	##       function falls back to "alcohol", the value that used to be hard-coded
	##       (previously `...` was accepted but silently ignored).
	mean_freq <- function(data, ...) {
	  patterns <- c(...)
	  if (is.null(patterns)) {
	    patterns <- "alcohol"
	  }
	  data %>% summarize_at(vars(contains(patterns)), mean)
	}

	mean_freq(data = fivethirtyeight::drug_use, c("alcohol"))


	## ! select is the only verb that does a selection and not an action (adding a new column)
	## selections have special properties: c(), - and :
	## group_by also creates actions
	## if you want selections at group_by, use group_by_at

	## Facet helper: forwards its arguments to vars(), so a facet can be given as
	## a (possibly named) expression; facet_wrap() is called with label_both.
	my_wrap <- function(...) {
	  facet_spec <- vars(...)
	  facet_wrap(facet_spec, labeller = label_both)
	}


	## Budget against year, faceted by international gross cut into 3 bins
	ggplot(data = fivethirtyeight::bechdel) +
	  geom_point(mapping = aes(x = year, y = budget_2013, color = binary)) +
	  my_wrap(Budget = cut_number(intgross_2013, 3)) +
	  hrbrthemes::theme_ipsum()

	## 2) subsetting .data
	## use .data as a pronoun , e.g. %>% group_by(.data$gender)

	## Mean of one column within the groups of another, with both columns named
	## by strings and subset from .data.
	## .data: data frame
	## var1:  name of the grouping column (string)
	## var2:  name of the column to average (string); NAs are dropped
	group_by_summarise <- function(.data, var1, var2) {
	  .data %>%
	    group_by(.data[[var1]]) %>%
	    summarise(mean = mean(.data[[var2]], na.rm = TRUE))
	}

	## fivethirtyeight is not attached in this script as shown, so the dataset
	## has to be namespace-qualified (bare `bechdel` would not be found)
	group_by_summarise(fivethirtyeight::bechdel, "binary", "domgross")

	fivethirtyeight::bechdel %>% group_by_summarise("binary", "domgross")


	## 3) interpolation = tidyeval
	## delay a blueprint with enquo() and insert it back with !!
	## e.g. group_by(!!enquo(var1)) %>% summarise(!!enquo(var2))

	## example

	#
	## Mean of one column within the groups of another, with both columns passed
	## as bare names: each is captured with enquo() and re-inserted with !!.
	## .data: data frame
	## var1:  grouping column (unquoted)
	## var2:  column to average (unquoted); NAs are dropped
	group_by_summarise <- function(.data, var1, var2) {
	  .data %>%
	    group_by(!!enquo(var1)) %>%
	    summarise(mean = mean(!!enquo(var2), na.rm = TRUE))
	}

	## fivethirtyeight is not attached in this script as shown, so the dataset
	## has to be namespace-qualified (bare `bechdel` would not be found)
	group_by_summarise(fivethirtyeight::bechdel, binary, domgross)

	fivethirtyeight::bechdel %>% group_by_summarise(binary, domgross)


	##
	## Long-format table: age plus every "use" column, gathered into drug/use pairs
	drug <- fivethirtyeight::drug_use %>%
	  select_at(vars("age", contains("use"))) %>%
	  tidyr::gather(drug, use, -age)
	## Flag each drug whose maximum use exceeds 20
	heavy <- drug %>%
	  group_by(drug) %>%
	  summarize(max_larger_20 = max(use, na.rm = TRUE) > 20)
	## `drug` is the only column the two tables share; naming the key makes the
	## join explicit instead of relying on the implicit natural join
	drug <- drug %>% full_join(heavy, by = "drug")

	ggplot(data = drug) +
	  geom_path(aes(age, use, color = drug, group = drug)) +
	  facet_wrap(~max_larger_20, labeller = label_both)

	## hrbrthemes is not attached in this script as shown, so theme_ipsum() is
	## namespace-qualified
	ggplot(data = drug) +
	  geom_path(aes(age, use, color = drug, group = drug)) +
	  facet_wrap(~drug, scales = "free", labeller = label_both) +
	  hrbrthemes::theme_ipsum()


	## https://twitter.com/dreamRs_fr
	## etienne sanchez
	## @fanny and @victor from dreamRs show the
	## shiny application making job search more fun with
	## - text mining
	## - hierarchical clustering
	## - shiny application


	## bea: football and graph

	## graph
	## cypher is a syntax for pattern recognition , not like SQL




	## Henrik Bengtsson
	## -------------------------------------
	## lapply
	## parallel : not working for windows
	## foreach: not perfect either

	## ???
	## large data
	## actions done

	library(future)
	## multisession replaces the deprecated multiprocess strategy
	plan(multisession)
	## explicit future: create it, then print the future object itself
	fa <- future(sum(1:50))
	fa

	## value() retrieves the result of a future
	fb <- future(sum(51:100))
	value(fb)

	fa %<-% prod(1:1e2) ##%<-% ## future syntax
	fa

	## microbenchmarking

	## time the same future assignment under a sequential plan ...
	plan(sequential)
	#system.time(value(future(sum(1:1e2))))
	system.time(x %<-% sum(1:1e6))

	## ... and under a parallel plan
	plan(multisession)
	system.time(x %<-% sum(1:1e6))


	## custom function

	## Sum the elements of x one at a time, pausing 0.5 seconds after each
	## addition. Returns 0 for an empty x.
	slow_sum <- function(x) {
	  total <- 0
	  for (element in x) {
	    total <- total + element
	    Sys.sleep(0.5)
	  }
	  total
	}

	library(future)
	## switch to the multicore strategy
	plan(multicore)
	## launch slow_sum(1:50) as a future
	fa <- future(slow_sum(1:50))
	## value() is requested first; resolved() is then queried on the same future
	value(fa)
	resolved(fa)








	## functions to look into (referenced here, not called)
	pryr::ast
	## resolved() cannot be called without a future, so only the function is
	## referenced, like its neighbours
	resolved

	## other options
	## NOTE(review): future_lapply is not provided by any package attached in
	## this script as shown - presumably future.apply::future_lapply; confirm
	future_lapply
	furrr::future_map


	#- how to use bash tools within R: wrapper
	#- distributing data to nodes
	#- using scheduler without script



	## Suzan Baert's theme ---------------------------------------------------------------
	# points semitransparent
	library(ggplot2)
	## nice: theme_linedraw, theme_minimal

	x <- c("theme_ipsum", "theme_ipsum_rc")

	## Draw the same scatterplot once per theme named in x
	for (THEME in x) {
	  ## hrbrthemes is not attached in this script as shown, so the theme function
	  ## is fetched from its namespace by name instead of via eval(call(THEME))
	  theme_fun <- getExportedValue("hrbrthemes", THEME)

	  qp <- ggplot(data = fivethirtyeight::bechdel) +
	    geom_point(aes(domgross, intgross, color = binary)) +
	    my_wrap(c(budget = cut_number(budget, 3))) +
	    theme_fun() +
	    labs(
	      title = "Seminal ggplot2 scatterplot example",
	      subtitle = "A plot that is only useful for demonstration purposes",
	      caption = "Brought to you by the letter 'g'"
	    )
	  print(qp)
	}