Created
May 15, 2018 18:55
-
-
Save saptarshiguha/8d7ed27c22d7bee48591e6cf48b84317 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--- | |
title: Have we missed profiles on ESR and Linux? | |
author: Saptarshi Guha <[email protected]> | |
date: "`r format(Sys.time(), '%H:%M %B %d, %Y',tz='America/Los_Angeles',usetz=TRUE)`" | |
output: | |
html_document: | |
mathjax: default | |
self_contained: false | |
theme: readable | |
highlight: haddock | |
number_sections: true | |
toc_depth: 5 | |
toc: true | |
--- | |
```{r child = '/home/sguha/mz/template2.Rmd'} | |
``` | |
```{r options, echo=FALSE} | |
knitr::knit_engines$set(python = reticulate::eng_python) | |
``` | |
# Background | |
For much more informed background speak with nanj on slack.Briefly, Activity | |
Stream sent a daily API call that didn't respect the privacy settings of the | |
browser. This call was sent daily and once to our servers for versions 40 ... 59 | |
inclusive. See | |
[this](https://sql.telemetry.mozilla.org/queries/52620/source#140092) for an | |
example query. | |
On ESR, companies often turn off telemetry (which is our way of getting profile | |
counts) and Linux distros customize to their taste which often includes turning | |
of telemetry. | |
With that said, i don't see any reason why the blocklist ping be turned of (since | |
this has no client information, cannot be used to track and protects the | |
browser). Hence the AS ping ought to be similar to the blocklist ping . However, | |
given this is an assumption, I'm inclined to believe the AS ping as the true ESR | |
and Linux number assuming it roughly matches the block list/submission_date based | |
daily counts on other OS/Channels. | |
# Getting Data | |
## AS Data | |
I used [this](https://sql.telemetry.mozilla.org/queries/53315/source) query to | |
get the AS counts by version, os and date. | |
```{r as1, cache=TRUE} | |
as <- fread("New_Query_2018_05_14.csv",head=TRUE) | |
setnames(as,c("date","version","os","asi")) | |
as <- as[, isesr:=grepl("^(52|45)", version)][,] | |
as <- as[, os2:=sapply(os,function(s){ | |
if(grepl("Windows",s)) "Windows" | |
else if(s %in% c("Arch Linux","BSD","CentOS","Debian","Fedora", | |
"FreeBSD","Gentoo","Kubuntu","Linux","Linux Mint","Mandriva", | |
"NetBSD","OpenBSD","openSUSE","PCLinuxOS","Red Hat","Slackware", | |
"Solaris","SUSE","Ubuntu")) "Linux" | |
else if(s %in% c("Mac OS","Mac OS X")) "OSX" | |
else "Others" | |
})][,] | |
as2 <- as[grepl("^(40|41|42|43|44|45|46|47|48|49|50|51|52|53|54|55|56|57|58|59)\\.",version)==TRUE,] | |
as2 <- as2[,ver2:=substring(version,1,2)][,] | |
as2 <- as2[, asi:=1*asi][,] | |
d1 <- strftime(as.Date(min(as2$date)), "%Y%m%d") | |
d2 <- strftime(as.Date(max(as2$date)),"%Y%m%d") | |
``` | |
## Blocklist Count | |
```{r as2, cache=TRUE,eval=TRUE} | |
d <- odbc() | |
adi <- data.table(d$q(sprintf("select | |
bl_date as date, | |
prod_os as os, | |
v_prod_major as version, | |
sum(tot_requests_on_date) as adi | |
from | |
copy_adi_dimensional_by_date | |
where bl_date >='%s' and bl_date <='%s' and product='Firefox' | |
group by 1,2,3 | |
order by 1,2,3 | |
",min(as2$date),max(as2$date)))) | |
adi <- adi[, os2:=sapply(os,function(s){ | |
if(grepl("Windows",s)) "Windows" | |
else if(s %in% c("Arch Linux","BSD","CentOS","Debian","Fedora", | |
"FreeBSD","Gentoo","Kubuntu","Linux","Linux Mint","Mandriva", | |
"NetBSD","OpenBSD","openSUSE","PCLinuxOS","Red Hat","Slackware","SunOS", | |
"Solaris","SUSE","Ubuntu")) "Linux" | |
else if(s %in% c("Darwin","Mac OS","Mac OS X")) "OSX" | |
else "Others" | |
})][,] | |
adi2 <- adi[grepl("^(40|41|42|43|44|45|46|47|48|49|50|51|52|53|54|55|56|57|58|59)\\.",version)==TRUE,] | |
adi2 <- adi2[,ver2:=substring(version,1,2)][,] | |
``` | |
## DAU count based of `submission_date_s3` | |
```{pydbx as3, cache=TRUE, dependson='as1',storein='dau'} | |
dau=spark.sql(""" | |
select | |
submission_date_s3, | |
app_version as version, | |
os as os, | |
count(distinct(client_id))*100 as dau | |
from main_summary | |
where app_name='Firefox' | |
and sample_id='42' | |
and submission_date_s3>='(__REPLACE__d1)' and submission_date_s3<='(__REPLACE__d2)' | |
group by 1,2,3 order by 1,2,3 | |
""").toPandas() | |
``` | |
## Munging | |
```{r as4, cache=TRUE, dependson=c('as1','as2','as3')} | |
dau <- dau[, os2:=sapply(os,function(s){ | |
if(grepl("Windows",s)) "Windows" | |
else if(s %in% c("Arch Linux","BSD","CentOS","Debian","Fedora", | |
"FreeBSD","Gentoo","Kubuntu","Linux","Linux Mint","Mandriva", | |
"NetBSD","OpenBSD","openSUSE","PCLinuxOS","Red Hat","Slackware","SunOS", | |
"Solaris","SUSE","Ubuntu")) "Linux" | |
else if(s %in% c("Darwin","Mac OS","Mac OS X")) "OSX" | |
else "Others" | |
})][,] | |
dau2 <- dau[grepl("^(40|41|42|43|44|45|46|47|48|49|50|51|52|53|54|55|56|57|58|59)\\.",version)==TRUE,] | |
dau2 <- dau2[,ver2:=substring(version,1,2)][,] | |
dau2 <- dau2[, date:=as.character(as.Date(as.character(submission_date_s3),format="%Y%m%d"))][, submission_date_s3:=NULL][,] | |
adi3 <- adi2[,list(adi=sum(adi)), by=list(date, os2,ver2)] | |
as3 <- as2[, list(asi=sum(asi)), by=list(date,os2,ver2)] | |
dau3 <- dau2[,list(dau=sum(dau)), by=list(date,os2,ver2)] | |
ma <- merge(merge(as3,adi3, by=c("date","os2","ver2")),dau3,by=c("date","os2",'ver2')) | |
ma <- ma[, isesr:=grepl("^(52|45)", ver2)][,] | |
``` | |
# Analyses | |
## Definitions | |
- **asi** unique installations active on a day based off the AS ping | |
- **adi** unique installations active on a day based off the Blocklist ping | |
- **dau** unique installations active on a day based off Unified Telemetry | |
## (BASELINE) Compare Known Values (Windows Non ESR) | |
Windows,non ESR ADI and ASI track each other fairly well. ADI is larger. That | |
they track each other implies ASI is a consistent signal of volume, though the | |
number may not be entirely accurate. | |
```{r fig.width=10,cache=TRUE,fig.cap='Plot of ASI, ADI and DAU (7 day smoothed) (WINDOWS ONLY, NON-ESR)',echo=FALSE} | |
ma <- ma[, date:=as.Date(date)][,] | |
ma2 <- ma[!isesr & os2=='Windows', list(adi=sum(adi),asi=sum(asi),dau=sum(dau)),by=list(date)] | |
ma2 <- ma2[,":="(sasi=filter(asi, rep(1,7)/7, sides=1), | |
sadi=filter(adi, rep(1,7)/7, side=1), | |
sdau=filter(dau, rep(1,7)/7, side=1))][,] | |
xyplot( sdau/1e6+sasi/1e6+sadi/1e6 ~ date, data=ma2,type=c('g','l'),auto.key=list(columns=3) | |
,scales=list(x=list(format='%Y-%m-%d',rot=45))) | |
``` | |
The differences are in the order of a few millions and increasing over time | |
(i.e. ADI is greater than ASI by a few million). Also the relative difference is | |
increasing time. | |
<div class = "row"> | |
<div class = "col-md-6"> | |
```{r cache=TRUE,fig.cap='Plot of ASI,ADI difference (WINDOWS, non ESR)',echo=FALSE} | |
ma <- ma[, date:=as.Date(date)][,] | |
ma2 <- ma[!isesr & os2=='Windows', list(adi=sum(adi),asi=sum(asi),dau=sum(dau)),by=list(date)] | |
ma2 <- ma2[,":="(sasi=filter(asi, rep(1,7)/7, sides=1), | |
sadi=filter(adi, rep(1,7)/7, side=1), | |
sdau=filter(dau, rep(1,7)/7, side=1))][,] | |
ma2 <- ma2[, reldiff:= (sadi - sasi)] | |
xyplot( reldiff/1e6~ date, data=ma2,type=c('g','l','r'),ylab='Difference\n(ADI-ASI) mm', | |
scales=list(x=list(format='%Y-%m-%d',rot=45))) | |
``` | |
</div> | |
<div class = "col-md-6"> | |
```{r cache=TRUE,fig.cap='Plot of ASI,ADI relative difference %(WINDOWS, non ESR)',echo=FALSE} | |
ma <- ma[, date:=as.Date(date)][,] | |
ma2 <- ma[!isesr & os2=='Windows', list(adi=sum(adi),asi=sum(asi),dau=sum(dau)),by=list(date)] | |
ma2 <- ma2[,":="(sasi=filter(asi, rep(1,7)/7, sides=1), | |
sadi=filter(adi, rep(1,7)/7, side=1), | |
sdau=filter(dau, rep(1,7)/7, side=1))][,] | |
ma2 <- ma2[, reldiff:= (sadi - sasi)/sasi * 100] | |
xyplot( reldiff~ date, data=ma2,type=c('g','l','r'),ylab='Relative Difference\n(ADI-ASI)/ASI %', | |
scales=list(x=list(format='%Y-%m-%d',rot=45))) | |
``` | |
</div> | |
</div> | |
## Compare ESR (all OS) | |
Plot of ESR volumes. Based on the veracity of the above figures, ESR is not | |
being completely measured by current methods(ADI/Telemetry). | |
```{r fig.width=10,cache=TRUE,fig.cap='Plot of ASI, ADI and DAU (7 day smoothed) (ESR)',echo=FALSE} | |
ma <- ma[, date:=as.Date(date)][,] | |
ma2 <- ma[isesr==TRUE, list(adi=sum(adi),asi=sum(asi),dau=sum(dau)),by=list(date)] | |
ma2 <- ma2[,":="(sasi=filter(asi, rep(1,7)/7, sides=1), | |
sadi=filter(adi, rep(1,7)/7, side=1), | |
sdau=filter(dau, rep(1,7)/7, side=1))][,] | |
xyplot( sdau/1e6+sasi/1e6+sadi/1e6 ~ date, data=ma2,type=c('g','l'),auto.key=list(columns=3) | |
,scales=list(x=list(format='%Y-%m-%d',rot=45))) | |
``` | |
The difference between ADI and ASI (ASI-ADI) is increasing and is in the order | |
of several millions. The discrepancy is large: for Windows, Non ESR there is a | |
6-8% relative difference between ADI and ASI. For ESR, there is a 45-55% | |
relative difference!! | |
Looking at the trend, the discrepancy might even be getting larger over time. | |
**As opposed to the graphs above, we look at $\frac{ASI-ADI}{ASI}$ (not | |
$(ADI-ASI)$) since ASI for ESR/ Linux is higher than ADI** | |
<div class = "row"> | |
<div class = "col-md-6"> | |
```{r cache=TRUE,fig.cap='Plot of ASI,ADI difference (ESR)',echo=FALSE} | |
ma <- ma[, date:=as.Date(date)][,] | |
ma2 <- ma[isesr==TRUE, list(adi=sum(adi),asi=sum(asi),dau=sum(dau)),by=list(date)] | |
ma2 <- ma2[,":="(sasi=filter(asi, rep(1,7)/7, sides=1), | |
sadi=filter(adi, rep(1,7)/7, side=1), | |
sdau=filter(dau, rep(1,7)/7, side=1))][,] | |
ma2 <- ma2[, reldiff:= (sasi - sadi)] | |
xyplot( reldiff/1e6~ date, data=ma2,type=c('g','l','r'),ylab='Difference\n(ASI-ADI) mm', | |
scales=list(x=list(format='%Y-%m-%d',rot=45))) | |
``` | |
</div> | |
<div class = "col-md-6"> | |
```{r cache=TRUE,fig.cap='Plot of ASI,ADI relative difference %(ESR)',echo=FALSE} | |
ma <- ma[, date:=as.Date(date)][,] | |
ma2 <- ma[isesr==TRUE, list(adi=sum(adi),asi=sum(asi),dau=sum(dau)),by=list(date)] | |
ma2 <- ma2[,":="(sasi=filter(asi, rep(1,7)/7, sides=1), | |
sadi=filter(adi, rep(1,7)/7, side=1), | |
sdau=filter(dau, rep(1,7)/7, side=1))][,] | |
ma2 <- ma2[, reldiff:= (sasi - sadi)/sasi*100] | |
xyplot( reldiff~ date, data=ma2,type=c('g','l','r'),ylab='Difference\n(ASI-ADI)/ASI %', | |
scales=list(x=list(format='%Y-%m-%d',rot=45))) | |
``` | |
</div> | |
</div> | |
## Compare Linux | |
Plot of Linux (irrespective of ESR) volumes. Based on the veracity of the above figures, ESR is not | |
being completely measured by current methods(ADI/Telemetry). | |
**Cannot explain odd bump in March 2018** | |
```{r fig.width=10,cache=TRUE,fig.cap='Plot of ASI, ADI and DAU (7 day smoothed) (Linux)',echo=FALSE} | |
ma <- ma[, date:=as.Date(date)][,] | |
ma2 <- ma[os2=='Linux', list(adi=sum(adi),asi=sum(asi),dau=sum(dau)),by=list(date)] | |
ma2 <- ma2[,":="(sasi=filter(asi, rep(1,7)/7, sides=1), | |
sadi=filter(adi, rep(1,7)/7, side=1), | |
sdau=filter(dau, rep(1,7)/7, side=1))][,] | |
xyplot( sdau/1e6+sasi/1e6+sadi/1e6 ~ date, data=ma2,type=c('g','l'),auto.key=list(columns=3) | |
,scales=list(x=list(format='%Y-%m-%d',rot=45))) | |
``` | |
The difference between ADI and ASI (ASI-ADI) is increasing and is in the order | |
of several millions. The discrepancy is large: for Windows, Non ESR there is a | |
6-8% relative difference between ADI and ASI. For Linux, there is a 70-90% | |
relative difference!! | |
Looking at the trend, the discrepancy might even be getting larger over time. | |
<div class = "row"> | |
<div class = "col-md-6"> | |
```{r cache=TRUE,fig.cap='Plot of ASI,ADI difference (Linux)',echo=FALSE} | |
ma <- ma[, date:=as.Date(date)][,] | |
ma2 <- ma[os2=='Linux', list(adi=sum(adi),asi=sum(asi),dau=sum(dau)),by=list(date)] | |
ma2 <- ma2[,":="(sasi=filter(asi, rep(1,7)/7, sides=1), | |
sadi=filter(adi, rep(1,7)/7, side=1), | |
sdau=filter(dau, rep(1,7)/7, side=1))][,] | |
ma2 <- ma2[, reldiff:= (sasi - sadi)] | |
xyplot( reldiff/1e6~ date, data=ma2,type=c('g','l','r'),ylab='Difference\n(ASI-ADI) mm', | |
scales=list(x=list(format='%Y-%m-%d',rot=45))) | |
``` | |
</div> | |
<div class = "col-md-6"> | |
```{r cache=TRUE,fig.cap='Plot of ASI,ADI relative difference %(Linux)',echo=FALSE} | |
ma <- ma[, date:=as.Date(date)][,] | |
ma2 <- ma[os2=='Linux', list(adi=sum(adi),asi=sum(asi),dau=sum(dau)),by=list(date)] | |
ma2 <- ma2[,":="(sasi=filter(asi, rep(1,7)/7, sides=1), | |
sadi=filter(adi, rep(1,7)/7, side=1), | |
sdau=filter(dau, rep(1,7)/7, side=1))][,] | |
ma2 <- ma2[, reldiff:= (sasi - sadi)/sasi*100] | |
xyplot( reldiff~ date, data=ma2,type=c('g','l','r'),ylab='Difference\n(ASI-ADI)/ASI %', | |
scales=list(x=list(format='%Y-%m-%d',rot=45))) | |
``` | |
</div> | |
</div> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment