Last active
February 24, 2017 11:48
-
-
Save georgf/94ca77fe6174ec07077504b24379932a to your computer and use it in GitHub Desktop.
histogram-empty-key-counts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Find histograms with empty keys" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Find histograms with empty keys" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false, | |
"scrolled": true | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Populating the interactive namespace from numpy and matplotlib\n" | |
] | |
}, | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"/home/hadoop/anaconda2/lib/python2.7/site-packages/IPython/core/magics/pylab.py:161: UserWarning:\n", | |
"\n", | |
"pylab import has clobbered these variables: ['Figure', 'Annotation']\n", | |
"`%matplotlib` prevents importing * from pylab and numpy\n", | |
"\n" | |
] | |
} | |
], | |
"source": [ | |
"import ujson as json\n", | |
"import matplotlib.pyplot as plt\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"import plotly.plotly as py\n", | |
"from plotly.graph_objs import *\n", | |
"import IPython\n", | |
"\n", | |
"from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client\n", | |
"from pprint import pprint\n", | |
"\n", | |
"%pylab inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"channels = [\"nightly\", \"aurora\", \"beta\", \"release\"]\n", | |
"submission_dates = (\"20170122\", \"20170222\")\n", | |
"fractions = {\n", | |
" \"nightly\": 0.1,\n", | |
" \"aurora\": 0.1,\n", | |
" \"beta\": 0.1,\n", | |
" \"release\": 0.003\n", | |
"}\n", | |
"pings = {}\n", | |
"\n", | |
"for c in channels:\n", | |
" pings[c] = get_pings(sc,\n", | |
" app=\"Firefox\",\n", | |
" channel=c,\n", | |
" doc_type=\"main\",\n", | |
" fraction=fractions[c],\n", | |
" submission_date=submission_dates)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Now extract the names of all keyed histograms that contain an empty-string key, considering every ping whose payload looks valid." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"def get_keyed_histograms(p):\n", | |
" if not isinstance(p, dict) or \\\n", | |
" \"payload\" not in p or \\\n", | |
" not isinstance(p[\"payload\"], dict) or \\\n", | |
" \"keyedHistograms\" not in p[\"payload\"] or \\\n", | |
" not isinstance(p[\"payload\"][\"keyedHistograms\"], dict):\n", | |
" return {}\n", | |
" return p.get(\"payload\", {}).get(\"keyedHistograms\", {})\n", | |
"\n", | |
"# This extracts the keyed histograms names which have an empty key string.\n", | |
"def extract_affected_histograms(p):\n", | |
" khs = get_keyed_histograms(p)\n", | |
" names = [name for name,kh in khs.iteritems() if \"\" in kh]\n", | |
" return names\n", | |
"\n", | |
"extracts = {}\n", | |
"\n", | |
"for c,ps in pings.iteritems():\n", | |
" extracts[c] = ps.flatMap(extract_affected_histograms)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Let's get sorted lists of the hit counts per channel." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"nameCounts = {}\n", | |
"for channel,names in extracts.iteritems():\n", | |
" counts = names.countByValue()\n", | |
" nameCounts[channel] = sorted(counts.iteritems(), key=lambda t: t[1], reverse=True)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"nightly\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th># of hits in nightly</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n", | |
" <td>794</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>URLCLASSIFIER_UPDATE_REMOTE_STATUS2</th>\n", | |
" <td>104</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>CANVAS_WEBGL_ACCL_FAILURE_ID</th>\n", | |
" <td>76</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>CANVAS_WEBGL_FAILURE_ID</th>\n", | |
" <td>16</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" # of hits in nightly\n", | |
"DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED 794\n", | |
"URLCLASSIFIER_UPDATE_REMOTE_STATUS2 104\n", | |
"CANVAS_WEBGL_ACCL_FAILURE_ID 76\n", | |
"CANVAS_WEBGL_FAILURE_ID 16" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"aurora\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th># of hits in aurora</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n", | |
" <td>42422</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>CANVAS_WEBGL_ACCL_FAILURE_ID</th>\n", | |
" <td>75</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>CANVAS_WEBGL_FAILURE_ID</th>\n", | |
" <td>6</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" # of hits in aurora\n", | |
"DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED 42422\n", | |
"CANVAS_WEBGL_ACCL_FAILURE_ID 75\n", | |
"CANVAS_WEBGL_FAILURE_ID 6" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"beta\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th># of hits in beta</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>CANVAS_WEBGL_ACCL_FAILURE_ID</th>\n", | |
" <td>75355</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n", | |
" <td>53192</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>CANVAS_WEBGL_FAILURE_ID</th>\n", | |
" <td>596</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>FX_MIGRATION_ERRORS</th>\n", | |
" <td>2</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" # of hits in beta\n", | |
"CANVAS_WEBGL_ACCL_FAILURE_ID 75355\n", | |
"DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED 53192\n", | |
"CANVAS_WEBGL_FAILURE_ID 596\n", | |
"FX_MIGRATION_ERRORS 2" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"\n", | |
"release\n", | |
"\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th># of hits in release</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED</th>\n", | |
" <td>879</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>CANVAS_WEBGL_ACCL_FAILURE_ID</th>\n", | |
" <td>396</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>CANVAS_WEBGL_FAILURE_ID</th>\n", | |
" <td>7</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" # of hits in release\n", | |
"DEVTOOLS_JAVASCRIPT_ERROR_DISPLAYED 879\n", | |
"CANVAS_WEBGL_ACCL_FAILURE_ID 396\n", | |
"CANVAS_WEBGL_FAILURE_ID 7" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"for channel in channels:\n", | |
" df = pd.DataFrame([x for _,x in nameCounts[channel]],\n", | |
" [x for x,_ in nameCounts[channel]])\n", | |
" print \"\\n\" + channel + \"\\n\"\n", | |
" df.columns = [\"# of hits in \" + channel]\n", | |
" IPython.display.display(df)" | |
] | |
} | |
], | |
"metadata": { | |
"anaconda-cloud": {}, | |
"kernelspec": { | |
"display_name": "Python [default]", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.12" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# # Find histograms with empty keys | |
# ### Find histograms with empty keys | |
# In[7]: | |
import ujson as json | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import numpy as np | |
import plotly.plotly as py | |
from plotly.graph_objs import * | |
import IPython | |
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client | |
from pprint import pprint | |
get_ipython().magic(u'pylab inline') | |
# In[8]: | |
# Channels to analyse; the same list drives the per-channel report loop below.
channels = ["nightly", "aurora", "beta", "release"]

# Passed to get_pings as `submission_date`; values look like YYYYMMDD strings.
# NOTE(review): presumably an inclusive (start, end) range - confirm against
# the get_pings documentation.
submission_dates = ("20170122", "20170222")

# Sampling fraction handed to get_pings for each channel. Release uses a much
# smaller fraction than the pre-release channels.
fractions = dict(
    nightly=0.1,
    aurora=0.1,
    beta=0.1,
    release=0.003,
)

# channel name -> whatever get_pings returns for that channel.
# `sc` is not defined in this file; presumably it is the SparkContext that the
# notebook environment injects - TODO confirm.
pings = {}
for c in channels:
    pings[c] = get_pings(
        sc,
        app="Firefox",
        channel=c,
        doc_type="main",
        fraction=fractions[c],
        submission_date=submission_dates,
    )
# Now extract the names of all keyed histograms that contain an empty-string key, considering every ping whose payload looks valid.
# In[9]: | |
def get_keyed_histograms(p):
    """Return the keyed-histograms dict stored in ping `p`.

    The ping is only trusted when `p` is a dict, its "payload" entry is a
    dict, and that payload holds a dict under "keyedHistograms". In every
    other case an empty dict is returned instead of raising.
    """
    if not isinstance(p, dict):
        return {}

    payload = p.get("payload")
    if not isinstance(payload, dict):
        return {}

    keyed = payload.get("keyedHistograms")
    if not isinstance(keyed, dict):
        return {}

    return keyed
def extract_affected_histograms(p):
    """Return the names of the keyed histograms in ping `p` that have an empty key.

    The keyed histograms are obtained through get_keyed_histograms(), so a
    ping without a usable payload yields an empty list.

    Only dict-valued entries are inspected. Testing `"" in kh` on anything
    else is misleading or fatal: for a string value it is always true (a
    false hit), and for None or a number it raises TypeError.

    Returns:
        A list of histogram names (possibly empty).
    """
    khs = get_keyed_histograms(p)
    # .items() is used instead of .iteritems() so this runs on Python 2 and 3.
    return [
        name
        for name, kh in khs.items()
        if isinstance(kh, dict) and "" in kh
    ]
# channel name -> result of flat-mapping extract_affected_histograms over that
# channel's pings, i.e. one histogram name per (ping, affected histogram) pair.
extracts = {
    c: ps.flatMap(extract_affected_histograms)
    for c, ps in pings.iteritems()
}
# Let's get sorted lists of the hit counts per channel. | |
# In[10]: | |
# channel name -> list of (histogram name, hit count) pairs, highest count first.
nameCounts = {}
for channel, names in extracts.iteritems():
    ranked = list(names.countByValue().iteritems())
    # Order by the count (second tuple element), largest first.
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    nameCounts[channel] = ranked
# In[12]: | |
# Show one table per channel: histogram names as the index, hit counts as the
# single column.
for channel in channels:
    ranked = nameCounts[channel]
    column = "# of hits in " + channel
    # Build the frame with its column name and index up front. Assigning
    # `df.columns` afterwards fails with a length-mismatch ValueError when a
    # channel has no hits, because the frame then has zero columns.
    df = pd.DataFrame(
        {column: [hits for _, hits in ranked]},
        index=[name for name, _ in ranked],
    )
    # Single parenthesised argument: prints the same text on Python 2 and 3.
    print("\n" + channel + "\n")
    IPython.display.display(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment