Last active
February 24, 2017 11:48
-
-
Save georgf/94ca77fe6174ec07077504b24379932a to your computer and use it in GitHub Desktop.
histogram-empty-key-counts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# # Find histograms with empty keys | |
# ### Find histograms with empty keys | |
# In[7]: | |
import ujson as json | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import numpy as np | |
import plotly.plotly as py | |
from plotly.graph_objs import * | |
import IPython | |
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client | |
from pprint import pprint | |
get_ipython().magic(u'pylab inline') | |
# In[8]: | |
# Channels to inspect; this list also fixes the order in which results are
# reported further down.
channels = ["nightly", "aurora", "beta", "release"]

# Submission-date range handed to get_pings (strings that look like YYYYMMDD).
submission_dates = ("20170122", "20170222")

# Fraction of pings requested per channel. Release uses a much smaller
# fraction than the other channels.
fractions = dict(nightly=0.1, aurora=0.1, beta=0.1, release=0.003)

# Map each channel name to the "main" Firefox pings fetched for it.
# NOTE(review): `sc` is not defined in this file -- presumably the
# SparkContext injected by the notebook environment; confirm.
pings = {
    channel: get_pings(sc,
                       app="Firefox",
                       channel=channel,
                       doc_type="main",
                       fraction=fractions[channel],
                       submission_date=submission_dates)
    for channel in channels
}
# ... now extract the names of all keyed histograms with empty key strings (from all valid-looking pings). | |
# In[9]: | |
def get_keyed_histograms(p):
    """Return the ``payload.keyedHistograms`` dict of ping *p*, or ``{}``.

    An empty dict is returned whenever the ping does not have the expected
    shape, i.e. when *p* is not a dict, when ``payload`` is missing or not a
    dict, or when ``keyedHistograms`` is missing or not a dict.
    """
    if not isinstance(p, dict):
        return {}

    # A missing key yields None, which fails the isinstance check just like
    # any other non-dict value, so one lookup per level is enough.
    payload = p.get("payload")
    if not isinstance(payload, dict):
        return {}

    keyed_histograms = payload.get("keyedHistograms")
    if not isinstance(keyed_histograms, dict):
        return {}

    return keyed_histograms
# This extracts the names of the keyed histograms that have an empty key string.
def extract_affected_histograms(p):
    """Return the names of the keyed histograms in ping *p* with an empty key.

    The keyed histograms are obtained through ``get_keyed_histograms``, so a
    ping without a usable ``payload.keyedHistograms`` dict yields ``[]``.
    Entries whose value is not a dict are skipped.
    """
    khs = get_keyed_histograms(p)
    # Only dict-valued entries are inspected: `"" in kh` is True for *any*
    # string value (a false positive) and raises TypeError for values such as
    # None or numbers, which would abort the whole job on one malformed ping.
    # `.items()` (rather than `.iteritems()`) works on Python 2 and Python 3.
    return [
        name
        for name, kh in khs.items()
        if isinstance(kh, dict) and "" in kh
    ]
# For every channel, map each ping to the names of its affected keyed
# histograms and flatten the per-ping lists into one collection of names.
# `.items()` (rather than `.iteritems()`) works on Python 2 and Python 3.
# NOTE(review): `ps` is whatever get_pings returned -- presumably a Spark
# RDD, given the flatMap call; confirm.
extracts = {}
for c, ps in pings.items():
    extracts[c] = ps.flatMap(extract_affected_histograms)
# Let's get sorted lists of the hit counts per channel. | |
# In[10]: | |
# Per channel: a list of (histogram name, hit count) pairs, sorted by hit
# count in descending order.
# `.items()` (rather than `.iteritems()`) works on Python 2 and Python 3.
nameCounts = {}
for channel, names in extracts.items():
    # countByValue() is expected to return a mapping of name -> occurrences.
    counts = names.countByValue()
    nameCounts[channel] = sorted(counts.items(),
                                 key=lambda t: t[1],
                                 reverse=True)
# In[12]: | |
# Show one table per channel: histogram names as the index, hit counts as the
# single column.
for channel in channels:
    hits = nameCounts[channel]
    column = "# of hits in " + channel
    # Name the column at construction time. Building the frame from bare
    # lists and assigning `df.columns` afterwards raises ValueError (length
    # mismatch) when a channel has no hits, because the empty frame has zero
    # columns.
    df = pd.DataFrame({column: [count for _, count in hits]},
                      index=[name for name, _ in hits])
    # Parenthesised single-argument form prints the same text on Python 2
    # and Python 3.
    print("\n" + channel + "\n")
    IPython.display.display(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment