georgf · February 24, 2017 11:48
diff --git a/histogram-empty-key-counts.ipynb b/histogram-empty-key-counts.ipynb
diff --git a/histogram-empty-key-counts.py b/histogram-empty-key-counts.py

 # coding: utf-8

 # # Find histograms with empty keys

 # ### Find histograms with empty keys

 # In[7]:

 import ujson as json
 import matplotlib.pyplot as plt
 import pandas as pd
 import numpy as np
 import plotly.plotly as py
 from plotly.graph_objs import *
 import IPython

 from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client
 from pprint import pprint

 get_ipython().magic(u'pylab inline')


 # In[8]:

 # Channels to query, the pair of submission dates passed to get_pings
 # (presumably an inclusive range -- confirm against moztelemetry), and the
 # value passed as get_pings' `fraction` argument for each channel.
 channels = ["nightly", "aurora", "beta", "release"]
 submission_dates = ("20170122", "20170222")
 fractions = {"nightly": 0.1, "aurora": 0.1, "beta": 0.1, "release": 0.003}

 # One get_pings() result per channel, keyed by channel name.
 pings = {
     c: get_pings(sc,
                  app="Firefox",
                  channel=c,
                  doc_type="main",
                  fraction=fractions[c],
                  submission_date=submission_dates)
     for c in channels
 }


 # ... now extract the names of all keyed histograms with empty key strings (from all valid-looking pings).

 # In[9]:

 def get_keyed_histograms(p):
     """Return p["payload"]["keyedHistograms"] when the ping is well-formed.

     An empty dict is returned unless `p`, its "payload" entry and that
     payload's "keyedHistograms" entry are all dicts.
     """
     if not isinstance(p, dict):
         return {}

     payload = p.get("payload")
     if not isinstance(payload, dict):
         return {}

     keyed = payload.get("keyedHistograms")
     if isinstance(keyed, dict):
         return keyed
     return {}

 # This extracts the names of the keyed histograms that have an empty key string.
 def extract_affected_histograms(p):
     """Return the names of keyed histograms in ping `p` that contain a "" key.

     Entries whose value is not a dict are skipped: `"" in value` is True for
     every string and raises TypeError for non-containers such as None or
     numbers, so a malformed entry would otherwise be miscounted or abort the
     whole scan.
     """
     khs = get_keyed_histograms(p)
     # .items() (rather than .iteritems()) works on both Python 2 and 3.
     return [name for name, kh in khs.items()
             if isinstance(kh, dict) and "" in kh]

 # Per channel, flatMap every ping to the histogram names returned by
 # extract_affected_histograms.
 extracts = {
     channel_name: channel_pings.flatMap(extract_affected_histograms)
     for channel_name, channel_pings in pings.iteritems()
 }


 # Let's get sorted lists of the hit counts per channel.

 # In[10]:

 # For each channel, tally the extracted names with countByValue() and order
 # the resulting (name, count) pairs from the highest count to the lowest.
 nameCounts = {
     channel: sorted(names.countByValue().iteritems(),
                     key=lambda pair: pair[1],
                     reverse=True)
     for channel, names in extracts.iteritems()
 }


 # In[12]:

 # Show one table per channel: histogram names as the index and their hit
 # counts as the single column.
 for channel in channels:
     ranked = nameCounts[channel]
     label = "# of hits in " + channel
     # Name the column at construction time instead of assigning df.columns
     # afterwards: for a channel without hits the frame built from two empty
     # lists has zero columns, and assigning a one-element column list to it
     # raises a length-mismatch ValueError.
     df = pd.DataFrame({label: [hits for _, hits in ranked]},
                       index=[name for name, _ in ranked],
                       columns=[label])
     # A parenthesised single argument prints the same text under the
     # Python 2 print statement and the Python 3 print function.
     print("\n" + channel + "\n")
     IPython.display.display(df)

	# coding: utf-8

	# # Find histograms with empty keys

	# ### Find histograms with empty keys

	# In[7]:

	import ujson as json
	import matplotlib.pyplot as plt
	import pandas as pd
	import numpy as np
	import plotly.plotly as py
	from plotly.graph_objs import *
	import IPython

	from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client
	from pprint import pprint

	get_ipython().magic(u'pylab inline')


	# In[8]:

	# Channels to query, the pair of submission dates passed to get_pings
	# (presumably an inclusive range -- confirm against moztelemetry), and the
	# value passed as get_pings' `fraction` argument for each channel.
	channels = ["nightly", "aurora", "beta", "release"]
	submission_dates = ("20170122", "20170222")
	fractions = {
	    "nightly": 0.1,
	    "aurora": 0.1,
	    "beta": 0.1,
	    "release": 0.003
	}
	pings = {}

	# The loop body must be indented under the `for`; this copy of the script
	# had lost that indentation and did not parse.
	for c in channels:
	    pings[c] = get_pings(sc,
	                         app="Firefox",
	                         channel=c,
	                         doc_type="main",
	                         fraction=fractions[c],
	                         submission_date=submission_dates)


	# ... now extract the names of all keyed histograms with empty key strings (from all valid-looking pings).

	# In[9]:

	def get_keyed_histograms(p):
	    """Return p["payload"]["keyedHistograms"] when the ping is well-formed.

	    An empty dict is returned unless `p`, its "payload" entry and that
	    payload's "keyedHistograms" entry are all dicts.
	    """
	    if not isinstance(p, dict):
	        return {}
	    payload = p.get("payload")
	    if not isinstance(payload, dict):
	        return {}
	    keyed = payload.get("keyedHistograms")
	    # Once validated the dict is returned directly; no second .get() chain
	    # with defaults is needed.
	    return keyed if isinstance(keyed, dict) else {}

	# This extracts the names of the keyed histograms that have an empty key string.
	def extract_affected_histograms(p):
	    """Return the names of keyed histograms in ping `p` that contain a "" key.

	    Entries whose value is not a dict are skipped: `"" in value` is True for
	    every string and raises TypeError for non-containers such as None or
	    numbers, so a malformed entry would otherwise be miscounted or abort the
	    whole scan.
	    """
	    khs = get_keyed_histograms(p)
	    # .items() (rather than .iteritems()) works on both Python 2 and 3.
	    return [name for name, kh in khs.items()
	            if isinstance(kh, dict) and "" in kh]

	# Per channel, flatMap every ping to the histogram names returned by
	# extract_affected_histograms.
	extracts = {}

	# The assignment belongs inside the loop; the indentation that expressed
	# this had been stripped from this copy of the script.
	for c, ps in pings.iteritems():
	    extracts[c] = ps.flatMap(extract_affected_histograms)


	# Let's get sorted lists of the hit counts per channel.

	# In[10]:

	# For each channel, tally the extracted names with countByValue() and order
	# the resulting (name, count) pairs from the highest count to the lowest.
	nameCounts = {}
	for channel, names in extracts.iteritems():
	    # Both statements form the loop body; their indentation had been lost.
	    counts = names.countByValue()
	    nameCounts[channel] = sorted(counts.iteritems(), key=lambda t: t[1], reverse=True)


	# In[12]:

	# Show one table per channel: histogram names as the index and their hit
	# counts as the single column.
	for channel in channels:
	    ranked = nameCounts[channel]
	    label = "# of hits in " + channel
	    # Name the column at construction time instead of assigning df.columns
	    # afterwards: for a channel without hits the frame built from two empty
	    # lists has zero columns, and assigning a one-element column list to it
	    # raises a length-mismatch ValueError.
	    df = pd.DataFrame({label: [hits for _, hits in ranked]},
	                      index=[name for name, _ in ranked],
	                      columns=[label])
	    # A parenthesised single argument prints the same text under the
	    # Python 2 print statement and the Python 3 print function.
	    print("\n" + channel + "\n")
	    IPython.display.display(df)