Last active
May 8, 2018 20:02
-
-
Save georgf/372b442487e1752081c75314d276203d to your computer and use it in GitHub Desktop.
Current histogram & scalar payload sizes
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8 | |
# In[1]: | |
import datetime as dt | |
import ujson as json | |
import pandas as pd | |
import numpy as np | |
import copy as cp | |
import matplotlib.pyplot as plt | |
from moztelemetry import get_pings, get_pings_properties, get_one_ping_per_client | |
from moztelemetry.dataset import Dataset | |
get_ipython().magic(u'matplotlib inline') | |
# # Load ping data
# Get a bunch of pings from the last nightly & release Firefox.
# In[2]:
dataset = Dataset.from_source('telemetry')
# In[3]:
# Displayed by the notebook so the available filter dimensions can be checked.
dataset.schema
# In[4]:
# Nightly: "main" pings whose submissionDate starts with '201804' and whose
# appVersion starts with '61.'; `sc` is the SparkContext supplied by the
# notebook environment. sample=0.1 is presumably a 10% sample -- confirm
# against the moztelemetry Dataset.records documentation.
records_nightly = (
    Dataset.from_source('telemetry')
    .where(docType='main')
    .where(appUpdateChannel='nightly')
    .where(submissionDate=lambda day: day.startswith('201804'))
    .where(appVersion=lambda version: version.startswith('61.'))
    .records(sc, sample=0.1)
)
records_nightly.count()
# In[5]:
# Release: same filters, but for appVersion '59.' on the release channel and
# with a smaller sample value (0.01).
records_release = (
    Dataset.from_source('telemetry')
    .where(docType='main')
    .where(appUpdateChannel='release')
    .where(submissionDate=lambda day: day.startswith('201804'))
    .where(appVersion=lambda version: version.startswith('59.'))
    .records(sc, sample=0.01)
)
records_release.count()
# # Helper functions | |
# Define the functions to extract the fields from each ping and map them to their json length in bytes. | |
# In[6]: | |
def get_from_ping(ping, path):
    """Return the value stored at a "/"-separated key path inside a ping.

    Args:
        ping: Nested container (typically dicts) to walk.
        path: Keys joined by "/", e.g. "payload/processes/parent/scalars".
            Every segment is used as a string key.

    Returns:
        The value found at the end of the path, or None when the path cannot
        be followed: a key is missing, or an intermediate node is None or
        some other value that cannot be indexed with the segment.
    """
    node = ping
    try:
        # Explicit loop instead of `reduce`, which is only a builtin on
        # Python 2.
        for key in path.split("/"):
            node = node[key]
    except (KeyError, IndexError, TypeError):
        # TypeError covers a null/scalar intermediate section (for example
        # "processes": null); without it one such ping raises out of the
        # whole extraction instead of being treated as "field absent".
        return None
    return node
def extract_fields_size(ping):
    """Measure the serialized size of each histogram/scalar section of a ping.

    Args:
        ping: A ping as a nested dict; it is only read, never modified.

    Returns:
        A list of (field_path, size) tuples, where size is the length of the
        section serialized with this file's `json` module (imported from
        ujson). Sections that are missing or falsy (e.g. an empty dict) are
        left out.
    """
    field_list = (
        "payload/histograms",
        "payload/keyedHistograms",
        "payload/processes/content/histograms",
        "payload/processes/content/keyedHistograms",
        "payload/processes/parent/scalars",
        "payload/processes/parent/keyedScalars",
        "payload/processes/content/scalars",
        "payload/processes/content/keyedScalars",
    )
    # Build a tuple (field_name, json_field_size) for each field.
    # The ping is read directly: neither the lookup nor json.dumps mutates
    # it, so deep-copying every ping was pure overhead. Each section is also
    # looked up once rather than twice.
    tuples = []
    for field in field_list:
        section = get_from_ping(ping, field)
        if section:
            tuples.append((field, len(json.dumps(section))))
    return tuples
def get_payload_size(ping):
    """Return the summed size of all measured sections of a ping.

    Adds up the size component of every (field, size) tuple produced by
    extract_fields_size(ping); the result is 0 when no section is reported.
    """
    total = 0
    for _field, size in extract_fields_size(ping):
        total += size
    return total
# Spot-check the helpers on the first record of each dataset before running
# them over the full samples.
# In[7]:
# Per-field sizes for one Nightly ping.
extract_fields_size(ping=records_nightly.first())
# In[8]:
# Combined size for one Nightly ping.
get_payload_size(ping=records_nightly.first())
# In[9]:
# Per-field sizes for one Release ping.
extract_fields_size(ping=records_release.first())
# In[10]:
# Combined size for one Release ping.
get_payload_size(ping=records_release.first())
# # Get Nightly size data
# In[12]:
# One combined histogram & scalar size per sampled Nightly ping, collected to
# the driver and wrapped in a pandas Series.
sizes_nightly = pd.Series(
    records_nightly.map(get_payload_size).collect()
)
# In[13]:
# Summary statistics after dividing the sizes by 1024, with extra upper
# percentiles to expose the tail.
(sizes_nightly / 1024).describe(
    percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999],
)
# In[26]:
# Distribution of the same scaled sizes.
(sizes_nightly / 1024).hist()
plt.xlabel('Size in kb')
plt.ylabel('Frequency')
plt.title('Nightly histogram & scalar payload size')
# # Get Release size data
# In[15]:
# One combined histogram & scalar size per sampled Release ping, collected to
# the driver and wrapped in a pandas Series.
sizes_release = pd.Series(
    records_release.map(get_payload_size).collect()
)
# In[16]:
# Summary statistics after dividing the sizes by 1024, with extra upper
# percentiles to expose the tail.
(sizes_release / 1024).describe(
    percentiles=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99, 0.999],
)
# In[25]:
# Distribution of the same scaled sizes.
(sizes_release / 1024).hist()
plt.xlabel('Size in kb')
plt.ylabel('Frequency')
plt.title('Release histogram & scalar payload size')
# In[ ]: | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment