Last active
January 17, 2016 19:16
-
-
Save vitillo/f9934aebfa6d10e81221 to your computer and use it in GitHub Desktop.
Longitudinal dataset
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Populating the interactive namespace from numpy and matplotlib\n" | |
] | |
} | |
], | |
"source": [ | |
"import ujson as json\n", | |
"import matplotlib.pyplot as plt\n", | |
"import pandas as pd\n", | |
"import numpy as np\n", | |
"\n", | |
"from __future__ import division\n", | |
"\n", | |
"%pylab inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"dataset = sqlContext.read.load(\"s3://telemetry-parquet/longitudinal/generationDate=20160115/\", \"parquet\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"[1.452681541917e+18,\n", | |
" 1.452697109909e+18,\n", | |
" 1.45270390062e+18,\n", | |
" 1.4527093524939999e+18,\n", | |
" 1.452711714352e+18,\n", | |
" 1.4527228511e+18,\n", | |
" 1.4528533089189998e+18,\n", | |
" 1.4528586874890002e+18,\n", | |
" 1.45285928812e+18,\n", | |
" 1.4528702633410002e+18]" | |
] | |
}, | |
"execution_count": 11, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"dataset.first().creationTimestamp[-10:]" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"How many clients are there?" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 4 ms, sys: 8 ms, total: 12 ms\n", | |
"Wall time: 42.6 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"2952150" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time dataset.count()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"OS distribution" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 28 ms, sys: 28 ms, total: 56 ms\n", | |
"Wall time: 54.9 s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"[Row(os=u'Windows_95', count=73),\n", | |
" Row(os=u'Windows_98', count=30),\n", | |
" Row(os=u'Linux', count=5612),\n", | |
" Row(os=u'Darwin', count=169001),\n", | |
" Row(os=u'Windows_NT', count=2777434)]" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time dataset.select(\"os\").groupBy(\"os\").count().collect()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Distribution of the number of fragments per client:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 244 ms, sys: 132 ms, total: 376 ms\n", | |
"Wall time: 53.4 s\n" | |
] | |
} | |
], | |
"source": [ | |
"%time histogram = dataset.select(\"creationTimestamp\").map(lambda x: len(x.creationTimestamp)).histogram(range(0, 500, 10))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"<matplotlib.axes._subplots.AxesSubplot at 0x7fa2d52980d0>" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
}, | |
{ | |
"data": { | |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAA2IAAAHnCAYAAADaTe7QAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAHxBJREFUeJzt3X+QdXddH/D3JzygBCwhBpIIsWGsCJbUYCu0FZtFEULr\nYBgqrY5ORIc64wj4Y0bAWvOgVSmK0k4rMy2BiShICoKJI5Bgs4W2En40gQSCqBAgSB7LABGqMwX5\n9o9znmSfzT67Z++e/e7du6/XzJnn7r3nvu93957n3vO+58et1loAAADo54yDHgAAAMBRo4gBAAB0\npogBAAB0pogBAAB0pogBAAB0pogBAAB0tm0Rq6qvrKobq+rmqrq1qo6P159dVddX1Yer6rqqOqvL\naAEAAFZA7fQ9YlV1Zmvtr6rqWJL/keR5SZ6R5NOttZdU1fOTPLi19oL9Hy4AAMDht+Ouia21vxov\n3i/JfZO0JE9LctV4/VVJLtuX0QEAAKygHYtYVZ1RVTcnOZHkutbau5Kc21o7Mc5yIsm5+zhGAACA\nlXJspxlaa19OcnFVPSjJG6vqMZtub1V1r/0bt7oOAADgKGmt1VbXTz5rYmvtriQ3JHlKkhNVdV6S\nVNX5Sf7iNPfZdrriiit2nEfG4cxYprHIkHEYMpZpLDJkHIaMZRqLDBmHIWOZxnKUMraz01kTzzl5\nRsSqun+S70xyW5Jrklw+znZ5kjdNLXQAAABH3U67Jp6f5Kqquk+G0va61tofVNU7k1xdVT+c5PYk\nz9zfYQIAAKyObYtYa+2WJN+8xfWfSfKkvT742traXiNkLGnGXDkyZByVjLlyZMg4Khlz5ciQcVQy\n5sqRMV/Gjt8jtnBwVduvbAAAgGVXVWl7PVkHAAAA81DEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPE\nAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAA\nOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPE\nAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAA\nOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPE\nAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOlPEAAAAOjvW88Gq\nasd5WmsdRgIAAHBwuhaxwXZFa+eiBgAAcNjZNREAAKAzRQwAAKAzRQwAAKAzRQwAAKAzRQwAAKAz\nRQwAAKAzRQwAAKAzRQwAAKAzRQwAAKAzRQwAAKAzRQwAAKCzbYtYVV1QVTdU1Qeq6taqeu54/fGq\nuqOqbhqnS/sMFwAA4PCr1trpb6w6L8l5rbWbq+qBSd6b5LIkz0zy+dbar21z37Y5u6qSnP7xksp2\n4wEAADgsqiqttdrqtmPb3bG1dmeSO8fLX6iq25I87GTurKMEAAA4IiYfI1ZVFyZ5bJJ3jlc9p6re\nV1VXVtVZ+zA2AACAlbTtFrGTxt0SX5/keeOWsZcn+fnx5l9I8tIkP7z5fsePH7/78tra2h6HCgAA\nsLzW19ezvr4+ad5tjxFLkqq6b5LfT/Lm1trLtrj9wiTXttYu2nS9Y8QAAIAja7tjxHY6a2IluTLJ\nBzeWsKo6f8NsT09yyxwDBQAAOAp2OmviE5K8Pcn7c8+mrJ9J8r1JLh6v+2iSH2mtndh0X1vEAACA\nI2u7LWI77pq4hwdVxAAAgCNr4V0TAQAAmJ8iBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0Jki\nBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA
0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA\n0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0Jki\nBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA\n0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0Jki\nBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA\n0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0JkiBgAA0Jki\nBgAA0JkiBgAA0Nm2RayqLqiqG6rqA1V1a1U9d7z+7Kq6vqo+XFXXVdVZfYYLAABw+FVr7fQ3Vp2X\n5LzW2s1V9cAk701yWZJnJfl0a+0lVfX8JA9urb1g033b5uyqSnL6x0sq240HAADgsKiqtNZqq9u2\n3SLWWruztXbzePkLSW5L8rAkT0ty1TjbVRnKGQAAABNMPkasqi5M8tgkNyY5t7V2YrzpRJJzZx8Z\nAADAijo2ZaZxt8Q3JHlea+3zwy6Gg9Zaq6ot9yc8fvz43ZfX1tb2Mk4AAICltr6+nvX19UnzbnuM\nWJJU1X2T/H6SN7fWXjZe96Eka621O6vq/CQ3tNYetel+jhEDAACOrIWPEauhOV2Z5IMnS9jomiSX\nj5cvT/KmOQYKAABwFOx01sQnJHl7kvfnnk1ZL0zyriRXJ/naJLcneWZr7XOb7muLGAAAcGRtt0Vs\nx10T9/CgihgAAHBkLbxrIgAAAPNTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpT\nxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAA\nADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpT\nxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAA\nADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpT\nxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAA\nADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpTxAAAADpT\nxAAAADrbsYhV1Sur6kRV3bLhuuNVdUdV3TROl+7vMAEAAFbHlC1ir0qyuWi1JL/WWnvsOL1l/qEB\nAACsph2LWGvtHUk+u8VNNf9wAAAAVt9ejhF7TlW9r6qurKqzZhsRAADAilu0iL08ySOSXJzkU0le\nOtuIAAAAVtyxRe7UWvuLk5er6hVJrt1qvuPHj999eW1tbZGHAgAAOBTW19ezvr4+ad5qre08U9WF\nSa5trV00/nx+a+1T4+WfSPItrbXv23Sftjm7qjKc5+O0j5Qp4wEAAFh2VZXW2pbn1thxi1hVvTbJ\nJUnOqapPJLkiyVpVXZyhVX00yY/MOF4AAICVNmmL2ELBtogBAABH2HZbxPZy1kQAAAAWoIgBAAB0\npogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogB\nAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0\npogBAAB0
pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogB\nAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0pogBAAB0duygB7Bb\nVbXjPK21DiMBAABYzKErYoPtitbORQ0AAOAg2TURAACgM0UMAACgM0UMAACgM0UMAACgM0UMAACg\nM0UMAACgM0UMAACgM0UMAACgM0UMAACgM0UMAACgM0UMAACgM0UMAACgM0UMAACgM0UMAACgM0UM\nAACgM0UMAACgM0UMAACgM0UMAACgM0UMAACgM0UMAACgM0UMAACgM0UMAACgM0UMAACgM0UMAACg\nM0UMAACgsx2LWFW9sqpOVNUtG647u6qur6oPV9V1VXXW/g4TAABgdUzZIvaqJJduuu4FSa5vrT0y\nyR+OPwMAADDBjkWstfaOJJ/ddPXTklw1Xr4qyWUzjwsAAGBlLXqM2LmttRPj5RNJzp1pPAAAACvv\n2F4DWmutqtpWtx0/fvzuy2tra3t9KAAAgKW1vr6e9fX1SfNWa1t2qFNnqrowybWttYvGnz+UZK21\ndmdVnZ/khtbaozbdp23Orqok2z1eZafxzJEBAACw36oqrbXa6rZFd028Jsnl4+XLk7xpwRwAAIAj\nZ8ctYlX12iSXJDknw/FgP5fk95JcneRrk9ye5Jmttc9tup8tYgAAwJG13RaxSbsmLvigihgAAHBk\n7ceuiQAAACxIEQMAAOhMEQMAAOhMEQMAAOhMEQMAAOhMEQMAAOhMEQMAAOhMEQMAAOhMEQMAAOhM\nEQMAAOhMEQMAAOhMEQMAAOhMEQMAAOhMEQMAAOhMEQMAAOhMEQMAAOhMEQMAAOhMEQMAAOhMEQMA\nAOhMEQMAAOhMEQMAAOhMEQMAAOhMEQMAAOhMEQMAAOjs2EEP4CBU1Y7ztNY6jAQAADiKjmQRG2xX\ntHYuagAAAIuyayIAAEBnihgAAEBnihgAAEBnihgAAEBnihgAAEBnihgAAEBnihgAAEBnihgAAEBn\nihgAAEBnihgAAEBnihgAAEBnihgAAEBnihgAAEBnihgAAEBnihgAAEBnihgAAEBnihgAAEBnihgA\nAEBnihgAAEBnihgAAEBnihgAAEBnihgAAEBnihgAAEBnihgAAEBnxw56AIdVVU2ar7W2zyMBAAAO\nG0VsT3YqWdPKGgAAcLTYNREAAKAzRQwAAKAzRQwAAKAzRQwAAKAzRQwAAKAzRQwAAKAzRQwAAKAz\nRQwAAKAzRQwAAKCzYwc9gKOsqibN11rb55EAAAA9KWIHbqeSNa2sAQAAh4ddEwEAADpTxAAAADrb\n066JVXV7kr9M8jdJvthae9wcgwIAAFhlez1GrCVZa619Zo7BAAAAHAVz7JrobBIAAAC7sNci1pK8\nrareU1XPnmNA7E5VTZoAAIDlsdddE7+1tfapqnpIkuur6kOttXecvPH48eN3z7i2trbHh+L0nAIf\nAAAO2vr6etbX1yfNW3N9WXBVXZHkC621l44/t83Zw5aZ7R6vdvzy4sOTsXPOsmQAAADzq6q01rbc\nKrLwrolVdWZVfdV4+QFJnpzklkXzAAAAjoq97Jp4bpI3jscfHUvy262162YZFQAAwAqbbdfEewXb\nNXHHnGXJAAAA5rcvuyYCAACwGEUMAACgs72evp4VMPV7xuzeCAAA81DEGPkuMgAA6MWuiQAAAJ0p\nYgAAAJ0pYgAAAJ0pYgAAAJ0pYgAAAJ0pYgAAAJ05fT2z8F1kAAAwnSLGjHwXGQAATGHXRAAAgM4U\nMQAAgM4UMQAAgM4UMQAAgM4UMQAAgM4UMQAAgM4UMQAAgM58jxhLw5dCAwBwVChiLBlfCg0AwOqz\nayIAAEBnihgAAEBnihgAAEBnihgAAEBnihgAAEBnihgAAEBnihgAAEBnvk
eMleJLoQEAOAwUMVaQ\nL4UGAGC52TURAACgM0UMAACgM0UMAACgM0UMAACgMyfrgC1MOfuiMy8CALAoRQxOa7ui5cyLAAAs\nzq6JAAAAnSliAAAAnSliAAAAnTlGDPaJE34AAHA6ihjsKyf8AADg3uyaCAAA0JkiBgAA0JldE2GJ\nOc4MAGA1KWKw9BxnBgCwauyaCAAA0JktYrDi7N4IALB8FDE4EuzeCACwTOyaCAAA0JktYsCO7N4I\nADAvRQyYyO6NAABzUcSALqZsVUtsWQMAjgZFDOhop5JlyxoAcDQ4WQcAAEBntogBh4bdGwGAVaGI\nAYfM3nZvVOYAgGWgiAFHkGPVAICDpYgB7JKtagDAXiliAAuxiyQAsDhFDODAKHMAcFQpYgCHmjIH\nAIeRIgZw5O395CVTCp0yBwD3UMQAmMl2RUuZA4CNFDEAlsjBlzmFEIAeFDEAVszeytwcGQohADtR\nxABgXxyOQphsX+ic0AVgf5yx6B2r6tKq+lBV/UlVPX+xlPVFH17G0mfMlSNDxlHJmCtHhozN2obp\nhk0/Ty1Pe8uoqknTYcjYyvr6+q7vI0NG74y5cmTMl7FQEauq+yT5j0kuTfKNSb63qh69+6T1RR5e\nxqHImCtHhoyjkjFXjgwZy5qxsbhdkb0Xwq1y9j9jq+L2xCc+cddlTsb2DnoFeRUz5sqRMV/GolvE\nHpfkT1trt7fWvpjkd5J898KjAAA4NOYohJtzjnbG5uL2ohe9aM+FcL8ydrvldVkyFvmbzJHB6S1a\nxB6W5BMbfr5jvA4AABZw8IVw54xFtrwuS8bybEU+qEK4LBl3Zy1ycG1VPSPJpa21Z48/f3+Sx7fW\nnrNhnt0HAwAArJDW2pbtbNGzJn4yyQUbfr4gw1axHR8QAADgqFt018T3JPn6qrqwqu6X5F8kuWa+\nYQEAAKyuhbaItda+VFU/luStSe6T5MrW2m2zjgwAAGBFLXSMGAAAAItb9BixXavhe8a+O/ecXfGO\nJNfYkgawmKo6I8PXiTwsw6mrPpnkXW0Xn7DJkHEYMpZpLDJWNwN667JFrKqen+R7M3zf2MmTelyQ\n4diy17XWfnlCxllJXpDksiTnZvhP9hdJ3pTkxa21z8non7FMY5nx91mKNwQZq5sxR05VPTnJbyT5\n09zzuvrwJF+f5Edba2+VIWMVMpZpLDJWN2PMWYr3iGV5n5GxPxmnaK3t+5TkT5Lcd4vr75fhi6Gn\nZFyX5PlJzss9BfL8DCve18k4mIxlGstMGU/O8EL+liSvGKe3JPmzJE+RIWOvGTOO5UNJLtzi+kck\n+ZAMGauSsUxjkbHSGUvxHjFHxjKNRcYOmYvcadcPcvr/IBcm+eOJGR9e5DYZ+5uxTGOZKWNZ3hBk\nrGjGjGOZ4wMuGTKWPmOZxiJjpTOW4j1ijoxlGouM7adex4j9eJK3VdWfJvnEeN0FGTYZ/9jEjI9V\n1U8nuaq1diJJquq8JJcn+biMA8tYprHMkXGfDJuZN/tkph9TKUNGj5xXJnl3Vb02p+7y/S/H22TI\nWJWMZRqLjNXNWJb3iGV6n5Exf8Ypup01saruk3vvU/me1tqXJt7/7Ay7mD0tw/E/SXIiw/eXvbi1\n9hkZs2TcmeTaqRnLNJaZxvHCDMcubvVifnVr7ZdkyNhLxsw535jhJEhfM171yQwnQfrglPvLkHFY\nMpZpLDJWM2NZ3iOW6X1GxvwZ98rsVcT2U1U9q7X2qonzPjpDGbyxtfb5Dddf2lp7y8SMxydprbV3\nVdVjkjwlyW2ttT9YYPgnM1/dWvuBPdz/2zIU3Vtaa9dNvM/jM2xKvauqHpChxHxzkg8k+cXW2l0T\nc56b5I2ttU/sOPPpM74iw4L8562166vq+5P8oyQfTPKfW2tfnJjzdUmekeEg3b9J8uEkr5n6u4wZ\nB/6GIGO1M+bM4fSq6tyTW8cPeBzntN
Y+fdDjYHlZVpfDsrxHLNP7jIz5M06xyP6MyzYl+cTE+Z6b\n5I8znEXvY0ku23DbTRMzjid5Z5L3JvnlJP8tyb9J8vYkPzsx49oMW2mu3TD935PXT8x414bLz05y\nc5IrkvzPJC+cmPHBJMfGy/8lycuSPGH8HX93F3//u5J8Ksk7kvxokocs8By+Jsnrxr/Bq5O8MckP\nJLkqw66GUzKel+T6JD+b5H9lOIPSLya5LckTD3o5XZUpybkHPYZxHOcc9BgO+Pc/K8mLM+yz/tkk\nnxkvvzjJWTPkv3nifA8aH/O3knzfptt+Y2LG+UlenuQ/Jfnq8TXoliRXJzl/YsbZm6avTnL7yZ8n\nZjx109/3ynEcr5m63Cf5dydfA5P8gyQfyXBw98eTrE3MuGl8Hfu6PTx/35LkhvF5uWB8bbwrybuT\nPHZixlcl+fkMH879ZZJPJ7kxyQ8uy3JqWbWszrWsmiY/X9YB5vw9DnoAu/iD37LN9P8mZtya5IHj\n5QszlKkfH3+eWsRuzbAf6JlJPp/kQeP198+wNWpKxk1JfjvJE5NckmQtQ5G5JMklUzM2XH7PhhfT\nByS5dWLGbRsu/+9Nt71vF8/NTUnOyHA2mVcm+T8ZziJzeZKvmvr8jv8ey3DK+ZMFsXbxd701yX3G\ny2cm+e/j5a9NcvPEDCu3p2ZYYTg1Y88rDGPOHCu4c5wl9JtPM/39JHdOzPjdcVl9eoYPUt6Q5CtP\n/s0nZrw1yXOSvHBcNl4w/r99TpLfm5jx5SQf3TR9cfz3I1OXkQ2Xr0zybzO8V/xEkjdNzLh1w+X1\nJN8yXn5kkvdOzPhokl8dl+93j4//NbtcVt+d5KkZvjrmjiTfk+H19DuS/NHEjGuSPGtc1n8yyc+N\nv8dvJvmlXsupZdWy2mlZXZn3/3Fe6wCnZizNOsApmXtdsHpNGY71eez4QrN5+vOJGR/Y9PMDM7yo\n/nqmr6jfvNXl8edJ5SXDwX4/meRtJ5+4JB/d5d/j/Rv+U9x0ujHukPH6JD80Xn5VTn0RfvcuxrL5\n8e+XYbPt7yT59MSMW5N8RZIHZyi4Xz1ef/9sKIw7ZNySe95Uz85wDOKWz/02GVZuT82wwnBqxp5X\nGMacOVYa5jhL6N9keFPZavrriRnv2/Tzv86wZf6cXSyrG5eRj2+Xv03GT2X4AOjvbXzOd/n8bhzH\n+06+BuxyHLdlPHtbknduum3yh3Xjv5Xkn2RYmbpzfF7+1Qx/06nvEe/f9PN7xn/PSMczHltWLaud\nltWVef8f57UOcGrG0qwDnJK5yJ0OYsqwpeXbTnPbaydm3JDk4k3X3TfDys+XJ2bcmOTM8fIZG64/\nK5u2Kk3IeniS/5rh04pJu1duuO/tG/5TfCTjpxsZPmnfzdafq8b737jhP9fbk3zTLsZy2heGJA+Y\nmPET4zg+nmEXwz/M8P0MtyY5PjHjeeOLzSsy7IJ6smQ+NMnbJ2ZYuT11PisM0/+mk/7fjfPOsdJw\nfZKfzoZPFDOsQDw/ydsmZnwgySNPc9vUXb5vy4bXwvG6HxyzPzYx430bLv/iIs/vOO8FGV5Tfz3J\n31pgWb0jQzH+qQyvsRuX1fdPzHjO+Nx8e4ZPoP99hr0dXpTk1btdzjZcdyzJpUleNTHjnRmOX35m\nhl3xnz5ef0k2fEi1Q8YfZXzfzfDh2ls33NZtObWsWlY7Lasr8/6/+bmJdYC7x5ElWAc45X6L3Omw\nTuML33lbXF9JnjAx4ytPc/05SS5acFzflYmfgk/IOjPJI3Z5nwcluTjD5t57/X0m3P8bZhr7w5I8\nbLz84AyfNDxulxmPSfLPkzxqwTFYub13jhWGe+bf8wrDOP8cKw1nJ3lJ7tmN5rPj5Zdk+i4j33O6\n/yvZcAztDhm/kuQ7t7j+0iR/MjHjF7LFbswZvuLk9btZ3jb8TW9McmKX9zue4Vjbk9NDx+vPT/Kb\nu8
h5YoZjXm/K8OHQm5P8SLb4nqPT3P91u/2dt8i4OMMn/G9J8qgk/yHJ5zIcG/ytEzO+KcMnwJ/L\nsCL4DeP1D0ny3F7L6YRl9elLsKz+nSVZVs+zrN5rWX3exIyVef8f72Md4NSMpVkHOCVzkTuZTKs6\nxcrtdnnLsHJ7dRZfYfidGZaPPa8wjDl7XsEd5390kidtfp6TXLrLjO/IePzshuufepgzMnwoddGC\nGU+aYRz79bzsNmO/fpfdZDw+44dqGT4s+6kk/3Tq/TflnNwl6e8ukrOkGRdlOHZlkYzHzTCOOTLm\n+HtsHsc/6/m7ZP/f/7t9YDDOv2zrACf/3bgOMKlEjfPvdR1g7g8NHp0F1wFOydzroEymozIledYM\nGT90mDNy6srtsvw95hjHUmTs5m+Sec4CK0PGfmcczx7PNDxXjgwZu52yi6KwTcbkDxz3M2OOnAzr\nALsucfv0N1mW52ZP41iJ7xGDHqrqE621C2TI2K+M3eRU1a1J/mFr7QtVdWGGA7pf3Vp7WVXd1Fp7\n7AIZr0/yWzJkzJxxcYaTOJ1I8vA2fHfl/TN8DctFO2XMlSNDxg4Z1yZpGQ5XOenbM5S61lp72mHJ\nWKaxyNjesd3eAVZZVd2yzc3nyjjFQ1doHN1+lxlzqrX2hSRprd1eVZckeUNV/e2c+iaxm4w1GTJm\nzvhSa+1LSb5UVX/WWrtrzPvrqvryxIy5cmTI2M7DM+xi9ooMZwusDMfO/+rE+/fImPr/bq6cZfl9\nDsNzs5g2w6ZOk2lVpszzNQkyZPQYyw3Z+1lgZcjY74xZzjQ8R44MGTtkzPHVQkuRsUxjkbFD5l7u\nbDKt2pR5viZBhoweY5njLLAyZOx3xixnGp4jR4aMiVkLf7XQsmUs01hkbD05RgwAADaoqu9K8o9b\naz9z2DOWaSwyNmUoYgAAAH2dcdADAAAAOGoUMQAAgM4UMQAAgM4UMQAAgM7+P/5r3IF0VVzKAAAA\nAElFTkSuQmCC\n", | |
"text/plain": [ | |
"<matplotlib.figure.Figure at 0x7fa2fa0779d0>" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"pd.Series(100*np.array(histogram[1])/np.sum(histogram[1]), index=histogram[0][:-1]).plot(kind=\"bar\", figsize=(15, 8))" | |
] | |
}, | |
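{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"JSON + Python = :( Decoding the JSON-encoded histograms in Python is slow: the 10% sample below takes about 20 minutes." | |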
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CPU times: user 364 ms, sys: 172 ms, total: 536 ms\n", | |
"Wall time: 20min 11s\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/plain": [ | |
"295397" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"%time dataset.select(\"histograms\").sample(True, 0.1).map(lambda xs: len([json.loads(x) for x in xs.histograms if x])).count()" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Total number of clients in the same time period:" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"val = sqlContext.read.load(\"s3://telemetry-parquet/executive-stream/telemetry-executive-summary-3/\", \"parquet\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"collapsed": false | |
}, | |
"outputs": [], | |
"source": [ | |
"val.filter((val.submissionDateS3 >= \"20151115\") & \n", | |
" (val.submissionDateS3 <= \"20160115\") & \n", | |
" (val.channelS3 == \"release\")).select(\"clientId\").groupBy(\"clientId\").count().count()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 21, | |
"metadata": { | |
"collapsed": true | |
}, | |
"outputs": [], | |
"source": [ | |
"# Result: 274367176 (the query above was run on a bigger cluster)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 2", | |
"language": "python", | |
"name": "python2" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 2 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython2", | |
"version": "2.7.10" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 0 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment