Created
May 24, 2020 10:44
-
-
Save louisguitton/cfe2218d6109b45915dcb87580322882 to your computer and use it in GitHub Desktop.
data models for https://github.com/raphodn/know-your-planet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import json\n", | |
| "import pandas as pd\n", | |
| "from sqlalchemy import create_engine" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "with open('api.questionstat - 2020-04-09.json') as fh:\n", | |
| " data = json.loads(fh.read())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>model</th>\n", | |
| " <th>pk</th>\n", | |
| " <th>fields.question</th>\n", | |
| " <th>fields.answer_choice</th>\n", | |
| " <th>fields.created</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>api.questionstat</td>\n", | |
| " <td>1730</td>\n", | |
| " <td>62</td>\n", | |
| " <td>b</td>\n", | |
| " <td>2020-04-07 18:26:31.137000+00:00</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>api.questionstat</td>\n", | |
| " <td>1731</td>\n", | |
| " <td>44</td>\n", | |
| " <td>c</td>\n", | |
| " <td>2020-04-07 18:37:05.718000+00:00</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>api.questionstat</td>\n", | |
| " <td>1732</td>\n", | |
| " <td>4</td>\n", | |
| " <td>d</td>\n", | |
| " <td>2020-04-07 18:37:18.488000+00:00</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>api.questionstat</td>\n", | |
| " <td>1733</td>\n", | |
| " <td>27</td>\n", | |
| " <td>d</td>\n", | |
| " <td>2020-04-07 18:38:00.057000+00:00</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>api.questionstat</td>\n", | |
| " <td>1734</td>\n", | |
| " <td>9</td>\n", | |
| " <td>b</td>\n", | |
| " <td>2020-04-07 18:50:10.325000+00:00</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " model pk fields.question fields.answer_choice \\\n", | |
| "0 api.questionstat 1730 62 b \n", | |
| "1 api.questionstat 1731 44 c \n", | |
| "2 api.questionstat 1732 4 d \n", | |
| "3 api.questionstat 1733 27 d \n", | |
| "4 api.questionstat 1734 9 b \n", | |
| "\n", | |
| " fields.created \n", | |
| "0 2020-04-07 18:26:31.137000+00:00 \n", | |
| "1 2020-04-07 18:37:05.718000+00:00 \n", | |
| "2 2020-04-07 18:37:18.488000+00:00 \n", | |
| "3 2020-04-07 18:38:00.057000+00:00 \n", | |
| "4 2020-04-07 18:50:10.325000+00:00 " | |
| ] | |
| }, | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "raw_events__answers = pd.json_normalize(data)\n", | |
| "raw_events__answers[\"fields.created\"] = raw_events__answers[\"fields.created\"].apply(pd.to_datetime)\n", | |
| "raw_events__answers.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "engine = create_engine('sqlite:///raph.db')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "raw_events__answers.to_sql(name=\"raw_events__answers\", con=engine)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "- Avoir des stats journalière (1 row par jour, JSONField). metrics: nombre de réponses totales, par heure, par page, etc\n", | |
| "- Avoir des stats par question (1 row par question, JSONField). metrics: nombre de réponses totales, par choix de réponse\n", | |
| "- Futur: Avoir des stats par quizz, like/dislike, ..." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 36, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>answer_id</th>\n", | |
| " <th>question_id</th>\n", | |
| " <th>answer_choice</th>\n", | |
| " <th>created_at</th>\n", | |
| " <th>date_at</th>\n", | |
| " <th>hour_at</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1730</td>\n", | |
| " <td>62</td>\n", | |
| " <td>b</td>\n", | |
| " <td>2020-04-07 18:26:31.137000+00:00</td>\n", | |
| " <td>2020-04-07</td>\n", | |
| " <td>18</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1731</td>\n", | |
| " <td>44</td>\n", | |
| " <td>c</td>\n", | |
| " <td>2020-04-07 18:37:05.718000+00:00</td>\n", | |
| " <td>2020-04-07</td>\n", | |
| " <td>18</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>1732</td>\n", | |
| " <td>4</td>\n", | |
| " <td>d</td>\n", | |
| " <td>2020-04-07 18:37:18.488000+00:00</td>\n", | |
| " <td>2020-04-07</td>\n", | |
| " <td>18</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1733</td>\n", | |
| " <td>27</td>\n", | |
| " <td>d</td>\n", | |
| " <td>2020-04-07 18:38:00.057000+00:00</td>\n", | |
| " <td>2020-04-07</td>\n", | |
| " <td>18</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>1734</td>\n", | |
| " <td>9</td>\n", | |
| " <td>b</td>\n", | |
| " <td>2020-04-07 18:50:10.325000+00:00</td>\n", | |
| " <td>2020-04-07</td>\n", | |
| " <td>18</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " answer_id question_id answer_choice created_at \\\n", | |
| "0 1730 62 b 2020-04-07 18:26:31.137000+00:00 \n", | |
| "1 1731 44 c 2020-04-07 18:37:05.718000+00:00 \n", | |
| "2 1732 4 d 2020-04-07 18:37:18.488000+00:00 \n", | |
| "3 1733 27 d 2020-04-07 18:38:00.057000+00:00 \n", | |
| "4 1734 9 b 2020-04-07 18:50:10.325000+00:00 \n", | |
| "\n", | |
| " date_at hour_at \n", | |
| "0 2020-04-07 18 \n", | |
| "1 2020-04-07 18 \n", | |
| "2 2020-04-07 18 \n", | |
| "3 2020-04-07 18 \n", | |
| "4 2020-04-07 18 " | |
| ] | |
| }, | |
| "execution_count": 36, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "stg_events__answers = raw_events__answers.drop(columns=\"model\").rename(columns={\n", | |
| " \"pk\": \"answer_id\", \n", | |
| " \"fields.question\": \"question_id\", \n", | |
| " \"fields.answer_choice\": \"answer_choice\", \n", | |
| " \"fields.created\": \"created_at\"\n", | |
| "})\n", | |
| "stg_events__answers[\"date_at\"] = stg_events__answers.created_at.dt.date\n", | |
| "stg_events__answers[\"hour_at\"] = stg_events__answers.created_at.dt.hour\n", | |
| "stg_events__answers.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 39, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>date_at</th>\n", | |
| " <th>hour_at</th>\n", | |
| " <th>question_id</th>\n", | |
| " <th>answer_choice</th>\n", | |
| " <th>answers_count</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2020-04-07</td>\n", | |
| " <td>18</td>\n", | |
| " <td>4</td>\n", | |
| " <td>d</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2020-04-07</td>\n", | |
| " <td>18</td>\n", | |
| " <td>9</td>\n", | |
| " <td>b</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2020-04-07</td>\n", | |
| " <td>18</td>\n", | |
| " <td>19</td>\n", | |
| " <td>c</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2020-04-07</td>\n", | |
| " <td>18</td>\n", | |
| " <td>27</td>\n", | |
| " <td>d</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2020-04-07</td>\n", | |
| " <td>18</td>\n", | |
| " <td>29</td>\n", | |
| " <td>c</td>\n", | |
| " <td>1</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " date_at hour_at question_id answer_choice answers_count\n", | |
| "0 2020-04-07 18 4 d 1\n", | |
| "1 2020-04-07 18 9 b 1\n", | |
| "2 2020-04-07 18 19 c 1\n", | |
| "3 2020-04-07 18 27 d 1\n", | |
| "4 2020-04-07 18 29 c 1" | |
| ] | |
| }, | |
| "execution_count": 39, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "fct_answers = stg_events__answers.groupby([\n", | |
| " stg_events__answers.date_at,\n", | |
| " stg_events__answers.hour_at,\n", | |
| " stg_events__answers.question_id,\n", | |
| " stg_events__answers.answer_choice\n", | |
| "]).answer_id.agg(answers_count=\"count\").reset_index()\n", | |
| "fct_answers.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 41, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>date_at</th>\n", | |
| " <th>answers_count</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2020-04-07</td>\n", | |
| " <td>31</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2020-04-08</td>\n", | |
| " <td>36</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " date_at answers_count\n", | |
| "0 2020-04-07 31\n", | |
| "1 2020-04-08 36" | |
| ] | |
| }, | |
| "execution_count": 41, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "viz_day_stats = fct_answers.groupby(\"date_at\").answers_count.sum().reset_index()\n", | |
| "viz_day_stats.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 45, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th>answer_choice</th>\n", | |
| " <th>a</th>\n", | |
| " <th>b</th>\n", | |
| " <th>c</th>\n", | |
| " <th>d</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>question_id</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>1.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>2.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>10</th>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| "answer_choice a b c d\n", | |
| "question_id \n", | |
| "1 NaN NaN 1.0 1.0\n", | |
| "4 NaN NaN NaN 2.0\n", | |
| "8 NaN NaN NaN 1.0\n", | |
| "9 NaN 1.0 NaN NaN\n", | |
| "10 NaN 1.0 NaN NaN" | |
| ] | |
| }, | |
| "execution_count": 45, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "viz_question_stats = fct_answers.groupby([\"question_id\", \"answer_choice\"]).answers_count.sum().unstack()\n", | |
| "viz_question_stats.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 46, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "stg_events__answers.to_sql(name=\"stg_events__answers\", con=engine)\n", | |
| "fct_answers.to_sql(name=\"fct_answers\", con=engine)\n", | |
| "viz_day_stats.to_sql(name=\"viz_day_stats\", con=engine)\n", | |
| "viz_question_stats.to_sql(name=\"viz_question_stats\", con=engine)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.7.7" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 4 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment