Last active
June 3, 2025 21:52
-
-
Save patcon/c58e72380f259442336770cac3c2e235 to your computer and use it in GitHub Desktop.
polis-data-difference-between-csv-and-api.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "authorship_tag": "ABX9TyO11rrJsw+DGZPVjPGnDhNB", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/patcon/c58e72380f259442336770cac3c2e235/polis-data-difference-between-csv-and-api.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "%pip install --quiet --no-cache-dir git+https://github.com/polis-community/red-dwarf.git@2025-06-02-patcon" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "_D0y7bhAg5zt", | |
| "outputId": "1b215ada-2ee9-4a0b-f161-321ade3cea81" | |
| }, | |
| "execution_count": 1, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", | |
| " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", | |
| " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.1/116.1 kB\u001b[0m \u001b[31m11.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m161.7/161.7 kB\u001b[0m \u001b[31m95.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.4/61.4 kB\u001b[0m \u001b[31m127.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.2/69.2 kB\u001b[0m \u001b[31m182.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25h Building wheel for red-dwarf (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "from reddwarf.data_loader import Loader\n", | |
| "from reddwarf.data_presenter import generate_figure_polis, print_selected_statements\n", | |
| "from reddwarf.implementations.polis import run_clustering\n", | |
| "from reddwarf.utils.polismath import get_corrected_centroid_guesses\n", | |
| "from reddwarf.utils.statements import process_statements\n", | |
| "\n", | |
| "# So far, this is the only conversation for which this happens. There are likely others.\n", | |
| "REPORT_ID = \"r6ipxzfudddppwesbmtmn\" # initial bug. works now!\n", | |
| "# REPORT_ID = \"r8jhyfp54cyanhu26cz3v\" # works now!\n", | |
| "# REPORT_ID = \"r2dtdbwbsrzu8bj8wmpmc\" # works! with force k\n", | |
| "# REPORT_ID = \"r7dr5tzke7pbpbajynkv8\" # very high is_meta count. works! if unflip X on guesses, and force k to keep mono-cluster.\n", | |
| "print(f\"For this conversation: https://pol.is/report/{REPORT_ID}\")\n", | |
| "\n", | |
| "def run_algo_from_data_source(data_source: str):\n", | |
| " loader = Loader(polis_id=REPORT_ID, data_source=data_source)\n", | |
| " loader.load_api_data_report()\n", | |
| " loader.conversation_id = loader.report_data[\"conversation_id\"]\n", | |
| " loader.load_api_data_math()\n", | |
| "\n", | |
| " init_cluster_center_guesses = get_corrected_centroid_guesses(\n", | |
| " loader.math_data,\n", | |
| " flip_x=True,\n", | |
| " flip_y=True,\n", | |
| " )\n", | |
| "\n", | |
| " _, _, mod_out_statement_ids, meta_statement_ids = process_statements(\n", | |
| " statement_data=loader.comments_data,\n", | |
| " )\n", | |
| "\n", | |
| " result = run_clustering(\n", | |
| " votes=loader.votes_data,\n", | |
| " mod_out_statement_ids=mod_out_statement_ids,\n", | |
| " meta_statement_ids=meta_statement_ids,\n", | |
| " # If k is ready to change and is being held back by k-smoothing, might need to force k\n", | |
| " # force_group_count=len(init_cluster_center_guesses),\n", | |
| " keep_participant_ids=loader.math_data[\"in-conv\"],\n", | |
| " init_centers=init_cluster_center_guesses,\n", | |
| " )\n", | |
| "\n", | |
| " return loader, result" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "__bTDkHigtkK", | |
| "outputId": "3dec7c71-5942-481b-8e02-f4f051119b36" | |
| }, | |
| "execution_count": 74, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "For this conversation: https://pol.is/report/r6ipxzfudddppwesbmtmn\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# From API data\n", | |
| "\n", | |
| "This should look exactly like the participation view and the report. If it does, the selected statements will also be identical." | |
| ], | |
| "metadata": { | |
| "id": "I0U8f2LzgnWo" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 75, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 433 | |
| }, | |
| "id": "lpq6PfCQgOw1", | |
| "outputId": "d2a5cd4a-9beb-4e27-9cca-34ca1d0f664a" | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Calculating convex hulls around clusters...\n", | |
| "Hull 0, bounding 11 points\n", | |
| "Hull 1, bounding 10 points\n", | |
| "Hull 2, bounding 6 points\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/plain": [ | |
| "<Figure size 560x400 with 2 Axes>" | |
| ], | |
| "image/png": "\n" | |
| }, | |
| "metadata": {} | |
| } | |
| ], | |
| "source": [ | |
| "loader_api, result_api = run_algo_from_data_source(data_source=\"api\")\n", | |
| "generate_figure_polis(result=result_api, show_guesses=True, flip_x=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "print_selected_statements(\n", | |
| " result=result_api,\n", | |
| " statements_data=loader_api.comments_data,\n", | |
| ")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "4IHZGGFtgTgw", | |
| "outputId": "e47f1312-22e2-424d-d9c9-13514f11f2e7" | |
| }, | |
| "execution_count": 76, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "# CONSENSUS STATEMENTS\n", | |
| "\n", | |
| "## FOR AGREEMENT\n", | |
| "\n", | |
| "* Systematické kurikulárne vzdelávanie k demokracii, diskusii a k ich prínosom a výzvam.\n", | |
| " 96% of everyone who voted on statement 27 agreed.\n", | |
| "\n", | |
| "* Právo prejavu je garantované ústavou.\n", | |
| " 96% of everyone who voted on statement 19 agreed.\n", | |
| "\n", | |
| "* Zbaviť sa pocitu vlastnej neomylnosti, naučiť sa počúvať a formulovať argumenty nedefenzívnym spôsobom\n", | |
| " 96% of everyone who voted on statement 30 agreed.\n", | |
| "\n", | |
| "* Ak platí sloboda prejavu pre jednotlivca, musí platiť aj pre skupiny a organizácie.\n", | |
| " 92% of everyone who voted on statement 17 agreed.\n", | |
| "\n", | |
| "* Formulovať zrozumiteľné posolstvá\n", | |
| " 91% of everyone who voted on statement 47 agreed.\n", | |
| "\n", | |
| "## FOR DISAGREEMENT\n", | |
| "\n", | |
| "None.\n", | |
| "\n", | |
| "\n", | |
| "# GROUP-REPRESENTATIVE STATEMENTS\n", | |
| "\n", | |
| "## GROUP A\n", | |
| "\n", | |
| "* Ak sme súčasťou ver. života, máme právo sa vyjadrovať k ver. politikám, k využitiu ver. zdrojov a k demokracii. V tom je politikum.\n", | |
| " 100% of those in group A who voted on statement 52 agreed.\n", | |
| "\n", | |
| "* nedôveru spôsobuje agresívna komunikácia niektorých mimovládok\n", | |
| " 90% of those in group A who voted on statement 34 disagreed.\n", | |
| "\n", | |
| "* mimovládky si často konkurujú a spôsobuje to nedôveru\n", | |
| " 81% of those in group A who voted on statement 22 disagreed.\n", | |
| "\n", | |
| "* Mám pocit, že mimovládky sú neraz závislé od vôle sponzorov-\n", | |
| " 72% of those in group A who voted on statement 7 disagreed.\n", | |
| "\n", | |
| "\n", | |
| "## GROUP B\n", | |
| "\n", | |
| "* Nezávislosť je mýtus.\n", | |
| " 66% of those in group B who voted on statement 15 agreed.\n", | |
| "\n", | |
| "* nedôveru spôsobuje agresívna komunikácia niektorých mimovládok\n", | |
| " 70% of those in group B who voted on statement 34 agreed.\n", | |
| "\n", | |
| "* mno slabo propagujú svoju činnosť\n", | |
| " 80% of those in group B who voted on statement 14 agreed.\n", | |
| "\n", | |
| "* často nie je jasné, za akým účelom mno vznikla a čo robí\n", | |
| " 100% of those in group B who voted on statement 16 agreed.\n", | |
| "\n", | |
| "* mimovládky si často konkurujú a spôsobuje to nedôveru\n", | |
| " 75% of those in group B who voted on statement 22 agreed.\n", | |
| "\n", | |
| "\n", | |
| "## GROUP C\n", | |
| "\n", | |
| "* Priradiť váhu kritérií pri vyjadrovaní sa.\n", | |
| " 100% of those in group C who voted on statement 40 agreed.\n", | |
| "\n", | |
| "* Nezávisloť je primárny predpoklad, nedokazuje sa.\n", | |
| " 100% of those in group C who voted on statement 39 agreed.\n", | |
| "\n", | |
| "* MNO zo zákona nemozu vyvíjať politickú činnosť\n", | |
| " 100% of those in group C who voted on statement 12 agreed.\n", | |
| "\n", | |
| "* Nezávislosť je to, že nepotrebujem dónora\n", | |
| " 80% of those in group C who voted on statement 28 agreed.\n", | |
| "\n", | |
| "* Na negatívnom vnímaní mimovládok majú aj ony svoj podiel viny.\n", | |
| " 100% of those in group C who voted on statement 5 agreed.\n", | |
| "\n", | |
| "\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "source": [ | |
| "# From CSV data\n", | |
| "\n", | |
| "This looks different for some reason.\n", | |
| "\n", | |
| "The selected statements should be identical, but they might change if k-means finds different group memberships as the projections shift." | |
| ], | |
| "metadata": { | |
| "id": "LkKKuWCxhGzS" | |
| } | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "loader_csv, result_csv = run_algo_from_data_source(data_source=\"csv_export\")\n", | |
| "generate_figure_polis(result=result_csv, show_guesses=True, flip_x=True)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 489 | |
| }, | |
| "id": "uLoErunPhItH", | |
| "outputId": "cdd7ded4-2ee4-46d1-cc9a-9b96b0072095" | |
| }, | |
| "execution_count": 77, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stderr", | |
| "text": [ | |
| "/usr/local/lib/python3.11/dist-packages/reddwarf/data_loader.py:130: UserWarning: CSV import is missing is_meta field. Attempting to load comments data from API instead...\n", | |
| " warnings.warn(\"CSV import is missing is_meta field. Attempting to load comments data from API instead...\")\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Calculating convex hulls around clusters...\n", | |
| "Hull 0, bounding 11 points\n", | |
| "Hull 1, bounding 10 points\n", | |
| "Hull 2, bounding 6 points\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/plain": [ | |
| "<Figure size 560x400 with 2 Axes>" | |
| ], | |
| "image/png": "\n" | |
| }, | |
| "metadata": {} | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "print_selected_statements(\n", | |
| " result=result_csv,\n", | |
| " statements_data=loader_csv.comments_data,\n", | |
| ")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "ngdkb70YiOJK", | |
| "outputId": "21b669bd-6ca2-4e90-a564-c0122712dbc9" | |
| }, | |
| "execution_count": 78, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "# CONSENSUS STATEMENTS\n", | |
| "\n", | |
| "## FOR AGREEMENT\n", | |
| "\n", | |
| "* Systematické kurikulárne vzdelávanie k demokracii, diskusii a k ich prínosom a výzvam.\n", | |
| " 96% of everyone who voted on statement 27 agreed.\n", | |
| "\n", | |
| "* Právo prejavu je garantované ústavou.\n", | |
| " 96% of everyone who voted on statement 19 agreed.\n", | |
| "\n", | |
| "* Zbaviť sa pocitu vlastnej neomylnosti, naučiť sa počúvať a formulovať argumenty nedefenzívnym spôsobom\n", | |
| " 96% of everyone who voted on statement 30 agreed.\n", | |
| "\n", | |
| "* Ak platí sloboda prejavu pre jednotlivca, musí platiť aj pre skupiny a organizácie.\n", | |
| " 92% of everyone who voted on statement 17 agreed.\n", | |
| "\n", | |
| "* Formulovať zrozumiteľné posolstvá\n", | |
| " 91% of everyone who voted on statement 47 agreed.\n", | |
| "\n", | |
| "## FOR DISAGREEMENT\n", | |
| "\n", | |
| "None.\n", | |
| "\n", | |
| "\n", | |
| "# GROUP-REPRESENTATIVE STATEMENTS\n", | |
| "\n", | |
| "## GROUP A\n", | |
| "\n", | |
| "* Ak sme súčasťou ver. života, máme právo sa vyjadrovať k ver. politikám, k využitiu ver. zdrojov a k demokracii. V tom je politikum.\n", | |
| " 100% of those in group A who voted on statement 52 agreed.\n", | |
| "\n", | |
| "* nedôveru spôsobuje agresívna komunikácia niektorých mimovládok\n", | |
| " 90% of those in group A who voted on statement 34 disagreed.\n", | |
| "\n", | |
| "* mimovládky si často konkurujú a spôsobuje to nedôveru\n", | |
| " 81% of those in group A who voted on statement 22 disagreed.\n", | |
| "\n", | |
| "* Mám pocit, že mimovládky sú neraz závislé od vôle sponzorov-\n", | |
| " 72% of those in group A who voted on statement 7 disagreed.\n", | |
| "\n", | |
| "\n", | |
| "## GROUP B\n", | |
| "\n", | |
| "* Nezávislosť je mýtus.\n", | |
| " 66% of those in group B who voted on statement 15 agreed.\n", | |
| "\n", | |
| "* nedôveru spôsobuje agresívna komunikácia niektorých mimovládok\n", | |
| " 70% of those in group B who voted on statement 34 agreed.\n", | |
| "\n", | |
| "* mno slabo propagujú svoju činnosť\n", | |
| " 80% of those in group B who voted on statement 14 agreed.\n", | |
| "\n", | |
| "* často nie je jasné, za akým účelom mno vznikla a čo robí\n", | |
| " 100% of those in group B who voted on statement 16 agreed.\n", | |
| "\n", | |
| "* mimovládky si často konkurujú a spôsobuje to nedôveru\n", | |
| " 75% of those in group B who voted on statement 22 agreed.\n", | |
| "\n", | |
| "\n", | |
| "## GROUP C\n", | |
| "\n", | |
| "* Priradiť váhu kritérií pri vyjadrovaní sa.\n", | |
| " 100% of those in group C who voted on statement 40 agreed.\n", | |
| "\n", | |
| "* Nezávisloť je primárny predpoklad, nedokazuje sa.\n", | |
| " 100% of those in group C who voted on statement 39 agreed.\n", | |
| "\n", | |
| "* MNO zo zákona nemozu vyvíjať politickú činnosť\n", | |
| " 100% of those in group C who voted on statement 12 agreed.\n", | |
| "\n", | |
| "* Nezávislosť je to, že nepotrebujem dónora\n", | |
| " 80% of those in group C who voted on statement 28 agreed.\n", | |
| "\n", | |
| "* Na negatívnom vnímaní mimovládok majú aj ony svoj podiel viny.\n", | |
| " 100% of those in group C who voted on statement 5 agreed.\n", | |
| "\n", | |
| "\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Compare the loaders\n", | |
| "\n", | |
| "# Same number of votes\n", | |
| "assert len(loader_api.votes_data) == len(loader_csv.votes_data)\n", | |
| "\n", | |
| "# Same number of non-zero/non-missing votes again\n", | |
| "assert result_api.raw_vote_matrix.astype(bool).sum().sum() == result_csv.raw_vote_matrix.astype(bool).sum().sum()\n", | |
| "\n", | |
| "# Not true at all\n", | |
| "# assert result_api.filtered_vote_matrix.astype(bool).sum().sum() == result_csv.filtered_vote_matrix.astype(bool).sum().sum()\n", | |
| "\n", | |
| "# Very different. Why?\n", | |
| "print(f\"{result_api.filtered_vote_matrix.astype(bool).sum().sum()=}\")\n", | |
| "print(f\"{result_csv.filtered_vote_matrix.astype(bool).sum().sum()=}\")\n", | |
| "print()\n", | |
| "\n", | |
| "# Count of non-zero entries per statement.\n", | |
| "print(f\"{result_api.filtered_vote_matrix.astype(bool).sum(axis=0).tolist()=}\")\n", | |
| "print(f\"{result_csv.filtered_vote_matrix.astype(bool).sum(axis=0).tolist()=}\")\n", | |
| "print()\n", | |
| "\n", | |
| "_, _, mod_out_statement_ids_api, _ = process_statements(statement_data=loader_api.comments_data)\n", | |
| "_, _, mod_out_statement_ids_csv, _ = process_statements(statement_data=loader_csv.comments_data)\n", | |
| "print(f\"{mod_out_statement_ids_api=}\")\n", | |
| "print(f\"{mod_out_statement_ids_csv=}\")\n", | |
| "print()\n", | |
| "\n", | |
| "print(f\"{loader_api.comments_data[0]['is_meta']=}\")\n", | |
| "print(f\"{loader_csv.comments_data[0]['is_meta']=}\")\n", | |
| "\n", | |
| "# It seems that is_meta isn't being set when importing from CSV, likely because the column is now called is-meta." | |
| ], | |
| "metadata": { | |
| "id": "HN2Bs7iEvRlW", | |
| "outputId": "1e9862ba-5a85-4e62-dc18-d9ba4a283b1c", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| } | |
| }, | |
| "execution_count": 79, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "result_api.filtered_vote_matrix.astype(bool).sum().sum()=np.int64(1176)\n", | |
| "result_csv.filtered_vote_matrix.astype(bool).sum().sum()=np.int64(1176)\n", | |
| "\n", | |
| "result_api.filtered_vote_matrix.astype(bool).sum(axis=0).tolist()=[0, 0, 0, 0, 22, 23, 20, 20, 19, 24, 23, 16, 19, 22, 22, 19, 24, 25, 21, 26, 23, 20, 25, 17, 24, 19, 23, 26, 22, 24, 27, 22, 23, 23, 24, 20, 23, 23, 19, 18, 15, 21, 21, 22, 22, 22, 21, 25, 24, 25, 25, 22, 25, 17, 26, 26, 27]\n", | |
| "result_csv.filtered_vote_matrix.astype(bool).sum(axis=0).tolist()=[0, 0, 0, 0, 22, 23, 20, 20, 19, 24, 23, 16, 19, 22, 22, 19, 24, 25, 21, 26, 23, 20, 25, 17, 24, 19, 23, 26, 22, 24, 27, 22, 23, 23, 24, 20, 23, 23, 19, 18, 15, 21, 21, 22, 22, 22, 21, 25, 24, 25, 25, 22, 25, 17, 26, 26, 27]\n", | |
| "\n", | |
| "mod_out_statement_ids_api=[0, 1, 2, 3]\n", | |
| "mod_out_statement_ids_csv=[0, 1, 2, 3]\n", | |
| "\n", | |
| "loader_api.comments_data[0]['is_meta']=True\n", | |
| "loader_csv.comments_data[0]['is_meta']=True\n" | |
| ] | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment