Created
May 15, 2021 06:36
-
-
Save Proteusiq/b4f476547bd2ac40fa5e58cabf65126b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name</th>\n", | |
" <th>comment</th>\n", | |
" <th>temperature</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>james</td>\n", | |
" <td>The world has ended XXXX</td>\n", | |
" <td>0.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>jack</td>\n", | |
" <td>24 hours to go before XXXX is gone</td>\n", | |
" <td>17.78</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" name comment temperature\n", | |
"0 james The world has ended XXXX 0.00\n", | |
"1 jack 24 hours to go before XXXX is gone 17.78" | |
] | |
}, | |
"execution_count": 4, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"import io\n", | |
"import re\n", | |
"import pandas as pd\n", | |
"\n", | |
"\n", | |
"# data example\n", | |
"csv_ish = io.StringIO(\n", | |
"\"\"\"name,comment,temperature\n", | |
"james,The world has ended 007,32\n", | |
"jack,24 hours to go before CTU is gone,64\n", | |
"\"\"\"\n", | |
")\n", | |
"\n", | |
"\n", | |
"def remove_sensitive_data(text: str) -> str:\n", | |
"\t# remove sensitive func logic example\n", | |
"\tsensitive = {\"007\", \"CTU\"}\n", | |
"\tpattern = re.compile(\"|\".join(sensitive))\n", | |
"\ttext = re.sub(pattern, \"XXXX\", text)\n", | |
"\treturn text\n", | |
"\n", | |
"\n", | |
"def fahrenheit_to_celsius(temp: str) -> float:\n", | |
"\t# data transformation example\n", | |
"\ttry:\n", | |
"\t\treturn round((float(temp) - 32) * 5 / 9, 2)\n", | |
"\texcept TypeError:\n", | |
"\t\treturn None\n", | |
"\n", | |
"\n", | |
"# Usage:\n", | |
"# using converters to transform columns during data read\n", | |
"\n", | |
"pd.read_csv(\n", | |
"\tcsv_ish,\n", | |
"\tconverters={\n", | |
"\t\t'comment': remove_sensitive_data,\n", | |
"\t\t'temperature': fahrenheit_to_celsius\n", | |
"\t}\n", | |
")\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name</th>\n", | |
" <th>comment</th>\n", | |
" <th>temperature</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>james</td>\n", | |
" <td>The world has ended XXXX</td>\n", | |
" <td>0.00</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>jack</td>\n", | |
" <td>24 hours to go before XXXX is gone</td>\n", | |
" <td>17.78</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" name comment temperature\n", | |
"0 james The world has ended XXXX 0.00\n", | |
"1 jack 24 hours to go before XXXX is gone 17.78" | |
] | |
}, | |
"metadata": {}, | |
"output_type": "display_data" | |
} | |
], | |
"source": [ | |
"FILE_PATH = io.StringIO(\n", | |
"\"\"\"name,comment,temperature\n", | |
"james,The world has ended 007,32\n", | |
"jack,24 hours to go before CTU is gone,64\n", | |
"\"\"\"\n", | |
")\n", | |
"CHUNK_SIZE = 100_000\n", | |
"\n", | |
"reader = pd.read_csv(FILE_PATH, chunksize=CHUNK_SIZE, low_memory=False, converters={\n", | |
"\t\t'comment': remove_sensitive_data,\n", | |
"\t\t'temperature': fahrenheit_to_celsius\n", | |
"\t})\n", | |
"for index, dataf_chunk in enumerate(reader):\n", | |
" ...\n", | |
" display(dataf_chunk)\n", | |
" # do something awesome with dataf_chunk" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>name</th>\n", | |
" <th>comment</th>\n", | |
" <th>temperature</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>james</td>\n", | |
" <td>The world has ended 007</td>\n", | |
" <td>32</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>jack</td>\n", | |
" <td>24 hours to go before CTU is gone</td>\n", | |
" <td>64</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" name comment temperature\n", | |
"0 james The world has ended 007 32\n", | |
"1 jack 24 hours to go before CTU is gone 64" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"csv_ish = io.StringIO(\n", | |
"\"\"\"name,comment,temperature\n", | |
"james,The world has ended 007,32\n", | |
"jack,24 hours to go before CTU is gone,64\n", | |
"\"\"\"\n", | |
")\n", | |
"pd.read_csv(csv_ish,)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.6.6+" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment