Last active
April 1, 2018 20:57
-
-
Save rg3915/3c30fc46f73dfd757b12a00f29d90d85 to your computer and use it in GitHub Desktop.
PySE18: Usando Pandas e subprocess para separar dados de um CSV
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Usando Pandas e subprocess para separar dados de um CSV" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Objetivo: Ler os dados de um CSV e separar os dados por empresa em CSVs separados" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import csv\n", | |
"import names\n", | |
"import pandas as pd\n", | |
"import subprocess" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Gerando um CSV\n", | |
"with open('/tmp/names.csv', 'w') as f:\n", | |
" csv_writer = csv.writer(f)\n", | |
" csv_writer.writerow(('nome', 'email', 'empresa'))\n", | |
" for _ in range(1000):\n", | |
" first_name = names.get_first_name()\n", | |
" last_name = names.get_last_name()\n", | |
" full_name = '{} {}'.format(first_name, last_name)\n", | |
" email = '{}.{}@email.com'.format(first_name.lower(), last_name.lower())\n", | |
" company = names.get_last_name()\n", | |
" csv_writer.writerow((full_name, email, company))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Lendo o CSV\n", | |
"filename = '/tmp/names.csv'" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Criando um DataFrame\n", | |
"df = pd.read_csv(filename)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr style=\"text-align: right;\">\n", | |
" <th></th>\n", | |
" <th>nome</th>\n", | |
" <th>email</th>\n", | |
" <th>empresa</th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>2</th>\n", | |
" <td>Mabel Brund</td>\n", | |
" <td>[email protected]</td>\n", | |
" <td>Barrett</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>0</th>\n", | |
" <td>Diann Driskell</td>\n", | |
" <td>[email protected]</td>\n", | |
" <td>Brown</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>3</th>\n", | |
" <td>Donna Ivy</td>\n", | |
" <td>[email protected]</td>\n", | |
" <td>Cheatom</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>4</th>\n", | |
" <td>Sandy Sutton</td>\n", | |
" <td>[email protected]</td>\n", | |
" <td>Shannon</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>1</th>\n", | |
" <td>Patricia Watkins</td>\n", | |
" <td>[email protected]</td>\n", | |
" <td>Yarbrough</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" nome email empresa\n", | |
"2 Mabel Brund [email protected] Barrett\n", | |
"0 Diann Driskell [email protected] Brown\n", | |
"3 Donna Ivy [email protected] Cheatom\n", | |
"4 Sandy Sutton [email protected] Shannon\n", | |
"1 Patricia Watkins [email protected] Yarbrough" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Mostrando os primeiros registros\n", | |
"df.head().sort_values(by=['empresa'])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<div>\n", | |
"<style scoped>\n", | |
" .dataframe tbody tr th:only-of-type {\n", | |
" vertical-align: middle;\n", | |
" }\n", | |
"\n", | |
" .dataframe tbody tr th {\n", | |
" vertical-align: top;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead tr th {\n", | |
" text-align: left;\n", | |
" }\n", | |
"\n", | |
" .dataframe thead tr:last-of-type th {\n", | |
" text-align: right;\n", | |
" }\n", | |
"</style>\n", | |
"<table border=\"1\" class=\"dataframe\">\n", | |
" <thead>\n", | |
" <tr>\n", | |
" <th></th>\n", | |
" <th>nome</th>\n", | |
" <th>email</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th></th>\n", | |
" <th>count</th>\n", | |
" <th>count</th>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>empresa</th>\n", | |
" <th></th>\n", | |
" <th></th>\n", | |
" </tr>\n", | |
" </thead>\n", | |
" <tbody>\n", | |
" <tr>\n", | |
" <th>Abrego</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Acosta</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Adams</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Akins</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" <tr>\n", | |
" <th>Alarcon</th>\n", | |
" <td>1</td>\n", | |
" <td>1</td>\n", | |
" </tr>\n", | |
" </tbody>\n", | |
"</table>\n", | |
"</div>" | |
], | |
"text/plain": [ | |
" nome email\n", | |
" count count\n", | |
"empresa \n", | |
"Abrego 1 1\n", | |
"Acosta 1 1\n", | |
"Adams 1 1\n", | |
"Akins 1 1\n", | |
"Alarcon 1 1" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Contando quantos dados nós temos por empresa\n", | |
"df.groupby(['empresa']).agg(['count']).head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"Johnson 13\n", | |
"Brown 12\n", | |
"Davis 12\n", | |
"Smith 12\n", | |
"Williams 6\n", | |
"Name: empresa, dtype: int64" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Uma contagem mais simples, quantas vezes a empresa se repete?\n", | |
"df['empresa'].value_counts().head()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Lendo os dados do DataFrame\n", | |
"for i, row in df.iterrows():\n", | |
" # Atribuição múltipla\n", | |
" nome, email, empresa = row['nome'], row['email'], row['empresa']\n", | |
" # Rodando um comando do shell script\n", | |
" subprocess.call(\"mkdir -p /tmp/subpastas/{}\".format(empresa), shell=True)\n", | |
" # Nome dos arquivos de saida /tmp/subpastas/empresa/empresa.txt'\n", | |
" filename = '/tmp/subpastas/{0}/{0}.txt'.format(empresa)\n", | |
" # Abrindo o arquivo para escrever os dados\n", | |
" with open(filename, 'a') as f:\n", | |
" texto = '{},{},{}\\n'.format(nome, email, empresa)\n", | |
" # Escrevendo os dados no arquivo\n", | |
" f.write(texto)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"for i, row in df.iterrows():\n", | |
" nome, email, empresa = row['nome'], row['email'], row['empresa']\n", | |
" pasta = '/tmp/subpastas2'\n", | |
" subprocess.call(\"mkdir -p {}/{}\".format(pasta, empresa), shell=True)\n", | |
" filename = '{0}/{1}/{1}.txt'.format(pasta, empresa)\n", | |
" texto = '{},{},{}'.format(nome, email, empresa)\n", | |
" # Salvando direto no arquivo com shell script\n", | |
" subprocess.call(\"echo '{1}' >> {0}\".format(filename, texto), shell=True)" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.2" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment