Created
January 7, 2019 19:36
-
-
Save LucasMorettoRodrigues/11007aa56047a67e2c03bb30d28ee936 to your computer and use it in GitHub Desktop.
preprocessing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Remove duplicated rows
def removeDuplicados(df):
    """Drop fully duplicated rows from ``df`` in place.

    Args:
        df: pandas DataFrame to de-duplicate. It is modified in place.

    Returns:
        The same DataFrame object, with only the first occurrence of each
        duplicated row kept (original index labels are preserved).
    """
    df.drop_duplicates(inplace=True)
    return df
# Apply de-duplication to the working dataset.
all_data = removeDuplicados(all_data)
# Keep only rows for the city of S.PAULO
def filtraSP(df):
    """Return the rows of ``df`` whose city is 'S.PAULO', without the city column.

    Rows with a missing ``CIDADE`` value are treated as 'S.PAULO' and kept.
    (The original code called ``fillna`` but discarded its result, so those
    rows were silently dropped by the equality filter.)

    Args:
        df: pandas DataFrame containing a ``CIDADE`` column. Not modified.

    Returns:
        A new DataFrame with the matching rows and the ``CIDADE`` column
        removed.
    """
    # fillna returns a new Series; use it for the comparison instead of
    # discarding it.
    cidade = df['CIDADE'].fillna('S.PAULO')
    # drop() returns a fresh frame, avoiding deletion on a filtered slice.
    return df[cidade == 'S.PAULO'].drop(columns='CIDADE')
# Restrict the working dataset to S.PAULO records.
all_data = filtraSP(all_data)
# Normalize the capitalization of string columns
def lowerCase(df, cols):
    """Convert the given string columns of ``df`` to title case, in place.

    Despite its name, this function applies ``str.title()`` (e.g.
    'RUA AUGUSTA' -> 'Rua Augusta'), not lower case. ``correct_string``
    matches capitalized prefixes such as 'Avenida ' and 'Rua ', so the
    title-casing is kept as is.

    Args:
        df: pandas DataFrame to modify in place.
        cols: iterable of column names holding string values.

    Returns:
        The same DataFrame object with the listed columns title-cased.
        Missing values stay missing.
    """
    for col in cols:
        df[col] = df[col].str.title()
    return df
# Columns whose capitalization is normalized by lowerCase().
# NOTE(review): "PERIDOOCORRENCIA" is presumably the column name exactly as
# it appears in the dataset -- confirm before "correcting" the spelling.
colsToLower = ["PERIDOOCORRENCIA", "LOGRADOURO", "BAIRRO", "DESCR_TIPO_VEICULO"]
all_data = lowerCase(all_data, colsToLower)
# Standardize strings
def correct_string(string):
    """Return ``string`` with street-type words abbreviated and commas dotted.

    The replacements are applied sequentially, in this order:
    'Avenida '/'Av ' -> 'Av. ', 'Rua '/'R ' -> 'R. ', ',' -> '.',
    'Praça '/'Pr ' -> 'Pr. ', and 'Pública' -> 'pública'.

    Matching is case-sensitive plain substring replacement, so input is
    expected to be title-cased already. Every comma is replaced by a period,
    which is what lets the same function turn decimal-comma numbers such as
    '-23,55' into '-23.55'.

    Args:
        string: the text to normalize (must be a ``str``).

    Returns:
        The normalized string.
    """
    # Order matters: long forms are rewritten before their short forms.
    replacements = (
        ('Avenida ', 'Av. '),
        ('Av ', 'Av. '),
        ('Rua ', 'R. '),
        ('R ', 'R. '),
        (',', '.'),
        ('Praça ', 'Pr. '),
        ('Pr ', 'Pr. '),
        ('Pública', 'pública'),
    )
    for old, new in replacements:
        string = string.replace(old, new)
    return string
# Normalize the free-text location columns. Only the needed column is cast
# to str (the original cast the whole DataFrame for each one); missing values
# become their string form (e.g. 'nan') as a result of the cast.
for text_col in ('DESCRICAOLOCAL', 'LOGRADOURO'):
    all_data[text_col] = all_data[text_col].astype(str).apply(correct_string)

# For the coordinate columns the relevant replacement in correct_string is
# ',' -> '.', which makes decimal-comma values parseable as floats.
for coord_col in ('LATITUDE', 'LONGITUDE'):
    all_data[coord_col] = (
        all_data[coord_col].astype(str).apply(correct_string).astype(float)
    )
# Parse and derive date-related columns
def tratarDatas(df, col='DATAOCORRENCIA', min_date='2010-01-01'):
    """Parse a date column, drop unusable rows and add derived date columns.

    The column ``col`` is parsed as ``%d/%m/%Y``; values that cannot be
    parsed become NaT. Rows whose date is NaT or earlier than ``min_date``
    are removed. Three columns are then added:

    * ``diasemana`` -- English weekday name (e.g. 'Monday')
    * ``mesano``    -- year and month formatted as 'YYYY-MM'
    * ``mes``       -- month number (1-12)

    Args:
        df: pandas DataFrame to process. It is modified in place.
        col: name of the column holding day/month/year date strings.
        min_date: earliest date to keep (inclusive); anything accepted by
            ``pd.Timestamp``. Defaults to 2010-01-01.

    Returns:
        The same DataFrame object after filtering and enrichment.
    """
    df[col] = pd.to_datetime(df[col], format='%d/%m/%Y', errors='coerce')

    # Drop invalid and too-old dates in one vectorized pass. NaT never
    # compares as "less than", so it needs its own isna() check.
    unusable = df[col].isna() | (df[col] < pd.Timestamp(min_date))
    df.drop(index=df.index[unusable], inplace=True)

    # day_name() replaces the removed `.dt.weekday_name` accessor.
    df['diasemana'] = df[col].dt.day_name()
    df['mesano'] = df[col].dt.strftime('%Y-%m')
    df['mes'] = df[col].dt.month
    return df
# Parse dates and add the derived date columns to the working dataset.
all_data = tratarDatas(all_data)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment