Skip to content

Instantly share code, notes, and snippets.

@LucasMorettoRodrigues
Created January 7, 2019 19:36
Show Gist options
  • Save LucasMorettoRodrigues/11007aa56047a67e2c03bb30d28ee936 to your computer and use it in GitHub Desktop.
Save LucasMorettoRodrigues/11007aa56047a67e2c03bb30d28ee936 to your computer and use it in GitHub Desktop.
preprocessing
# REMOVING DUPLICATE ROWS
def removeDuplicados(df):
    """Drop fully duplicated rows from *df* and return it.

    The frame is modified in place (the first occurrence of each duplicated
    row is kept) and the very same object is returned, so the caller may use
    either the return value or the original reference.
    """
    df.drop_duplicates(keep='first', inplace=True)
    return df
# Drop duplicated rows; removeDuplicados mutates all_data and returns it.
all_data = removeDuplicados(all_data)
# KEEPING ONLY THE S.PAULO ROWS
def filtraSP(df):
    """Return the rows of *df* whose CIDADE is 'S.PAULO', without that column.

    Rows with a missing CIDADE are treated as 'S.PAULO' and therefore kept.
    (The original code called ``fillna`` but discarded its result, so those
    rows were silently filtered out.)

    The input frame is left untouched; a new DataFrame is returned.
    """
    # fillna returns a new Series, so the result has to be captured.
    cidade = df['CIDADE'].fillna('S.PAULO')
    # The column is constant after filtering, hence dropped.
    return df[cidade == 'S.PAULO'].drop(columns='CIDADE')
# Restrict all_data to the S.PAULO rows; filtraSP also removes the CIDADE column.
all_data = filtraSP(all_data)
# TITLE-CASING STRING COLUMNS
def lowerCase(df, cols):
    """Title-case the string columns of *df* listed in *cols*.

    Despite the function's name, values are converted with ``str.title()``
    ("RUA AUGUSTA" -> "Rua Augusta"), not to lower case.

    The columns are overwritten in place and the same *df* object is returned.
    """
    for name in cols:
        column = df[name]
        df[name] = column.str.title()
    return df
# Text columns to normalise. NOTE: despite the names colsToLower / lowerCase,
# the helper applies str.title(), i.e. the values end up in title case.
colsToLower = ["PERIDOOCORRENCIA", "LOGRADOURO", "BAIRRO", "DESCR_TIPO_VEICULO"]
all_data = lowerCase(all_data, colsToLower)
# STANDARDISING STRINGS
def correct_string(string):
    """Return *string* with address words abbreviated and commas turned into dots.

    The substitutions are plain, case-sensitive substring replacements applied
    in the order listed below, e.g. 'Avenida Paulista, 1000' becomes
    'Av. Paulista. 1000'. *string* must be a ``str``.
    """
    # Order matters: the long forms are rewritten before their short forms.
    replacements = (
        ('Avenida ', 'Av. '),
        ('Av ', 'Av. '),
        ('Rua ', 'R. '),
        ('R ', 'R. '),
        (',', '.'),
        ('Praça ', 'Pr. '),
        ('Pr ', 'Pr. '),
        ('Pública', 'pública'),
    )
    for old, new in replacements:
        string = string.replace(old, new)
    return string
# Abbreviate street types in the free-text columns. Only the column being
# cleaned is cast to str; casting the whole frame first yields the same values
# for that column but converts every other column for nothing.
all_data["DESCRICAOLOCAL"] = all_data["DESCRICAOLOCAL"].astype(str).apply(correct_string)
all_data["LOGRADOURO"] = all_data["LOGRADOURO"].astype(str).apply(correct_string)
# Coordinates: swap ',' for '.' before converting to float (presumably the
# raw data uses a decimal comma -- confirm against the input files). Only the
# comma replacement is relevant here, so the address helper is not involved.
all_data["LATITUDE"] = all_data["LATITUDE"].astype(str).str.replace(",", ".", regex=False).astype(float)
all_data["LONGITUDE"] = all_data["LONGITUDE"].astype(str).str.replace(",", ".", regex=False).astype(float)
# PARSING AND EXPANDING DATE DATA
def tratarDatas(df, col='DATAOCORRENCIA'):
    """Parse the date column *col* and derive calendar columns from it.

    *col* is parsed as ``dd/mm/YYYY``; values that cannot be parsed become
    NaT. Rows whose date is unparseable or earlier than 2010-01-01 are
    dropped (by index label). Three columns are then added:

    * ``diasemana`` -- English weekday name, e.g. 'Monday'
    * ``mesano``    -- 'YYYY-MM' string
    * ``mes``       -- month number (1-12)

    *df* is modified in place and the same object is returned.
    """
    df[col] = pd.to_datetime(df[col], format='%d/%m/%Y', errors='coerce')

    # One vectorised mask replaces the per-row loop. Invalid dates are removed
    # here directly, because comparing strftime output with 'NaT' is not
    # reliable: recent pandas versions yield NaN for NaT instead.
    cutoff = pd.Timestamp(2010, 1, 1)
    discard = df[col].isna() | (df[col] < cutoff)
    df.drop(index=df.index[discard.values], inplace=True)

    # Series.dt.weekday_name was removed from pandas; day_name() replaces it.
    df['diasemana'] = df[col].dt.day_name()
    df['mesano'] = df[col].dt.strftime('%Y-%m')
    df['mes'] = df[col].dt.month
    return df
# Parse DATAOCORRENCIA and add the derived diasemana / mesano / mes columns.
all_data = tratarDatas(all_data)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment