Last active
May 30, 2022 09:25
-
-
Save GuidoTournois/7215992210bbf375b082dae608911fee to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Populating the interactive namespace from numpy and matplotlib\n" | |
] | |
} | |
], | |
"source": [ | |
"%pylab inline" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import numpy as np\n", | |
"import pandas as pd\n", | |
"import random\n", | |
"from sys import getsizeof" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Load the trip CSV with pandas' default dtype inference (no dtype hints are\n", | |
"# passed), so the frame starts out at its largest in-memory footprint.\n", | |
"df = pd.read_csv('data/yellow_tripdata_2015-01.csv')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"To demonstrate the benefits of pandas' category dtype, let's add a random pickup neighbourhood to each row." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"nyc_neighbourhoods = [line.rstrip() \n", | |
" for line \n", | |
" in open('nyc_neighbourhoods.txt')]\n", | |
"df['pickup_neighbourhood'] = df.VendorID.apply(\n", | |
" lambda x: random.choice(nyc_neighbourhoods))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Dataframe size: 4.93 GB\n" | |
] | |
} | |
], | |
"source": [ | |
"start_size = getsizeof(df)/(1024.0**3)\n", | |
"print('Dataframe size: %2.2f GB'%start_size)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"VendorID int64\n", | |
"tpep_pickup_datetime object\n", | |
"tpep_dropoff_datetime object\n", | |
"passenger_count int64\n", | |
"trip_distance float64\n", | |
"pickup_longitude float64\n", | |
"pickup_latitude float64\n", | |
"RateCodeID int64\n", | |
"store_and_fwd_flag object\n", | |
"dropoff_longitude float64\n", | |
"dropoff_latitude float64\n", | |
"payment_type int64\n", | |
"fare_amount float64\n", | |
"extra float64\n", | |
"mta_tax float64\n", | |
"tip_amount float64\n", | |
"tolls_amount float64\n", | |
"improvement_surcharge float64\n", | |
"total_amount float64\n", | |
"pickup_neighbourhood object\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Column dtypes before any conversion (baseline for the steps below).\n", | |
"df.dtypes" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Integers" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# VendorID is either 1 or 2, so boolean suffices \n", | |
"df.VendorID = df.VendorID.apply(lambda x: x==2) \n", | |
"\n", | |
"# passenger_count, RateCodeID and payment_type contain 0<x<65535\n", | |
"df.passenger_count = df.passenger_count.astype('uint8')\n", | |
"df.RateCodeID = df.RateCodeID.astype('uint8') \n", | |
"df.payment_type = df.payment_type.astype('uint8')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"### Convert Dollars to cents" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"monetary_columns = ['fare_amount','tip_amount',\n", | |
" 'total_amount','tolls_amount','extra']\n", | |
"df[monetary_columns] = \\\n", | |
" df[monetary_columns].apply(lambda row: (row*100).astype('uint8'))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Dataframe size: 4.18 GB\n" | |
] | |
} | |
], | |
"source": [ | |
"print('Dataframe size: %2.2f GB'%(getsizeof(df)/(1024.0**3)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Floats" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 10, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"location_columns = ['pickup_latitude','pickup_longitude',\n", | |
" 'dropoff_latitude','dropoff_longitude']\n", | |
"df[location_columns] = df[location_columns].astype('float32') " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 11, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# 0.0<trip_distance<1.54e+07meters, so convert to km\n", | |
"df.trip_distance = (df.trip_distance/1000).astype('float16') \n", | |
"\n", | |
"# only 0.0 and 0.3 occur\n", | |
"df.improvement_surcharge = df.improvement_surcharge.apply(lambda x: x==0.3)\n", | |
"\n", | |
"# Precision of float32 is sufficient for lat and lon\n", | |
"location_columns = ['pickup_latitude','pickup_longitude',\n", | |
" 'dropoff_latitude','dropoff_longitude']\n", | |
"df[location_columns] = df[location_columns].astype('float32') " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 12, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Dataframe size: 3.83 GB\n" | |
] | |
} | |
], | |
"source": [ | |
"print('Dataframe size: %2.2f GB'%(getsizeof(df)/(1024.0**3)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Object" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 13, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# store_and_fwd_flag contains Y or N\n", | |
"df.store_and_fwd_flag = df.store_and_fwd_flag.apply(lambda x: x=='Y')\n", | |
"\n", | |
"# Convert string to datetime64[ns]\n", | |
"date_time_columns = ['tpep_pickup_datetime','tpep_dropoff_datetime']\n", | |
"for col in date_time_columns:\n", | |
" df[col] = pd.to_datetime(df[col])" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 14, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Dataframe size: 1.45 GB\n" | |
] | |
} | |
], | |
"source": [ | |
"print('Dataframe size: %2.2f GB'%(getsizeof(df)/(1024.0**3)))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"## Categories" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 15, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"df.mta_tax = df.mta_tax.astype('category')\n", | |
"df.payment_type = df.payment_type.astype('category')\n", | |
"df.pickup_neighbourhood = df.pickup_neighbourhood.astype('category')" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 16, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"Dataframe size: 0.57 GB\n" | |
] | |
} | |
], | |
"source": [ | |
"final_size = getsizeof(df)/(1024.0**3)\n", | |
"print('Dataframe size: %2.2f GB'%final_size)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# Total reduction: 88.4%!" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 17, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"total size reduction: 88.4\n" | |
] | |
} | |
], | |
"source": [ | |
"print('total size reduction: %2.1f'%((1-final_size/start_size)*100))" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 18, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"VendorID bool\n", | |
"tpep_pickup_datetime datetime64[ns]\n", | |
"tpep_dropoff_datetime datetime64[ns]\n", | |
"passenger_count uint8\n", | |
"trip_distance float16\n", | |
"pickup_longitude float32\n", | |
"pickup_latitude float32\n", | |
"RateCodeID uint8\n", | |
"store_and_fwd_flag bool\n", | |
"dropoff_longitude float32\n", | |
"dropoff_latitude float32\n", | |
"payment_type category\n", | |
"fare_amount uint8\n", | |
"extra uint8\n", | |
"mta_tax category\n", | |
"tip_amount uint8\n", | |
"tolls_amount uint8\n", | |
"improvement_surcharge bool\n", | |
"total_amount uint8\n", | |
"pickup_neighbourhood category\n", | |
"dtype: object" | |
] | |
}, | |
"execution_count": 18, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# Column dtypes after all of the conversions above.\n", | |
"df.dtypes" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Python 3", | |
"language": "python", | |
"name": "python3" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.5.6" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 2 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
On the datatypes suggestion, be careful using
float16
as not all operations are supported in pandas. pandas-dev/pandas#9220