Skip to content

Instantly share code, notes, and snippets.

@GuidoTournois
Last active May 30, 2022 09:25
Show Gist options
  • Save GuidoTournois/7215992210bbf375b082dae608911fee to your computer and use it in GitHub Desktop.
Save GuidoTournois/7215992210bbf375b082dae608911fee to your computer and use it in GitHub Desktop.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Populating the interactive namespace from numpy and matplotlib\n"
]
}
],
"source": [
"# Inline figures only: %pylab would star-import numpy and matplotlib into the\n",
"# interactive namespace, and nothing in this notebook relies on those names\n",
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import random\n",
"from sys import getsizeof"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Read the raw trip records, letting pandas infer its default dtypes\n",
"trips_path = 'data/yellow_tripdata_2015-01.csv'\n",
"df = pd.read_csv(trips_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To demonstrate the benefits of pandas' category dtype, let's add a random pickup neighbourhood to each row."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Seed the generator so the synthetic column is identical on every run\n",
"random.seed(0)\n",
"\n",
"# Read one neighbourhood per line; the context manager closes the file and\n",
"# blank lines are skipped so no row can be assigned an empty name\n",
"with open('nyc_neighbourhoods.txt') as neighbourhood_file:\n",
"    nyc_neighbourhoods = [line.rstrip()\n",
"                          for line in neighbourhood_file\n",
"                          if line.strip()]\n",
"\n",
"# Draw one random neighbourhood per row (assigned positionally)\n",
"df['pickup_neighbourhood'] = [random.choice(nyc_neighbourhoods)\n",
"                              for _ in range(len(df))]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataframe size: 4.93 GB\n"
]
}
],
"source": [
"# Baseline footprint as reported by sys.getsizeof, in units of 1024**3 bytes\n",
"bytes_per_gb = 1024.0 ** 3\n",
"start_size = getsizeof(df) / bytes_per_gb\n",
"print('Dataframe size: %2.2f GB' % start_size)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"VendorID int64\n",
"tpep_pickup_datetime object\n",
"tpep_dropoff_datetime object\n",
"passenger_count int64\n",
"trip_distance float64\n",
"pickup_longitude float64\n",
"pickup_latitude float64\n",
"RateCodeID int64\n",
"store_and_fwd_flag object\n",
"dropoff_longitude float64\n",
"dropoff_latitude float64\n",
"payment_type int64\n",
"fare_amount float64\n",
"extra float64\n",
"mta_tax float64\n",
"tip_amount float64\n",
"tolls_amount float64\n",
"improvement_surcharge float64\n",
"total_amount float64\n",
"pickup_neighbourhood object\n",
"dtype: object"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Integers"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# VendorID is either 1 or 2, so a boolean flag suffices (True means vendor 2);\n",
"# the comparison is vectorised instead of calling a lambda once per row\n",
"df['VendorID'] = df['VendorID'] == 2\n",
"\n",
"# passenger_count, RateCodeID and payment_type are expected to hold small\n",
"# non-negative integers. downcast='unsigned' selects the smallest unsigned\n",
"# dtype (uint8 at best) that fits every value, so an unexpectedly large value\n",
"# cannot silently wrap around the way a hard-coded astype('uint8') would\n",
"for col in ['passenger_count', 'RateCodeID', 'payment_type']:\n",
"    df[col] = pd.to_numeric(df[col], downcast='unsigned')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Convert Dollars to cents"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# Store money as whole cents. uint8 tops out at 255 (i.e. $2.55) and cannot hold\n",
"# negative amounts, so larger or negative values would be corrupted by the cast;\n",
"# int32 covers the full range. Rounding first avoids truncating values such as\n",
"# 0.29 * 100 == 28.999999999999996 down to 28.\n",
"monetary_columns = ['fare_amount', 'tip_amount',\n",
"                    'total_amount', 'tolls_amount', 'extra']\n",
"df[monetary_columns] = (df[monetary_columns] * 100).round().astype('int32')"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataframe size: 4.18 GB\n"
]
}
],
"source": [
"# Footprint after the integer and cents conversions\n",
"size_gb = getsizeof(df) / (1024.0 ** 3)\n",
"print('Dataframe size: %2.2f GB' % size_gb)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Floats"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Latitude/longitude are kept as 32-bit floats instead of the default float64\n",
"location_columns = [end + '_' + axis\n",
"                    for end in ('pickup', 'dropoff')\n",
"                    for axis in ('latitude', 'longitude')]\n",
"df[location_columns] = df[location_columns].astype(np.float32)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# trip_distance: float16 has only ~3 significant digits, overflows above 65504\n",
"# and is not supported by every pandas operation, so keep the original values\n",
"# in float32 instead of rescaling them to squeeze into float16.\n",
"# NOTE(review): the old comment assumed metres; the TLC data dictionary lists\n",
"# this field in miles - confirm the unit before converting anything.\n",
"df['trip_distance'] = df['trip_distance'].astype('float32')\n",
"\n",
"# only 0.0 and 0.3 occur, so keep a flag that is True where the surcharge is 0.3\n",
"df['improvement_surcharge'] = df['improvement_surcharge'] == 0.3\n",
"\n",
"# Latitude/longitude are already float32 (converted in the previous cell);\n",
"# float32 precision is sufficient for them, so nothing more to do here"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataframe size: 3.83 GB\n"
]
}
],
"source": [
"# Footprint after the float conversions\n",
"size_gb = getsizeof(df) / (1024.0 ** 3)\n",
"print('Dataframe size: %2.2f GB' % size_gb)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Object"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# store_and_fwd_flag contains Y or N -> boolean that is True for 'Y'\n",
"df['store_and_fwd_flag'] = df['store_and_fwd_flag'] == 'Y'\n",
"\n",
"# Parse both timestamp strings into datetime64[ns]\n",
"date_time_columns = ['tpep_pickup_datetime', 'tpep_dropoff_datetime']\n",
"for timestamp_col in date_time_columns:\n",
"    df[timestamp_col] = pd.to_datetime(df[timestamp_col])"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataframe size: 1.45 GB\n"
]
}
],
"source": [
"# Footprint after the boolean and datetime conversions\n",
"size_gb = getsizeof(df) / (1024.0 ** 3)\n",
"print('Dataframe size: %2.2f GB' % size_gb)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Categories"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Convert these three columns to pandas' category dtype\n",
"for categorical_col in ('mta_tax', 'payment_type', 'pickup_neighbourhood'):\n",
"    df[categorical_col] = df[categorical_col].astype('category')"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dataframe size: 0.57 GB\n"
]
}
],
"source": [
"# Footprint after all conversions, in the same units as start_size\n",
"final_bytes = getsizeof(df)\n",
"final_size = final_bytes / (1024.0 ** 3)\n",
"print('Dataframe size: %2.2f GB' % final_size)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Total reduction: 88.4%!"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"total size reduction: 88.4\n"
]
}
],
"source": [
"# Percentage of the starting footprint that was saved\n",
"reduction_pct = 100 * (1 - final_size / start_size)\n",
"print('total size reduction: %2.1f' % reduction_pct)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"VendorID bool\n",
"tpep_pickup_datetime datetime64[ns]\n",
"tpep_dropoff_datetime datetime64[ns]\n",
"passenger_count uint8\n",
"trip_distance float16\n",
"pickup_longitude float32\n",
"pickup_latitude float32\n",
"RateCodeID uint8\n",
"store_and_fwd_flag bool\n",
"dropoff_longitude float32\n",
"dropoff_latitude float32\n",
"payment_type category\n",
"fare_amount uint8\n",
"extra uint8\n",
"mta_tax category\n",
"tip_amount uint8\n",
"tolls_amount uint8\n",
"improvement_surcharge bool\n",
"total_amount uint8\n",
"pickup_neighbourhood category\n",
"dtype: object"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.dtypes"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.5.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
@vmuriart
Copy link

On the datatypes suggestion, be careful using float16 as not all operations are supported in pandas. pandas-dev/pandas#9220

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment