Skip to content

Instantly share code, notes, and snippets.

@pramsey
Created February 12, 2025 23:39
Show Gist options
  • Save pramsey/2456fe4b3cc8cd75b773159dcc68929a to your computer and use it in GitHub Desktop.
Save pramsey/2456fe4b3cc8cd75b773159dcc68929a to your computer and use it in GitHub Desktop.
Some Demo Steps for Taxi Data
--
-- Build the Iceberg table trips_yellow straight from a remote
-- Parquet file. The column list is left empty; presumably the
-- schema is taken from the file named in load_from (TODO confirm
-- against the Iceberg table access method documentation).
--
CREATE EXTENSION pg_parquet;

CREATE TABLE trips_yellow ()
    USING iceberg
    WITH (
        load_from = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet'
    );
-- Add some more records for fun.
-- COPY trips_yellow FROM 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet';
-- COPY trips_yellow FROM 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-03.parquet';
-- How many trip records are in the table?
SELECT count(*)
FROM trips_yellow;
-- Trip totals bucketed by the hour of the pickup timestamp
SELECT
    extract(hour FROM tpep_pickup_datetime) AS hour,
    count(*) AS count
FROM trips_yellow
GROUP BY hour
ORDER BY hour;
-- Share of all trips that fall in each pickup hour, rendered as
-- text with one decimal place and a trailing '%'
WITH hourly AS (
    -- One row per pickup hour with its trip count
    SELECT
        extract(hour FROM tpep_pickup_datetime) AS hour,
        count(*) AS trips
    FROM trips_yellow
    GROUP BY hour
)
SELECT
    hourly.hour,
    -- The empty window spans every hourly row, i.e. the grand total
    round(100.0 * hourly.trips / sum(hourly.trips) OVER (), 1) || '%' AS pct
FROM hourly
ORDER BY hourly.hour;
-- Pull in the spatial analytics extension; CASCADE also installs
-- any extensions it depends on
CREATE EXTENSION crunchy_spatial_analytics CASCADE;

-- Expose the zipped taxi zone file as a foreign table read through
-- the 'gdal' format: mapping data that can be joined to taxi data!
CREATE FOREIGN TABLE taxi_zones ()
    SERVER crunchy_lake_analytics
    OPTIONS (
        format 'gdal',
        compression 'zip',
        path 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip'
    );
-- Pin the geom column to MultiPolygon with SRID 2263 so the zones
-- line up with the base map in QGIS
ALTER TABLE taxi_zones
    ALTER COLUMN geom
    SET DATA TYPE geometry(MultiPolygon, 2263);
-- Have a look in QGIS
-- Click the "Use estimated table metadata" option to avoid
-- QGIS doing full table scans (bad)
-- Set the "SSL Mode" to "prefer"
-- Fill the user/passwd into the Authentication>Basic tab
-- Load up the taxi_zones table
-- Well, you can see zones, but we want taxi data ON zones
--
-- Summarize taxi data on an hour / pickup zone basis, then join
-- that summary to the zone polygons and keep only the hour of the
-- day we are interested in (18, i.e. 6PM here).
--
-- Output columns: locationid, hour, miles_per_trip, tips_per_trip, geom
--
CREATE MATERIALIZED VIEW taxi_tips_18 AS
WITH stats AS (
    -- One row per (pickup hour, pickup zone) with summed tips,
    -- summed distance and the number of trips
    SELECT
        Sum(tip_amount) AS tips,
        Sum(trip_distance) AS distance,
        Count(1) AS trips,
        pulocationid AS locationid,
        Extract(hour FROM tpep_pickup_datetime) AS hour
    FROM trips_yellow
    GROUP BY hour, locationid
    -- Drop hour/zone groups whose total distance is not positive
    HAVING Sum(trip_distance) > 0
)
SELECT
    stats.locationid,
    stats.hour,
    stats.distance / stats.trips AS miles_per_trip,
    stats.tips / stats.trips AS tips_per_trip,
    -- Explicit cast so the view column carries geometry type and SRID
    z.geom::Geometry(MultiPolygon, 2263) AS geom
FROM stats
JOIN taxi_zones z
    ON stats.locationid = z.locationid
-- The statement must not be terminated before this filter; without
-- it the view would hold every hour instead of just 18
WHERE stats.hour = 18;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment