Created
February 12, 2025 23:39
-
-
Save pramsey/2456fe4b3cc8cd75b773159dcc68929a to your computer and use it in GitHub Desktop.
Some Demo Steps for Taxi Data
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
--
-- Build an Iceberg table straight from a remote parquet file.
-- No columns are declared here; presumably they are taken from the
-- file named in load_from (confirm against the extension docs).
--
CREATE EXTENSION pg_parquet;

CREATE TABLE trips_yellow ()
    USING iceberg
    WITH (
        load_from = 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet'
    );
-- Add some more records for fun. | |
-- COPY trips_yellow FROM 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet'; | |
-- COPY trips_yellow FROM 'https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-03.parquet'; | |
-- How many trip records are in the table?
SELECT count(*)
FROM trips_yellow;
-- Trip counts bucketed by the hour of day of the pickup timestamp
SELECT
    extract(hour FROM tpep_pickup_datetime) AS hour,
    count(*) AS count
FROM trips_yellow
GROUP BY hour
ORDER BY hour;
-- Share of all trips falling in each pickup hour, as a percentage string.
-- The window sum over the per-hour counts gives the grand total, so each
-- row is (hour count / total count), rounded to one decimal place.
SELECT
    extract(hour FROM tpep_pickup_datetime) AS hour,
    round(100.0 * count(*) / sum(count(*)) OVER (), 1) || '%' AS pct
FROM trips_yellow
GROUP BY hour
ORDER BY hour;
-- Install the spatial analytics extension; CASCADE also installs any
-- extensions it depends on.
CREATE EXTENSION crunchy_spatial_analytics CASCADE;

-- Mapping data that can be joined to the taxi data: expose the zipped
-- taxi zone file at PATH as a foreign table, read through the 'gdal' format.
CREATE FOREIGN TABLE taxi_zones ()
    SERVER crunchy_lake_analytics
    OPTIONS (
        FORMAT 'gdal',
        COMPRESSION 'zip',
        PATH 'https://d37ci6vzurychx.cloudfront.net/misc/taxi_zones.zip'
    );
-- Declare the geometry type and SRID (2263) on the geom column so that
-- QGIS can line the zones up with its base map.
ALTER TABLE taxi_zones
    ALTER COLUMN geom TYPE geometry(MultiPolygon, 2263);
-- Have a look in QGIS | |
-- Click the "Use estimated table metadata" option to avoid | |
-- QGIS doing full table scans (bad) | |
-- Set the "SSL Mode" to "prefer" | |
-- Fill the user/passwd into the Authentication>Basic tab | |
-- Load up the taxi_zones table | |
-- Well, you can see zones, but we want taxi data ON zones | |
--
-- Summarize taxi data per pickup hour and pickup zone, join the
-- summary to the zone polygons, and keep only the hour of day
-- we are interested in (18 = 6PM here).
--
CREATE MATERIALIZED VIEW taxi_tips_18 AS
WITH stats AS (
    -- One row per (pickup hour, pickup zone) with tip, distance and trip totals.
    SELECT
        Sum(tip_amount) AS tips,
        Sum(trip_distance) AS distance,
        Count(1) AS trips,
        pulocationid AS locationid,
        Extract(hour FROM tpep_pickup_datetime) AS hour
    FROM trips_yellow
    GROUP BY hour, locationid
    -- Drop groups whose total recorded distance is not positive.
    HAVING Sum(trip_distance) > 0
)
SELECT
    stats.locationid,
    stats.hour,
    stats.distance / stats.trips AS miles_per_trip,
    stats.tips / stats.trips AS tips_per_trip,
    -- The cast keeps the geometry type and SRID (2263) explicit on the view column.
    z.geom::Geometry(MultiPolygon, 2263) AS geom
FROM stats
INNER JOIN taxi_zones AS z
    ON stats.locationid = z.locationid
-- The hour filter must be part of this statement: a semicolon after the
-- ON clause would end the view definition before the filter is applied.
WHERE stats.hour = 18;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment