Skip to content

Instantly share code, notes, and snippets.

@tbueno
Created April 7, 2016 20:53
Show Gist options
  • Save tbueno/8bf7ed913edbacc758e51bc6ade79a2f to your computer and use it in GitHub Desktop.
Save tbueno/8bf7ed913edbacc758e51bc6ade79a2f to your computer and use it in GitHub Desktop.
unique id approach
require "csv"
require "fileutils"
require "tempfile"
require "sequel"
# Names of the GTFS tables managed by this script. Each one also gets a
# "<name>_temp" staging table while a load is running. Frozen so the shared
# list cannot be mutated by accident.
GTFS_TABLES = %w(agency routes stops trips stop_times shapes calendar calendar_dates transfers).freeze
# Bulk-loads one GTFS text file from a feed folder into its "<table>_temp"
# staging table.
class DataLoader
  # @param db [#copy_into] database handle; the script passes the result of
  #   Sequel.connect on a postgres:// URL
  # @param folder [String] directory that holds the feed's "<table>.txt" files
  def initialize(db, folder)
    @folder = folder
    @db = db
  end

  # Copies "<folder>/<file_name>.txt" into the "<file_name>_temp" table.
  #
  # The column list is taken from the file's own header row, so the file may
  # carry only a subset of the table's columns, in any order.
  #
  # @param file_name [String] table name without extension, e.g. "routes"
  # @return [nil] when the file does not exist or has no header row at all;
  #   otherwise whatever @db.copy_into returns
  # @raise [StandardError] any failure from the copy is logged with the feed
  #   folder and then re-raised unchanged
  def load_csv(file_name)
    csv_file = File.join(@folder, "#{file_name}.txt")
    return unless File.exist?(csv_file)

    # "bom|utf-8" drops a leading UTF-8 byte-order mark so the first header
    # name is read cleanly.
    headers = CSV.open(csv_file, "r:bom|utf-8", &:first)
    # A zero-byte file yields no header row; there is nothing to copy, and
    # calling #join on nil would raise NoMethodError.
    return if headers.nil?

    begin
      # NOTE: File.read pulls the whole file into memory before the copy.
      @db.copy_into("#{file_name}_temp".to_sym,
                    columns: Sequel.lit(headers.join(",")),
                    data: File.read(csv_file),
                    options: "FORMAT csv, HEADER TRUE")
    rescue StandardError
      puts "ERROR parsing #{@folder}"
      raise
    end
  end
end
# Rebuilds the archive database from a directory tree of extracted GTFS feeds.
#
# Directory layout walked by #load_data!:
#   <sources>/<region>/<dataset>/<table>.txt
#
# Every feed-local identifier is rewritten to "<agency_id>:<id>" while the
# rows sit in the "<table>_temp" staging tables, so ids coming from different
# feeds cannot collide once they are inserted into the shared tables.
class Generator
  # Tables in load order, each mapped to the columns that hold feed-local ids
  # and therefore receive the agency prefix. "agency" is copied verbatim.
  # trips.service_id is prefixed as well, because calendar.service_id and
  # calendar_dates.service_id are; otherwise trips would stop matching their
  # calendars. "stop_times" is deliberately not listed: its load step is
  # disabled. To enable it, add: "stop_times" => %w(trip_id stop_id)
  PREFIXED_COLUMNS = {
    "agency"         => [],
    "routes"         => %w(route_id),
    "stops"          => %w(stop_id parent_station),
    "trips"          => %w(trip_id route_id service_id shape_id),
    "calendar"       => %w(service_id),
    "calendar_dates" => %w(service_id),
    "shapes"         => %w(shape_id),
    "transfers"      => %w(from_stop_id to_stop_id)
  }.freeze

  # @param db [#run, #literal] database handle; the script passes the result
  #   of Sequel.connect
  def initialize(db)
    @db = db
  end

  # Empties every GTFS table and reloads it from +sources_folder+.
  #
  # @param sources_folder [String] root folder containing one folder per region
  # @raise [RuntimeError] when no folder is given; raised BEFORE anything is
  #   truncated, so a forgotten argument cannot wipe the database
  def run!(sources_folder)
    validate_source!(sources_folder)
    empty_database!
    puts "Archiving content...."
    load_data!(sources_folder)
  end

  # Truncates every table named in GTFS_TABLES.
  def empty_database!
    GTFS_TABLES.each { |table| @db.run "TRUNCATE #{table};" }
  end

  # Loads every region folder found directly under +region_folder+.
  # Staging tables are recreated at the start and dropped at the end.
  #
  # @param region_folder [String] root folder containing one folder per region
  # @raise [RuntimeError] when no folder is given
  def load_data!(region_folder)
    validate_source!(region_folder)
    FileUtils.mkdir_p "tmp"
    drop_temp_tables
    create_temp_tables
    subdirectories(region_folder).each { |folder| load_region(folder) }
    drop_temp_tables
    puts "Done."
  end

  private

  # Rejects nil as well as empty paths (ARGV[0] is nil when no argument was
  # passed on the command line).
  def validate_source!(path)
    raise "Please inform an source path" if path.to_s.empty?
  end

  # Directories directly under +path+, in a stable order. Plain files (for
  # example archives left next to their extracted folders) are ignored.
  def subdirectories(path)
    Dir["#{path}/*"].select { |entry| File.directory?(entry) }.sort
  end

  # Creates one staging table per GTFS table with the same structure.
  def create_temp_tables
    GTFS_TABLES.each do |table|
      @db.run "CREATE TABLE #{table}_temp (LIKE #{table} INCLUDING ALL)"
    end
  end

  def drop_temp_tables
    GTFS_TABLES.each do |table|
      @db.run "DROP TABLE IF EXISTS #{table}_temp"
    end
  end

  def truncate_temp_tables
    GTFS_TABLES.each do |table|
      @db.run "TRUNCATE #{table}_temp"
    end
  end

  # Loads every dataset folder of one region.
  def load_region(region)
    puts File.basename(region)
    subdirectories(region).each { |dataset| insert_data(dataset) }
  end

  # Runs once for every dataset folder inside a region: stages each file,
  # prefixes its ids, moves the rows into the real table, then empties the
  # staging tables for the next dataset.
  def insert_data(folder)
    puts "\t #{folder}"
    loader = DataLoader.new(@db, folder)
    # Let the database layer quote the prefix; an agency id or name that
    # contains an apostrophe would otherwise break the statement.
    prefix = @db.literal("#{agency_id_for(folder)}:")

    PREFIXED_COLUMNS.each do |table, id_columns|
      temp_table = "#{table}_temp"
      loader.load_csv(table)
      unless id_columns.empty?
        # NULL || text stays NULL in SQL, so optional ids that are absent
        # (e.g. parent_station, shape_id) remain NULL.
        assignments = id_columns.map { |column| "#{column}=#{prefix} || #{column}" }.join(", ")
        @db.run "UPDATE #{temp_table} SET #{assignments}"
      end
      @db.run "INSERT INTO #{table} SELECT * FROM #{temp_table}"
    end

    truncate_temp_tables
  end

  # Id used as prefix for a dataset: agency_id of the first row of agency.txt,
  # falling back to the first three characters of agency_name.
  #
  # @raise [RuntimeError] when agency.txt has no data row or neither column
  #   has a value
  def agency_id_for(folder)
    agency_file = File.join(folder, "agency.txt")
    agency = CSV.open(agency_file, "r:bom|utf-8", headers: true, &:first)
    raise "No agency row found in #{agency_file}" if agency.nil?

    agency_id = agency["agency_id"] || agency["agency_name"]&.slice(0, 3)
    raise "No agency_id or agency_name in #{agency_file}" if agency_id.nil?

    agency_id
  end
end
# Script entry point: ruby <script> <sources_folder>
#
# Stop before connecting when no sources folder was given: Generator#run!
# empties the GTFS tables, so a forgotten argument must never get that far.
source_path = ARGV[0]
abort "Usage: ruby #{File.basename($PROGRAM_NAME)} <sources_folder>" if source_path.nil? || source_path.empty?
Generator.new(Sequel.connect("postgres://localhost/archive")).run!(source_path)
-- Reset the schema before recreating it. IF EXISTS keeps the script usable on
-- a fresh database, where a plain DROP TABLE would abort with
-- "table does not exist".
DROP TABLE IF EXISTS AGENCY;
DROP TABLE IF EXISTS ROUTES;
DROP TABLE IF EXISTS STOPS;
DROP TABLE IF EXISTS TRIPS;
DROP TABLE IF EXISTS STOP_TIMES;
DROP TABLE IF EXISTS SHAPES;
DROP TABLE IF EXISTS CALENDAR;
DROP TABLE IF EXISTS CALENDAR_DATES;
DROP TABLE IF EXISTS TRANSFERS;
-- Agencies. agency_id is UNIQUE but nullable.
create table agency (
    agency_id        varchar(50) unique,
    agency_name      varchar(80),
    agency_url       varchar(100),
    agency_timezone  varchar(50),
    agency_lang      varchar(2),
    agency_fare_url  varchar(100),
    agency_phone     varchar(20)
);
-- Routes, keyed by route_id.
create table routes (
    route_id          varchar(100) primary key,
    agency_id         varchar(50),
    route_short_name  varchar(20),
    route_long_name   varchar(100),
    route_desc        varchar(100),
    route_type        numeric(4),
    route_url         varchar(100),
    route_color       varchar(8),
    route_text_color  varchar(8)
);
-- Stops, keyed by stop_id. parent_station has the same width as stop_id.
create table stops (
    stop_id              varchar(100) primary key,
    stop_code            varchar(10),
    stop_name            varchar(100),
    stop_desc            varchar(100),
    stop_lat             numeric(38, 8),
    stop_lon             numeric(38, 8),
    zone_id              varchar(5),
    stop_url             varchar(100),
    location_type        numeric(2),
    stop_timezone        varchar(50),
    parent_station       varchar(100),
    wheelchair_boarding  numeric(1)
);
-- Trips, keyed by trip_id.
-- service_id is VARCHAR(100) so it can hold any value accepted by
-- CALENDAR.service_id / CALENDAR_DATES.service_id (both VARCHAR(100)); with
-- the previous VARCHAR(50) a service id could load into the calendar tables
-- and then be rejected here.
-- NOTE(review): block_id is NUMERIC; feeds whose block ids are not numeric
-- would fail to load. Confirm against the feeds being archived.
CREATE TABLE TRIPS
(
trip_id VARCHAR(100) PRIMARY KEY,
route_id VARCHAR(100),
service_id VARCHAR(100),
trip_headsign VARCHAR(50),
trip_short_name VARCHAR(30),
direction_id NUMERIC(2),
block_id NUMERIC(10),
shape_id VARCHAR(100),
wheelchair_accessible NUMERIC(1),
bikes_allowed NUMERIC(1)
);
-- Stop times (no primary key).
-- trip_id is VARCHAR(100) to match TRIPS.trip_id; with the previous
-- VARCHAR(50) a trip id that fits in TRIPS could be rejected here.
-- arrival_time / departure_time are stored as text.
CREATE TABLE STOP_TIMES
(
trip_id VARCHAR(100),
arrival_time VARCHAR(10),
departure_time VARCHAR(10),
stop_id VARCHAR(100),
stop_sequence NUMERIC(10),
stop_headsign VARCHAR(30),
pickup_type VARCHAR(100),
drop_off_type VARCHAR(100),
shape_dist_traveled NUMERIC,
timepoint NUMERIC(1)
);
-- Shape points (no primary key); coordinates use unconstrained NUMERIC.
create table shapes (
    shape_id             varchar(100),
    shape_pt_lat         numeric,
    shape_pt_lon         numeric,
    shape_pt_sequence    numeric(6),
    shape_dist_traveled  numeric
);
-- Service exceptions for individual dates (no primary key).
create table calendar_dates (
    service_id      varchar(100),
    date            date,
    exception_type  varchar(10)
);
-- Weekly service pattern: one single-digit numeric column per weekday plus
-- the date range it applies to (no primary key).
create table calendar (
    service_id  varchar(100),
    monday      numeric(1),
    tuesday     numeric(1),
    wednesday   numeric(1),
    thursday    numeric(1),
    friday      numeric(1),
    saturday    numeric(1),
    sunday      numeric(1),
    start_date  date,
    end_date    date
);
-- Transfers between two stops (no primary key); both stop columns have the
-- same width as STOPS.stop_id.
create table transfers (
    from_stop_id       varchar(100),
    to_stop_id         varchar(100),
    transfer_type      numeric(3),
    min_transfer_time  numeric(6)
);
-- Trigger function: when a row arrives without an agency_id, derive one from
-- the first three characters of agency_name. A non-NULL agency_id is kept.
drop function if exists update_agency_id();
create function update_agency_id() returns trigger as $$
begin
    if new.agency_id is null then
        new.agency_id := substr(new.agency_name, 1, 3);
    end if;
    return new;
end;
$$ language plpgsql;
-- Run update_agency_id() for each agency row before it is inserted or updated.
create trigger agency_id_trigger
    before insert or update on agency
    for each row
    execute procedure update_agency_id();
# Bundler manifest for the loader script.
source "https://rubygems.org"

# Ruby version Bundler checks for.
ruby "2.3.0"

# Default-group gems.
gem "sequel"
gem "pg"
gem "rubyzip"

# Gems that belong to both the development and the test group.
gem "rake", group: %i[development test]
gem "pry-byebug", group: %i[development test]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment