Created
April 7, 2016 20:53
-
-
Save tbueno/8bf7ed913edbacc758e51bc6ade79a2f to your computer and use it in GitHub Desktop.
unique id approach
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
require "csv" | |
require "fileutils" | |
require "tempfile" | |
require "sequel" | |
# Names of the GTFS tables this script manages. Each one has a "<name>_temp"
# staging twin while a load is running. Frozen so the shared list cannot be
# mutated by accident.
GTFS_TABLES = %w[agency routes stops trips stop_times shapes calendar calendar_dates transfers].freeze
# Bulk-loads one GTFS text file from a dataset folder into its "<name>_temp"
# staging table through the database's COPY support (`copy_into`).
class DataLoader
  # @param db [#copy_into] database handle (a Sequel database in this script)
  # @param folder [String] directory containing the dataset's "*.txt" files
  def initialize(db, folder)
    @folder = folder
    @db = db
  end

  # Copies "<folder>/<file_name>.txt" into the "<file_name>_temp" table.
  #
  # The column list is taken from the file's header row, so files may carry
  # any subset of the table's columns in any order.
  #
  # @param file_name [String] table / file base name, e.g. "routes"
  # @return [Object, nil] whatever `copy_into` returns, or nil when the file
  #   is missing or has no header row (nothing is copied in either case)
  # @raise [StandardError] re-raises any failure from `copy_into` after
  #   printing the offending folder
  def load_csv(file_name)
    csv_file = File.join(@folder, "#{file_name}.txt")
    return unless File.exist?(csv_file)

    # "bom|utf-8" drops a leading byte-order mark so the first header is clean.
    headers = CSV.open(csv_file, "r:bom|utf-8", &:first)
    # A zero-byte file yields no header row; there is nothing to copy.
    return if headers.nil? || headers.empty?

    begin
      @db.copy_into(:"#{file_name}_temp",
                    columns: column_identifiers(headers),
                    data: File.read(csv_file),
                    options: "FORMAT csv, HEADER TRUE")
    rescue StandardError
      puts "ERROR parsing #{@folder}"
      raise
    end
  end

  private

  # Turns raw header cells into column symbols so the database layer quotes
  # them as identifiers instead of receiving file content as raw SQL.
  # Names are stripped and down-cased to mirror how PostgreSQL folds the
  # unquoted names the previous raw column list relied on.
  def column_identifiers(headers)
    headers.map { |header| header.to_s.strip.downcase.to_sym }
  end
end
# Rebuilds the archive database from a tree of GTFS datasets laid out as
# "<sources>/<region>/<dataset>/*.txt".
#
# Every dataset is staged in "<table>_temp" tables, its identifiers are
# prefixed with "<agency_id>:" so they stay unique across datasets, and the
# rows are then appended to the real tables.
class Generator
  # Tables loaded for each dataset, in load order, mapped to the ID columns
  # that receive the "<agency_id>:" prefix.
  #
  # "stop_times" is deliberately absent: its load was disabled in the original
  # script, so the table is truncated and staged but never filled.
  # NOTE: trips.service_id is prefixed like calendar/calendar_dates so trips
  # keep matching their calendars; the column must be wide enough for it.
  PREFIXED_ID_COLUMNS = {
    "agency" => [],
    "routes" => %w[route_id],
    "stops" => %w[stop_id parent_station],
    "trips" => %w[trip_id route_id service_id shape_id],
    "calendar" => %w[service_id],
    "calendar_dates" => %w[service_id],
    "shapes" => %w[shape_id],
    "transfers" => %w[from_stop_id to_stop_id]
  }.freeze

  # @param db [#run, #literal] database handle (a Sequel database in this script)
  def initialize(db)
    @db = db
  end

  # Empties every GTFS table and reloads it from +sources_folder+.
  #
  # The path is checked before anything is truncated, so a missing argument
  # can no longer wipe the database.
  #
  # @param sources_folder [String] root folder holding one folder per region
  # @raise [RuntimeError] when +sources_folder+ is nil or empty
  def run!(sources_folder)
    ensure_source_path!(sources_folder)
    empty_database!
    puts "Archiving content...."
    load_data!(sources_folder)
  end

  # Truncates every table listed in GTFS_TABLES.
  def empty_database!
    GTFS_TABLES.each { |table| @db.run "TRUNCATE #{table};" }
  end

  # Loads every dataset found under +region_folder+ through freshly created
  # staging tables, which are dropped again once all regions are done.
  #
  # @param region_folder [String] root folder holding one folder per region
  # @raise [RuntimeError] when +region_folder+ is nil or empty
  def load_data!(region_folder)
    ensure_source_path!(region_folder)
    # NOTE(review): nothing visible in this file writes into tmp/; kept as-is.
    FileUtils.mkdir_p "tmp"
    # Drop first so leftovers from an aborted run cannot break the CREATE.
    drop_temp_tables
    create_temp_tables
    Dir["#{region_folder}/*"].each { |folder| load_region(folder) }
    drop_temp_tables
    puts "Done."
  end

  private

  # Fails fast, for nil as well as "", before any destructive step runs.
  def ensure_source_path!(path)
    raise "Please inform an source path" if path.to_s.empty?
  end

  # Creates an empty "<table>_temp" clone of every GTFS table.
  def create_temp_tables
    GTFS_TABLES.each do |table|
      @db.run "CREATE TABLE #{table}_temp (LIKE #{table} INCLUDING ALL)"
    end
  end

  def drop_temp_tables
    GTFS_TABLES.each { |table| @db.run "DROP TABLE IF EXISTS #{table}_temp" }
  end

  def truncate_temp_tables
    GTFS_TABLES.each { |table| @db.run "TRUNCATE #{table}_temp" }
  end

  # Prints the region's folder name and loads each dataset inside it.
  def load_region(region)
    puts region.split("/").last
    Dir["#{region}/*"].each { |dataset| insert_data(dataset) }
  end

  # Runs once for every dataset folder inside a region: stage each file,
  # prefix its IDs, append it to the real table, then clear the staging tables.
  def insert_data(folder)
    puts "\t #{folder}"
    loader = DataLoader.new(@db, folder)
    # Quote the prefix once through the database so an apostrophe in the
    # agency id/name cannot break (or inject into) the UPDATE statements.
    prefix = @db.literal("#{dataset_agency_id(folder)}:")

    PREFIXED_ID_COLUMNS.each do |table, id_columns|
      temp_table = "#{table}_temp"
      loader.load_csv(table)
      prefix_ids(temp_table, id_columns, prefix)
      @db.run "INSERT INTO #{table} SELECT * FROM #{temp_table}"
    end

    truncate_temp_tables
  end

  # Identifier used to namespace the dataset: the first agency row's
  # agency_id, or the first three characters of its agency_name.
  def dataset_agency_id(folder)
    agency_file = File.join(folder, "agency.txt")
    agency = CSV.open(agency_file, "r:bom|utf-8", headers: true, &:first)
    raise "No agency row found in #{agency_file}" if agency.nil?

    agency_id = agency["agency_id"]
    return agency_id if agency_id

    agency_name = agency["agency_name"]
    raise "Neither agency_id nor agency_name is set in #{agency_file}" if agency_name.nil?

    agency_name[0, 3]
  end

  # Prepends the already-quoted +prefix+ literal to every listed column.
  # NULL values stay NULL because `'x' || NULL` is NULL in SQL.
  def prefix_ids(temp_table, id_columns, prefix)
    return if id_columns.empty?

    assignments = id_columns.map { |column| "#{column}=#{prefix} || #{column}" }
    @db.run "UPDATE #{temp_table} SET #{assignments.join(', ')}"
  end
end
# Entry point: rebuild the archive from the folder given as first argument.
# The connection string can be overridden through GTFS_ARCHIVE_DATABASE_URL;
# without it the original local "archive" database is used.
database_url = ENV.fetch("GTFS_ARCHIVE_DATABASE_URL", "postgres://localhost/archive")
Generator.new(Sequel.connect(database_url)).run!(ARGV[0])
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Reset step: remove any previous copy of the GTFS tables.
-- IF EXISTS lets the script run on a fresh database too, where a plain
-- DROP TABLE would raise an error for every table that is not there yet.
DROP TABLE IF EXISTS AGENCY;
DROP TABLE IF EXISTS ROUTES;
DROP TABLE IF EXISTS STOPS;
DROP TABLE IF EXISTS TRIPS;
DROP TABLE IF EXISTS STOP_TIMES;
DROP TABLE IF EXISTS SHAPES;
DROP TABLE IF EXISTS CALENDAR;
DROP TABLE IF EXISTS CALENDAR_DATES;
DROP TABLE IF EXISTS TRANSFERS;
-- One row per agency. agency_id is UNIQUE rather than a primary key, so rows
-- may arrive without it; the agency_id_trigger defined in this script fills
-- a missing value from agency_name.
create table agency (
    agency_id       varchar(50) unique,
    agency_name     varchar(80),
    agency_url      varchar(100),
    agency_timezone varchar(50),
    agency_lang     varchar(2),
    agency_fare_url varchar(100),
    agency_phone    varchar(20)
);
-- One row per route, keyed by route_id.
create table routes (
    route_id         varchar(100) primary key,
    agency_id        varchar(50),
    route_short_name varchar(20),
    route_long_name  varchar(100),
    route_desc       varchar(100),
    route_type       numeric(4),
    route_url        varchar(100),
    route_color      varchar(8),
    route_text_color varchar(8)
);
-- One row per stop or station, keyed by stop_id.
-- parent_station has the same width as stop_id so it can hold another stop's id.
create table stops (
    stop_id             varchar(100) primary key,
    stop_code           varchar(10),
    stop_name           varchar(100),
    stop_desc           varchar(100),
    stop_lat            numeric(38,8),
    stop_lon            numeric(38,8),
    zone_id             varchar(5),
    stop_url            varchar(100),
    location_type       numeric(2),
    stop_timezone       varchar(50),
    parent_station      varchar(100),
    wheelchair_boarding numeric(1)
);
-- One row per trip, keyed by trip_id.
CREATE TABLE TRIPS
(
trip_id VARCHAR(100) PRIMARY KEY,
route_id VARCHAR(100),
-- Same width as CALENDAR.service_id / CALENDAR_DATES.service_id, so any
-- service id those tables accept also fits here.
service_id VARCHAR(100),
trip_headsign VARCHAR(50),
trip_short_name VARCHAR(30),
direction_id NUMERIC(2),
-- block_id is an identifier in GTFS, not a number; a NUMERIC column
-- rejects feeds whose block ids contain letters.
block_id VARCHAR(100),
shape_id VARCHAR(100),
wheelchair_accessible NUMERIC(1),
bikes_allowed NUMERIC(1)
);
-- One row per (trip, stop) visit; no key is declared on this table.
CREATE TABLE STOP_TIMES
(
-- Same width as TRIPS.trip_id so every trip id stored there fits here too.
trip_id VARCHAR(100),
-- Times are kept as text rather than TIME.
-- NOTE(review): presumably because GTFS allows values past 24:00:00 - confirm.
arrival_time VARCHAR(10),
departure_time VARCHAR(10),
stop_id VARCHAR(100),
stop_sequence NUMERIC(10),
stop_headsign VARCHAR(30),
pickup_type VARCHAR(100),
drop_off_type VARCHAR(100),
shape_dist_traveled NUMERIC,
timepoint NUMERIC(1)
);
-- One row per shape point; a shape is the set of rows sharing a shape_id.
-- No key is declared on this table.
create table shapes (
    shape_id            varchar(100),
    shape_pt_lat        numeric,
    shape_pt_lon        numeric,
    shape_pt_sequence   numeric(6),
    shape_dist_traveled numeric
);
-- Per-date service exceptions; no key is declared on this table.
create table calendar_dates (
    service_id     varchar(100),
    date           date,
    exception_type varchar(10)
);
-- Weekly service pattern per service_id, valid from start_date to end_date.
-- Each weekday column is a single-digit flag.
create table calendar (
    service_id varchar(100),
    monday     numeric(1),
    tuesday    numeric(1),
    wednesday  numeric(1),
    thursday   numeric(1),
    friday     numeric(1),
    saturday   numeric(1),
    sunday     numeric(1),
    start_date date,
    end_date   date
);
-- Transfer rules between two stops; both id columns share the width of
-- stops.stop_id. No key is declared on this table.
create table transfers (
    from_stop_id      varchar(100),
    to_stop_id        varchar(100),
    transfer_type     numeric(3),
    min_transfer_time numeric(6)
);
-- Make sure every agency row ends up with an agency_id: when a row is
-- inserted or updated without one, fall back to the first three characters
-- of its agency_name.
drop function if exists update_agency_id();

create function update_agency_id() returns trigger as $$
begin
    -- Only touch rows that arrive without an id; existing ids are kept.
    if new.agency_id is null then
        new.agency_id := substring(new.agency_name from 1 for 3);
    end if;
    return new;
end;
$$ language plpgsql;

-- Row-level BEFORE trigger, so the adjusted row is what gets stored.
create trigger agency_id_trigger
    before insert or update on agency
    for each row
    execute procedure update_agency_id();
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Gem dependencies for the GTFS archive loader.
source "https://rubygems.org"

ruby "2.3.0"

# Runtime gems.
gem "sequel"
gem "pg"
gem "rubyzip"

# Tooling needed only while developing or testing.
gem "rake", group: %i[development test]
gem "pry-byebug", group: %i[development test]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment