tbueno · April 7, 2016 20:53
diff --git a/archive.rb b/archive.rb
 require "csv"
 require "fileutils"
 require "tempfile"
 require "sequel"

 # Names of the GTFS tables managed by this script. Each one has a permanent
 # table and a "<name>_temp" staging table. Frozen so the shared list cannot
 # be mutated by accident at runtime.
 GTFS_TABLES = %w(agency routes stops trips stop_times shapes calendar calendar_dates transfers).freeze

 # Bulk-loads one GTFS text file from a dataset folder into its "<name>_temp"
 # staging table through the database's copy_into (PostgreSQL COPY).
 class DataLoader
  # db     - database handle; must respond to copy_into(table, opts).
  # folder - path of the directory that holds the "<table>.txt" files.
  def initialize(db, folder)
    @folder = folder
    @db = db
  end

  # Copies "<folder>/<file_name>.txt" into the table "<file_name>_temp".
  #
  # The column list is taken from the file's header row, so a file may omit
  # columns or order them differently from the table definition.
  #
  # Returns nil without touching the database when the file does not exist or
  # has no header row (empty file). Any StandardError raised while copying is
  # re-raised after printing which folder failed.
  def load_csv(file_name)
    csv_file = File.join(@folder, "#{file_name}.txt")
    return unless File.exist?(csv_file)

    # "bom|utf-8" drops a leading byte-order mark so it cannot leak into the
    # first column name.
    headers = CSV.open(csv_file, "r:bom|utf-8", &:first)
    # An empty file yields no header row, so there is nothing to copy.
    return if headers.nil? || headers.empty?

    begin
      # Pass the open file so the rows are streamed line by line instead of
      # holding the whole file in memory.
      File.open(csv_file) do |io|
        @db.copy_into("#{file_name}_temp".to_sym,
                      columns: Sequel.lit(headers.join(",")),
                      data: io,
                      options: "FORMAT csv, HEADER TRUE")
      end
    rescue StandardError
      puts "ERROR parsing #{@folder}"
      raise
    end
  end
 end

 # Rebuilds the archive database from a tree of extracted GTFS feeds laid out
 # as <sources_folder>/<region>/<dataset>/<table>.txt.
 #
 # Each dataset is staged in the "<table>_temp" tables, its id columns are
 # prefixed with "<agency_id>:" (presumably so ids from different feeds cannot
 # collide), and the staged rows are then appended to the permanent tables.
 class Generator
  # Tables loaded for every dataset, in load order, mapped to the id columns
  # that receive the "<agency_id>:" prefix.
  #
  # trips.service_id is prefixed as well: calendar and calendar_dates prefix
  # their service_id, so leaving it bare would stop trips from matching them.
  #
  # stop_times is deliberately absent; its load step was disabled in the
  # previous implementation. To enable it, add
  #   "stop_times" => %w(trip_id stop_id)
  # after the "trips" entry.
  ID_COLUMNS = {
    "agency" => [],
    "routes" => %w(route_id),
    "stops" => %w(stop_id parent_station),
    "trips" => %w(trip_id route_id service_id shape_id),
    "calendar" => %w(service_id),
    "calendar_dates" => %w(service_id),
    "shapes" => %w(shape_id),
    "transfers" => %w(from_stop_id to_stop_id)
  }.freeze

  # db - database handle; must respond to run(sql) and literal(value), and is
  #      handed to DataLoader for the bulk copies.
  def initialize(db)
    @db = db
  end

  # Empties every GTFS table and reloads it from sources_folder.
  #
  # Raises RuntimeError *before* truncating anything when no source folder
  # was given, so a missing argument can no longer wipe the database.
  def run!(sources_folder)
    ensure_source_given(sources_folder)

    empty_database!
    puts "Archiving content...."
    load_data!(sources_folder)
  end

  # Truncates all permanent GTFS tables.
  def empty_database!
    GTFS_TABLES.each do |table|
      @db.run "TRUNCATE #{table};"
    end
  end

  # Loads every dataset of every region directory found under region_folder.
  # The staging tables are recreated first and dropped again at the end.
  #
  # Raises RuntimeError when region_folder is nil or empty.
  def load_data!(region_folder)
    ensure_source_given(region_folder)

    # NOTE(review): nothing in this class reads ./tmp; presumably another
    # step relies on it - confirm before removing.
    FileUtils.mkdir_p "tmp"

    drop_temp_tables
    create_temp_tables

    Dir["#{region_folder}/*"].each do |folder|
      load_region(folder)
    end

    drop_temp_tables
    puts "Done."
  end

  private

  # Rejects a missing (nil) or empty source path.
  def ensure_source_given(path)
    raise "Please inform an source path" if path.nil? || path.empty?
  end

  # Name of the staging table that belongs to a permanent table.
  def temp_name(table)
    "#{table}_temp"
  end

  # Creates one staging table per GTFS table, cloned from the permanent one.
  def create_temp_tables
    GTFS_TABLES.each do |table|
      @db.run "CREATE TABLE #{temp_name(table)} (LIKE #{table} INCLUDING ALL)"
    end
  end

  # Drops every staging table; safe to call when they do not exist.
  def drop_temp_tables
    GTFS_TABLES.each do |table|
      @db.run "DROP TABLE IF EXISTS #{temp_name(table)}"
    end
  end

  # Empties every staging table so the next dataset starts clean.
  def truncate_temp_tables
    GTFS_TABLES.each do |table|
      @db.run "TRUNCATE #{temp_name(table)}"
    end
  end

  # Prints the region's directory name and loads each entry found inside it.
  def load_region(region)
    puts File.basename(region)
    Dir["#{region}/*"].each do |dataset|
      insert_data(dataset)
    end
  end

  # Runs once for every dataset folder inside a region: stages each table,
  # prefixes its ids and appends the result to the permanent table.
  def insert_data(folder)
    puts "\t #{folder}"
    loader = DataLoader.new(@db, folder)
    # Quote the prefix through the database so an agency id that contains a
    # quote character cannot break (or inject into) the UPDATE statements.
    prefix = @db.literal("#{agency_id_for(folder)}:")

    ID_COLUMNS.each do |table, columns|
      loader.load_csv(table)
      prefix_ids(temp_name(table), columns, prefix)
      @db.run "INSERT INTO #{table} SELECT * FROM #{temp_name(table)}"
    end

    truncate_temp_tables
  end

  # Returns the id used to prefix a dataset's ids: the agency_id of the first
  # row of agency.txt or, when that is missing, the first three characters of
  # its agency_name. Only the first agency row is consulted.
  #
  # Raises RuntimeError when agency.txt has no data row or the row has
  # neither value.
  def agency_id_for(folder)
    agency = CSV.open(File.join(folder, "agency.txt"), "r:bom|utf-8", headers: true, &:first)
    raise "No agency row found in #{folder}/agency.txt" if agency.nil?

    agency_id = agency["agency_id"]
    return agency_id if agency_id

    agency_name = agency["agency_name"]
    raise "Agency in #{folder}/agency.txt has neither agency_id nor agency_name" if agency_name.nil?

    agency_name[0, 3]
  end

  # Prepends the already-quoted SQL literal `prefix` to each listed column of
  # temp_table. Does nothing when there are no columns to rewrite.
  def prefix_ids(temp_table, columns, prefix)
    return if columns.empty?

    assignments = columns.map { |column| "#{column} = #{prefix} || #{column}" }.join(", ")
    @db.run "UPDATE #{temp_table} SET #{assignments}"
  end
 end

 # Script entry point: connect to the local "archive" PostgreSQL database and
 # rebuild it from the source folder given as the first command-line argument.
 Generator.new(Sequel.connect("postgres://localhost/archive")).run!(ARGV[0])
diff --git a/create_tables.sql b/create_tables.sql
 -- Remove any previous version of the schema. IF EXISTS lets the script run
 -- on a fresh database where the tables have not been created yet (a plain
 -- DROP TABLE raises an error for a missing table).
 DROP TABLE IF EXISTS AGENCY;
 DROP TABLE IF EXISTS ROUTES;
 DROP TABLE IF EXISTS STOPS;
 DROP TABLE IF EXISTS TRIPS;

 DROP TABLE IF EXISTS STOP_TIMES;
 DROP TABLE IF EXISTS SHAPES;
 DROP TABLE IF EXISTS CALENDAR;
 DROP TABLE IF EXISTS CALENDAR_DATES;
 DROP TABLE IF EXISTS TRANSFERS;


 -- Transit agencies, loaded by archive.rb from each dataset's agency.txt.
 -- agency_id is UNIQUE but nullable; the agency_id_trigger defined at the end
 -- of this script fills a NULL agency_id from agency_name.
 CREATE TABLE AGENCY
 (
  agency_id VARCHAR(50) UNIQUE,
  agency_name VARCHAR(80),
  agency_url VARCHAR(100),
  agency_timezone VARCHAR(50),
  agency_lang VARCHAR(2),
  agency_fare_url VARCHAR(100),
  agency_phone VARCHAR(20)
 );


 -- Routes, loaded from routes.txt. archive.rb stores route_id with an
 -- "<agency_id>:" prefix, hence the wider VARCHAR(100) key.
 CREATE TABLE ROUTES
 (
  route_id VARCHAR(100) PRIMARY KEY,
  agency_id VARCHAR(50),
  route_short_name VARCHAR(20),
  route_long_name VARCHAR(100),
  route_desc VARCHAR(100),
  route_type NUMERIC(4),
  route_url VARCHAR(100),
  route_color VARCHAR(8),
  route_text_color VARCHAR(8)
 );


 -- Stops and stations, loaded from stops.txt. archive.rb stores stop_id and
 -- parent_station with an "<agency_id>:" prefix.
 CREATE TABLE STOPS
 (
  stop_id VARCHAR(100) PRIMARY KEY,
  stop_code VARCHAR(10),
  stop_name VARCHAR(100),
  stop_desc VARCHAR(100),
  stop_lat NUMERIC(38,8),
  stop_lon NUMERIC(38,8),
  zone_id VARCHAR(5),
  stop_url VARCHAR(100),
  location_type NUMERIC(2),
  stop_timezone VARCHAR(50),
  parent_station VARCHAR(100),
  wheelchair_boarding NUMERIC(1)
 );

 -- Trips, loaded from trips.txt. archive.rb stores trip_id, route_id and
 -- shape_id with an "<agency_id>:" prefix.
 -- service_id is VARCHAR(100) so it has the same width as the service_id
 -- columns of CALENDAR and CALENDAR_DATES that it refers to (it used to be
 -- VARCHAR(50), which could not hold every value those tables accept).
 CREATE TABLE TRIPS
 (
  trip_id VARCHAR(100) PRIMARY KEY,
  route_id VARCHAR(100),
  service_id VARCHAR(100),
  trip_headsign VARCHAR(50),
  trip_short_name VARCHAR(30),
  direction_id NUMERIC(2),
  block_id NUMERIC(10),
  shape_id VARCHAR(100),
  wheelchair_accessible NUMERIC(1),
  bikes_allowed NUMERIC(1)
 );


 -- Stop times (stop_times.txt layout). The load step for this table is
 -- currently commented out in archive.rb.
 -- trip_id is VARCHAR(100) to match TRIPS.trip_id, which it refers to (it
 -- used to be VARCHAR(50), too narrow for every id TRIPS can hold).
 CREATE TABLE STOP_TIMES
 (
  trip_id VARCHAR(100),
  arrival_time VARCHAR(10),
  departure_time VARCHAR(10),
  stop_id VARCHAR(100),
  stop_sequence NUMERIC(10),
  stop_headsign VARCHAR(30),
  pickup_type VARCHAR(100),
  drop_off_type VARCHAR(100),
  shape_dist_traveled NUMERIC,
  timepoint NUMERIC(1)
 );


 -- Shape points, loaded from shapes.txt. archive.rb stores shape_id with an
 -- "<agency_id>:" prefix. No primary key is declared on this table.
 CREATE TABLE SHAPES
 (
  shape_id VARCHAR(100),
  shape_pt_lat NUMERIC,
  shape_pt_lon NUMERIC,
  shape_pt_sequence NUMERIC(6),
  shape_dist_traveled NUMERIC
 );


 -- Service exceptions, loaded from calendar_dates.txt. archive.rb stores
 -- service_id with an "<agency_id>:" prefix.
 CREATE TABLE CALENDAR_DATES
 (
  service_id VARCHAR(100),
  date DATE,
  exception_type VARCHAR(10)
 );


 -- Weekly service patterns, loaded from calendar.txt. archive.rb stores
 -- service_id with an "<agency_id>:" prefix.
 CREATE TABLE CALENDAR
 (
  service_id VARCHAR(100),
  monday NUMERIC(1),
  tuesday NUMERIC(1),
  wednesday NUMERIC(1),
  thursday NUMERIC(1),
  friday NUMERIC(1),
  saturday NUMERIC(1),
  sunday NUMERIC(1),
  start_date DATE,
  end_date DATE
 );


 -- Transfers between stops, loaded from transfers.txt. archive.rb stores
 -- from_stop_id and to_stop_id with an "<agency_id>:" prefix.
 CREATE TABLE TRANSFERS
 (
  from_stop_id VARCHAR(100),
  to_stop_id VARCHAR(100),
  transfer_type NUMERIC(3),
  min_transfer_time NUMERIC(6)
 );



 -- Guarantee agency_id is set
 -- Row-level trigger function: when the incoming row has a NULL agency_id it
 -- is replaced by the first three characters of agency_name. This mirrors the
 -- fallback archive.rb uses when it builds the "<agency_id>:" id prefix.
 DROP FUNCTION IF EXISTS update_agency_id();
 CREATE FUNCTION update_agency_id() RETURNS trigger AS $$
 BEGIN
    NEW.agency_id := COALESCE(NEW.agency_id, substring(NEW.agency_name from 1 for 3));
    RETURN NEW;
 END;
 $$ LANGUAGE plpgsql;
 -- Run the function for every row inserted into or updated in agency.
 CREATE TRIGGER agency_id_trigger
  BEFORE INSERT OR UPDATE
  ON agency
  FOR EACH ROW
  EXECUTE PROCEDURE update_agency_id();
diff --git a/Gemfile b/Gemfile
 # Bundler manifest for the archive script.
 source "https://rubygems.org"

 # Ruby version the project is pinned to.
 ruby "2.3.0"

 # Database toolkit required by archive.rb.
 gem "sequel"
 # PostgreSQL driver; archive.rb connects to a postgres:// URL.
 gem "pg"
 # NOTE(review): not referenced by archive.rb; presumably used by a separate
 # step that unpacks the feed zip files - confirm.
 gem "rubyzip"

 # Tooling that is only needed while developing or testing.
 group :development, :test do
  gem "rake"
  gem "pry-byebug"
 end
	require "csv"
	require "fileutils"
	require "tempfile"
	require "sequel"

	# Names of the GTFS tables managed by this script. Each one has a permanent
	# table and a "<name>_temp" staging table. Frozen so the shared list cannot
	# be mutated by accident at runtime.
	GTFS_TABLES = %w(agency routes stops trips stop_times shapes calendar calendar_dates transfers).freeze

	# Bulk-loads one GTFS text file from a dataset folder into its "<name>_temp"
	# staging table through the database's copy_into (PostgreSQL COPY).
	class DataLoader
	  # db     - database handle; must respond to copy_into(table, opts).
	  # folder - path of the directory that holds the "<table>.txt" files.
	  def initialize(db, folder)
	    @folder = folder
	    @db = db
	  end

	  # Copies "<folder>/<file_name>.txt" into the table "<file_name>_temp".
	  #
	  # The column list is taken from the file's header row, so a file may omit
	  # columns or order them differently from the table definition.
	  #
	  # Returns nil without touching the database when the file does not exist
	  # or has no header row (empty file). Any StandardError raised while
	  # copying is re-raised after printing which folder failed.
	  def load_csv(file_name)
	    csv_file = File.join(@folder, "#{file_name}.txt")
	    return unless File.exist?(csv_file)

	    # "bom|utf-8" drops a leading byte-order mark so it cannot leak into
	    # the first column name.
	    headers = CSV.open(csv_file, "r:bom|utf-8", &:first)
	    # An empty file yields no header row, so there is nothing to copy.
	    return if headers.nil? || headers.empty?

	    begin
	      # Pass the open file so the rows are streamed line by line instead
	      # of holding the whole file in memory.
	      File.open(csv_file) do |io|
	        @db.copy_into("#{file_name}_temp".to_sym,
	                      columns: Sequel.lit(headers.join(",")),
	                      data: io,
	                      options: "FORMAT csv, HEADER TRUE")
	      end
	    rescue StandardError
	      puts "ERROR parsing #{@folder}"
	      raise
	    end
	  end
	end

	# Rebuilds the archive database from a tree of extracted GTFS feeds laid out
	# as <sources_folder>/<region>/<dataset>/<table>.txt.
	#
	# Each dataset is staged in the "<table>_temp" tables, its id columns are
	# prefixed with "<agency_id>:" (presumably so ids from different feeds
	# cannot collide), and the staged rows are then appended to the permanent
	# tables.
	class Generator
	  # Tables loaded for every dataset, in load order, mapped to the id
	  # columns that receive the "<agency_id>:" prefix.
	  #
	  # trips.service_id is prefixed as well: calendar and calendar_dates
	  # prefix their service_id, so leaving it bare would stop trips from
	  # matching them.
	  #
	  # stop_times is deliberately absent; its load step was disabled in the
	  # previous implementation. To enable it, add
	  #   "stop_times" => %w(trip_id stop_id)
	  # after the "trips" entry.
	  ID_COLUMNS = {
	    "agency" => [],
	    "routes" => %w(route_id),
	    "stops" => %w(stop_id parent_station),
	    "trips" => %w(trip_id route_id service_id shape_id),
	    "calendar" => %w(service_id),
	    "calendar_dates" => %w(service_id),
	    "shapes" => %w(shape_id),
	    "transfers" => %w(from_stop_id to_stop_id)
	  }.freeze

	  # db - database handle; must respond to run(sql) and literal(value), and
	  #      is handed to DataLoader for the bulk copies.
	  def initialize(db)
	    @db = db
	  end

	  # Empties every GTFS table and reloads it from sources_folder.
	  #
	  # Raises RuntimeError *before* truncating anything when no source folder
	  # was given, so a missing argument can no longer wipe the database.
	  def run!(sources_folder)
	    ensure_source_given(sources_folder)

	    empty_database!
	    puts "Archiving content...."
	    load_data!(sources_folder)
	  end

	  # Truncates all permanent GTFS tables.
	  def empty_database!
	    GTFS_TABLES.each do |table|
	      @db.run "TRUNCATE #{table};"
	    end
	  end

	  # Loads every dataset of every region directory found under
	  # region_folder. The staging tables are recreated first and dropped
	  # again at the end.
	  #
	  # Raises RuntimeError when region_folder is nil or empty.
	  def load_data!(region_folder)
	    ensure_source_given(region_folder)

	    # NOTE(review): nothing in this class reads ./tmp; presumably another
	    # step relies on it - confirm before removing.
	    FileUtils.mkdir_p "tmp"

	    drop_temp_tables
	    create_temp_tables

	    Dir["#{region_folder}/*"].each do |folder|
	      load_region(folder)
	    end

	    drop_temp_tables
	    puts "Done."
	  end

	  private

	  # Rejects a missing (nil) or empty source path.
	  def ensure_source_given(path)
	    raise "Please inform an source path" if path.nil? || path.empty?
	  end

	  # Name of the staging table that belongs to a permanent table.
	  def temp_name(table)
	    "#{table}_temp"
	  end

	  # Creates one staging table per GTFS table, cloned from the permanent one.
	  def create_temp_tables
	    GTFS_TABLES.each do |table|
	      @db.run "CREATE TABLE #{temp_name(table)} (LIKE #{table} INCLUDING ALL)"
	    end
	  end

	  # Drops every staging table; safe to call when they do not exist.
	  def drop_temp_tables
	    GTFS_TABLES.each do |table|
	      @db.run "DROP TABLE IF EXISTS #{temp_name(table)}"
	    end
	  end

	  # Empties every staging table so the next dataset starts clean.
	  def truncate_temp_tables
	    GTFS_TABLES.each do |table|
	      @db.run "TRUNCATE #{temp_name(table)}"
	    end
	  end

	  # Prints the region's directory name and loads each entry found inside it.
	  def load_region(region)
	    puts File.basename(region)
	    Dir["#{region}/*"].each do |dataset|
	      insert_data(dataset)
	    end
	  end

	  # Runs once for every dataset folder inside a region: stages each table,
	  # prefixes its ids and appends the result to the permanent table.
	  def insert_data(folder)
	    puts "\t #{folder}"
	    loader = DataLoader.new(@db, folder)
	    # Quote the prefix through the database so an agency id that contains
	    # a quote character cannot break (or inject into) the UPDATE statements.
	    prefix = @db.literal("#{agency_id_for(folder)}:")

	    ID_COLUMNS.each do |table, columns|
	      loader.load_csv(table)
	      prefix_ids(temp_name(table), columns, prefix)
	      @db.run "INSERT INTO #{table} SELECT * FROM #{temp_name(table)}"
	    end

	    truncate_temp_tables
	  end

	  # Returns the id used to prefix a dataset's ids: the agency_id of the
	  # first row of agency.txt or, when that is missing, the first three
	  # characters of its agency_name. Only the first agency row is consulted.
	  #
	  # Raises RuntimeError when agency.txt has no data row or the row has
	  # neither value.
	  def agency_id_for(folder)
	    agency = CSV.open(File.join(folder, "agency.txt"), "r:bom|utf-8", headers: true, &:first)
	    raise "No agency row found in #{folder}/agency.txt" if agency.nil?

	    agency_id = agency["agency_id"]
	    return agency_id if agency_id

	    agency_name = agency["agency_name"]
	    raise "Agency in #{folder}/agency.txt has neither agency_id nor agency_name" if agency_name.nil?

	    agency_name[0, 3]
	  end

	  # Prepends the already-quoted SQL literal `prefix` to each listed column
	  # of temp_table. Does nothing when there are no columns to rewrite.
	  def prefix_ids(temp_table, columns, prefix)
	    return if columns.empty?

	    assignments = columns.map { |column| "#{column} = #{prefix} || #{column}" }.join(", ")
	    @db.run "UPDATE #{temp_table} SET #{assignments}"
	  end
	end

	# Script entry point: connect to the local "archive" PostgreSQL database and
	# rebuild it from the source folder given as the first command-line argument.
	Generator.new(Sequel.connect("postgres://localhost/archive")).run!(ARGV[0])
	-- Remove any previous version of the schema. IF EXISTS lets the script run
	-- on a fresh database where the tables have not been created yet (a plain
	-- DROP TABLE raises an error for a missing table).
	DROP TABLE IF EXISTS AGENCY;
	DROP TABLE IF EXISTS ROUTES;
	DROP TABLE IF EXISTS STOPS;
	DROP TABLE IF EXISTS TRIPS;

	DROP TABLE IF EXISTS STOP_TIMES;
	DROP TABLE IF EXISTS SHAPES;
	DROP TABLE IF EXISTS CALENDAR;
	DROP TABLE IF EXISTS CALENDAR_DATES;
	DROP TABLE IF EXISTS TRANSFERS;


	-- Transit agencies, loaded by archive.rb from each dataset's agency.txt.
	-- agency_id is UNIQUE but nullable; the agency_id_trigger defined at the end
	-- of this script fills a NULL agency_id from agency_name.
	CREATE TABLE AGENCY
	(
	agency_id VARCHAR(50) UNIQUE,
	agency_name VARCHAR(80),
	agency_url VARCHAR(100),
	agency_timezone VARCHAR(50),
	agency_lang VARCHAR(2),
	agency_fare_url VARCHAR(100),
	agency_phone VARCHAR(20)
	);


	-- Routes, loaded from routes.txt. archive.rb stores route_id with an
	-- "<agency_id>:" prefix, hence the wider VARCHAR(100) key.
	CREATE TABLE ROUTES
	(
	route_id VARCHAR(100) PRIMARY KEY,
	agency_id VARCHAR(50),
	route_short_name VARCHAR(20),
	route_long_name VARCHAR(100),
	route_desc VARCHAR(100),
	route_type NUMERIC(4),
	route_url VARCHAR(100),
	route_color VARCHAR(8),
	route_text_color VARCHAR(8)
	);


	-- Stops and stations, loaded from stops.txt. archive.rb stores stop_id and
	-- parent_station with an "<agency_id>:" prefix.
	CREATE TABLE STOPS
	(
	stop_id VARCHAR(100) PRIMARY KEY,
	stop_code VARCHAR(10),
	stop_name VARCHAR(100),
	stop_desc VARCHAR(100),
	stop_lat NUMERIC(38,8),
	stop_lon NUMERIC(38,8),
	zone_id VARCHAR(5),
	stop_url VARCHAR(100),
	location_type NUMERIC(2),
	stop_timezone VARCHAR(50),
	parent_station VARCHAR(100),
	wheelchair_boarding NUMERIC(1)
	);

	-- Trips, loaded from trips.txt. archive.rb stores trip_id, route_id and
	-- shape_id with an "<agency_id>:" prefix.
	-- service_id is VARCHAR(100) so it has the same width as the service_id
	-- columns of CALENDAR and CALENDAR_DATES that it refers to (it used to be
	-- VARCHAR(50), which could not hold every value those tables accept).
	CREATE TABLE TRIPS
	(
	  trip_id VARCHAR(100) PRIMARY KEY,
	  route_id VARCHAR(100),
	  service_id VARCHAR(100),
	  trip_headsign VARCHAR(50),
	  trip_short_name VARCHAR(30),
	  direction_id NUMERIC(2),
	  block_id NUMERIC(10),
	  shape_id VARCHAR(100),
	  wheelchair_accessible NUMERIC(1),
	  bikes_allowed NUMERIC(1)
	);


	-- Stop times (stop_times.txt layout). The load step for this table is
	-- currently commented out in archive.rb.
	-- trip_id is VARCHAR(100) to match TRIPS.trip_id, which it refers to (it
	-- used to be VARCHAR(50), too narrow for every id TRIPS can hold).
	CREATE TABLE STOP_TIMES
	(
	  trip_id VARCHAR(100),
	  arrival_time VARCHAR(10),
	  departure_time VARCHAR(10),
	  stop_id VARCHAR(100),
	  stop_sequence NUMERIC(10),
	  stop_headsign VARCHAR(30),
	  pickup_type VARCHAR(100),
	  drop_off_type VARCHAR(100),
	  shape_dist_traveled NUMERIC,
	  timepoint NUMERIC(1)
	);


	-- Shape points, loaded from shapes.txt. archive.rb stores shape_id with an
	-- "<agency_id>:" prefix. No primary key is declared on this table.
	CREATE TABLE SHAPES
	(
	shape_id VARCHAR(100),
	shape_pt_lat NUMERIC,
	shape_pt_lon NUMERIC,
	shape_pt_sequence NUMERIC(6),
	shape_dist_traveled NUMERIC
	);


	-- Service exceptions, loaded from calendar_dates.txt. archive.rb stores
	-- service_id with an "<agency_id>:" prefix.
	CREATE TABLE CALENDAR_DATES
	(
	service_id VARCHAR(100),
	date DATE,
	exception_type VARCHAR(10)
	);


	-- Weekly service patterns, loaded from calendar.txt. archive.rb stores
	-- service_id with an "<agency_id>:" prefix.
	CREATE TABLE CALENDAR
	(
	service_id VARCHAR(100),
	monday NUMERIC(1),
	tuesday NUMERIC(1),
	wednesday NUMERIC(1),
	thursday NUMERIC(1),
	friday NUMERIC(1),
	saturday NUMERIC(1),
	sunday NUMERIC(1),
	start_date DATE,
	end_date DATE
	);


	-- Transfers between stops, loaded from transfers.txt. archive.rb stores
	-- from_stop_id and to_stop_id with an "<agency_id>:" prefix.
	CREATE TABLE TRANSFERS
	(
	from_stop_id VARCHAR(100),
	to_stop_id VARCHAR(100),
	transfer_type NUMERIC(3),
	min_transfer_time NUMERIC(6)
	);



	-- Guarantee agency_id is set
	-- Row-level trigger function: when the incoming row has a NULL agency_id it
	-- is replaced by the first three characters of agency_name. This mirrors the
	-- fallback archive.rb uses when it builds the "<agency_id>:" id prefix.
	DROP FUNCTION IF EXISTS update_agency_id();
	CREATE FUNCTION update_agency_id() RETURNS trigger AS $$
	BEGIN
	NEW.agency_id := COALESCE(NEW.agency_id, substring(NEW.agency_name from 1 for 3));
	RETURN NEW;
	END;
	$$ LANGUAGE plpgsql;
	-- Run the function for every row inserted into or updated in agency.
	CREATE TRIGGER agency_id_trigger
	BEFORE INSERT OR UPDATE
	ON agency
	FOR EACH ROW
	EXECUTE PROCEDURE update_agency_id();
	# Bundler manifest for the archive script.
	source "https://rubygems.org"

	# Ruby version the project is pinned to.
	ruby "2.3.0"

	# Database toolkit required by archive.rb.
	gem "sequel"
	# PostgreSQL driver; archive.rb connects to a postgres:// URL.
	gem "pg"
	# NOTE(review): not referenced by archive.rb; presumably used by a separate
	# step that unpacks the feed zip files - confirm.
	gem "rubyzip"

	# Tooling that is only needed while developing or testing.
	group :development, :test do
	gem "rake"
	gem "pry-byebug"
	end