Skip to content

Instantly share code, notes, and snippets.

@blairanderson
Created September 7, 2025 21:11
Show Gist options
  • Select an option

  • Save blairanderson/d590b044d98bb90ec980ba4611e1aef0 to your computer and use it in GitHub Desktop.

Select an option

Save blairanderson/d590b044d98bb90ec980ba4611e1aef0 to your computer and use it in GitHub Desktop.
#!/usr/bin/env ruby
# csv_to_schema.rb
# Usage: ruby csv_to_schema.rb path/to/file.csv table_name
# Prints a suggested Rails migration and an example seeds.rb importer (Postgres COPY).
require "csv"
require "time"
require "date"
# Expect exactly two CLI arguments: the CSV path and the destination table name.
abort("Usage: ruby #{__FILE__} path/to/file.csv table_name") unless ARGV.size == 2
csv_path, table_name = ARGV
# Fail fast when the given path does not point at a regular file.
abort("CSV not found: #{csv_path}") unless File.file?(csv_path)
# Matches RFC 4122 UUIDs (versions 1-5, variant bits 8/9/a/b), case-insensitive.
UUID_RE = /\A[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}\z/i
# Optionally signed base-10 integers.
INT_RE = /\A[+-]?\d+\z/
# Plain or scientific-notation numerics (e.g. "1.5", "-2e10").
NUM_RE = /\A[+-]?(?:\d+)(?:\.\d+)?(?:[eE][+-]?\d+)?\z/
# Accepted boolean spellings, keyed by downcased cell text.
# Frozen so the shared lookup table cannot be mutated at runtime.
BOOLS = {
  "true" => true, "t" => true, "1" => true, "yes" => true, "y" => true,
  "false" => false, "f" => false, "0" => false, "no" => false, "n" => false
}.freeze
# Normalize a raw CSV header into a safe snake_case identifier:
# strip invalid UTF-8 bytes, spell out "%" as "pct", downcase, collapse
# non-alphanumeric runs into underscores, and trim edge underscores.
# Falls back to "col" when nothing usable remains.
def norm(h)
  name = h.to_s.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
  name = name.gsub("%", " pct ")
  name = name.downcase.strip
  name = name.gsub(/[^\p{Alnum}]+/, "_")
  name = name.gsub(/\A_+|_+\z/, "")
  name = name.gsub(/_{2,}/, "_")
  name.empty? ? "col" : name
end
# Normalize every raw header and suffix repeats ("col", "col_1", "col_2", ...)
# so the resulting column names are unique.
# NOTE(review): a literal header such as "col_1" could still collide with a
# generated suffix — likely acceptable for a one-shot tool, but worth confirming.
def unique_headers(raw_headers)
  counts = Hash.new(0)
  raw_headers.map do |raw|
    candidate = norm(raw)
    counts[candidate] += 1
    counts[candidate] > 1 ? "#{candidate}_#{counts[candidate] - 1}" : candidate
  end
end
def parse_dateish(str)
s = str.to_s.strip
return nil if s.empty?
Time.iso8601(s); :datetime
rescue
begin
Date.iso8601(s); :date
rescue
begin
t = Time.parse(s)
s =~ /\d{1,2}:\d{2}/ ? :datetime : :date
rescue
nil
end
end
end
# True when the trimmed value contains an exponent marker ("e"/"E"),
# i.e. looks like scientific notation.
def sci?(s)
  /e/i.match?(s.to_s.strip)
end
# Per-column accumulator built while streaming the CSV once.
# Fields:
#   name      – normalized column name; raw – original header text
#   values    – number of cells seen (blank cells included)
#   max_len   – longest cell string observed
#   bool/int_ok/num_ok/uuid – start true ("could still be this type") and are
#                             knocked false by any disproving non-blank cell
#   sci       – starts false; latches true once scientific notation appears
#   min_i/max_i – observed integer range (nil until the first integer cell)
#   saw_date/saw_dt – start true; stay true only while every non-blank cell
#                     parses as a date / datetime respectively
#   decimals  – raw numeric strings kept for precision/scale estimation
Stat = Struct.new(:name, :raw, :values, :max_len, :bool, :int_ok, :num_ok, :sci, :uuid, :min_i, :max_i, :saw_date, :saw_dt, :decimals) do
def initialize(name, raw)
# Defaults follow the field order declared above — keep the two in sync.
super(name, raw, 0, 0, true, true, true, false, true, nil, nil, true, true, [])
end
end
# Stream the CSV once, updating the per-column Stat accumulators row by row.
# return_headers: true lets us capture (and normalize) the header row in-band.
enum = CSV.foreach(csv_path, headers: true, return_headers: true)
headers = []
stats = []
# NOTE(review): row_counter is incremented below but never read afterwards.
row_counter = 0
enum.each do |row|
if row.header_row?
raw_headers = row.headers || []
abort("CSV appears to have no headers.") if raw_headers.empty?
# De-duplicated snake_case names drive the generated column names.
headers = unique_headers(raw_headers)
stats = headers.map.with_index { |h, i| Stat.new(h, raw_headers[i]) }
next
end
stats.each_with_index do |st, i|
v = row[i]
sv = v.to_s
# NOTE(review): values counts blank cells too, so the "values > 0" guards in
# infer_type treat an all-blank column as boolean/datetime — confirm intended.
st.values += 1
st.max_len = [ st.max_len, sv.length ].max
next if sv.strip.empty?
# Each type flag survives only while every non-blank cell fits that type.
st.bool &&= BOOLS.key?(sv.strip.downcase)
st.uuid &&= UUID_RE.match?(sv)
if INT_RE.match?(sv)
# Track the observed integer range to pick integer vs bigint later.
ii = sv.to_i
st.min_i = st.min_i ? [ st.min_i, ii ].min : ii
st.max_i = st.max_i ? [ st.max_i, ii ].max : ii
else
st.int_ok = false
end
if NUM_RE.match?(sv)
# Latch sci once any value uses scientific notation; keep the raw string
# for precision/scale estimation.
st.sci ||= sci?(sv)
st.decimals << sv
else
st.num_ok = false
end
kind = parse_dateish(sv)
if kind.nil?
st.saw_date = false
st.saw_dt = false
else
st.saw_date &&= (kind == :date || kind == :datetime)
st.saw_dt &&= (kind == :datetime)
end
end
row_counter += 1
end
# stats stays empty only when the header row never arrived (empty file).
abort("No data rows found.") if stats.empty?
# Estimate a [precision, scale] pair for a decimal column from its raw
# numeric strings, or nil when scientific notation makes fixed-point unusable.
# Fix vs. original: precision was capped at 38 but scale was returned
# uncapped, so wide fractional columns produced invalid types like
# decimal(38, 50); scale is now trimmed so it never exceeds what precision
# leaves after the integer digits.
def est_precision_scale(strings)
  max_int_digits = 1
  max_frac_digits = 0
  strings.each do |s|
    # Scientific notation has no fixed digit layout — caller falls back to float.
    return nil if s.downcase.include?("e")
    int_part, frac_part = s.split(".", 2)
    # Count digits only; drop the sign and any stray non-digit characters.
    int_part = int_part.to_s.gsub(/[^\d-]/, "").sub(/\A-/, "")
    frac_part = frac_part.to_s.gsub(/[^\d]/, "")
    max_int_digits = [ max_int_digits, int_part.length ].max
    max_frac_digits = [ max_frac_digits, frac_part.length ].max
  end
  precision = [ [ max_int_digits + max_frac_digits, 1 ].max, 38 ].min
  # Preserve the integer digits; trim fractional digits to fit the cap.
  scale = [ max_frac_digits, precision - max_int_digits ].min
  scale = 0 if scale < 0
  [ precision, scale ]
end
# Map a column's accumulated Stat onto a Rails column type plus options.
# Priority order: boolean, uuid-as-string, integer/bigint, decimal/float,
# datetime, date, and finally string (<= 255 chars) or text.
def infer_type(st)
  has_values = st.values > 0
  return [ :boolean, {} ] if st.bool && has_values
  return [ :string, {} ] if st.uuid
  if st.int_ok && st.min_i && st.max_i
    # Postgres int4 range decides integer vs bigint.
    int4_range = (-2_147_483_648..2_147_483_647)
    fits = int4_range.cover?(st.min_i) && int4_range.cover?(st.max_i)
    return [ fits ? :integer : :bigint, {} ]
  end
  if st.num_ok
    unless st.sci
      ps = est_precision_scale(st.decimals)
      return [ :decimal, { precision: ps[0], scale: ps[1] } ] if ps
    end
    # Scientific notation (or an unestimable layout) falls back to float.
    return [ :float, {} ]
  end
  return [ :datetime, {} ] if st.saw_dt && has_values
  return [ :date, {} ] if st.saw_date && has_values
  [ st.max_len <= 255 ? :string : :text, {} ]
end
# Infer a [name, type, options] triple for every column, and enable
# t.timestamps only when the CSV itself supplies both created_at and updated_at.
inferred = stats.map { |st| [ st.name, *infer_type(st) ] }
column_names = inferred.map { |(col, _type, _opts)| col }
add_timestamps = column_names.include?("created_at") && column_names.include?("updated_at")
# Build the migration class name for a table, e.g.
# "user_accounts" -> "CreateUserAccountsFromCsv".
def class_name_for(table)
  camel = table.to_s.split("_").map(&:capitalize).join
  "Create" + camel + "FromCsv"
end
# Render a single column line for the generated migration, e.g.
#   " t.decimal :amount, precision: 10, scale: 2"
# String option values are wrapped in double quotes so the emitted Ruby parses.
# Fix vs. original: options were joined onto the line with a bare space,
# producing invalid Ruby like `t.decimal :amount precision: 10, scale: 2`;
# they must be preceded by ", ".
def emit_column_line(name, type_sym, opts)
  opt =
    if opts && !opts.empty?
      ", " + opts.map { |k, v| "#{k}: #{v.is_a?(String) ? %Q(\"#{v}\") : v}" }.join(", ")
    else
      ""
    end
  " t.#{type_sym} :#{name}#{opt}"
end
# Assemble the migration source string.
# Fix vs. original: when the CSV carries created_at/updated_at, the original
# emitted them both as inferred columns AND via t.timestamps, defining each
# timestamp column twice and making the migration fail with a duplicate-column
# error. Skip the explicit pair whenever t.timestamps will cover them.
# NOTE(review): the literal indentation inside these strings looks collapsed
# (single spaces) — possibly lost in a paste; confirm against the original gist.
migration_class = class_name_for(table_name)
migration_body = +"class #{migration_class} < ActiveRecord::Migration[7.0]\n def change\n"
migration_body << " create_table :#{table_name} do |t|\n"
inferred.each do |name, type_sym, opts|
  next if add_timestamps && (name == "created_at" || name == "updated_at")
  migration_body << emit_column_line(name, type_sym, opts) << "\n"
end
migration_body << " t.timestamps\n" if add_timestamps
migration_body << " end\n end\nend\n"
# Example db/seeds.rb importer, emitted verbatim for the user to copy.
# Single-quoted heredoc: nothing below is interpolated here; the user fills in
# the REPLACE_WITH_* placeholders. The importer re-normalizes headers with the
# same logic as this script so the COPY column list matches the migration.
seed_script = <<~'RUBY'
# db/seeds.rb — example CSV importer
# Fast bulk load using PostgreSQL COPY via ActiveRecord's raw_connection.
require "csv"
def norm(h)
s = h.to_s.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
s = s.gsub("%"," pct ").downcase.strip
s = s.gsub(/[^\p{Alnum}]+/, "_").gsub(/\A_+|_+\z/,"").gsub(/_{2,}/,"_")
s.empty? ? "col" : s
end
def unique_headers(raw_headers)
seen = Hash.new(0)
raw_headers.map do |h|
n = norm(h)
seen[n] += 1
seen[n] == 1 ? n : "#{n}_#{seen[n]-1}"
end
end
csv_path = "REPLACE_WITH_CSV_PATH"
table = :REPLACE_WITH_TABLE
raw = File.read(csv_path)
parsed = CSV.parse(raw, headers: true)
abort("CSV has no headers") unless parsed.headers && !parsed.headers.empty?
norm_headers = unique_headers(parsed.headers)
# Regenerate CSV with normalized headers that match the migration
normalized_csv = CSV.generate do |csv|
csv << norm_headers
parsed.each { |row| csv << row.fields }
end
conn = ActiveRecord::Base.connection
conn.transaction do
# Optional: clear existing data first
# conn.execute("TRUNCATE TABLE #{table} RESTART IDENTITY CASCADE")
raw_conn = conn.raw_connection
cols = norm_headers.map { |c| '"' + c + '"' }.join(", ")
raw_conn.copy_data("COPY #{table} (#{cols}) FROM STDIN CSV HEADER") do
raw_conn.put_copy_data(normalized_csv)
end
end
puts "Imported #{parsed.size} rows into #{table}"
RUBY
# Print the results. The gsub converts the CamelCase migration class name into
# the snake_case filename stem Rails expects for the migration file.
puts "# ===== Suggested Migration (copy into db/migrate/*_#{migration_class.gsub(/([a-z])([A-Z])/, '\\1_\\2').downcase}.rb) ====="
puts migration_body
puts "# ===== Example Seeds Importer (paste into db/seeds.rb) ====="
puts seed_script
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment