#!/usr/bin/env ruby
# csv_to_schema.rb
# Usage: ruby csv_to_schema.rb path/to/file.csv table_name
# Prints a suggested Rails migration and an example seeds.rb importer (Postgres COPY).
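#
# Example (hypothetical CSV): given a `users.csv` whose headers include
# "Email Address", "Signup Date", and "Admin?", running
#   ruby csv_to_schema.rb users.csv users
# might suggest columns such as t.string :email_address, t.date :signup_date,
# and t.boolean :admin, depending on the values in each column.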

require "csv"
require "time"
require "date"

abort("Usage: ruby #{__FILE__} path/to/file.csv table_name") unless ARGV.size == 2
csv_path, table_name = ARGV
abort("CSV not found: #{csv_path}") unless File.file?(csv_path)

UUID_RE = /\A[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}\z/i
INT_RE  = /\A[+-]?\d+\z/
NUM_RE  = /\A[+-]?(?:\d+)(?:\.\d+)?(?:[eE][+-]?\d+)?\z/
BOOLS = {
  "true" => true, "t" => true, "1" => true, "yes" => true, "y" => true,
  "false" => false, "f" => false, "0" => false, "no" => false, "n" => false
}

def norm(h)
  s = h.to_s.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
  s = s.gsub("%", " pct ").downcase.strip
  s = s.gsub(/[^\p{Alnum}]+/, "_").gsub(/\A_+|_+\z/, "").gsub(/_{2,}/, "_")
  s.empty? ? "col" : s
end
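# For reference: norm("Unit Price ($)") #=> "unit_price",
# norm("% Complete") #=> "pct_complete", norm("") #=> "col".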

def unique_headers(raw_headers)
  base = raw_headers.map { |h| norm(h) }
  seen = Hash.new(0)
  base.map do |h|
    seen[h] += 1
    seen[h] == 1 ? h : "#{h}_#{seen[h] - 1}"
  end
end
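# For reference: unique_headers(["Name", "name", "NAME"]) #=> ["name", "name_1", "name_2"].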

def parse_dateish(str)
  s = str.to_s.strip
  return nil if s.empty?
  Time.iso8601(s); :datetime
rescue
  begin
    Date.iso8601(s); :date
  rescue
    begin
      Time.parse(s)
      s =~ /\d{1,2}:\d{2}/ ? :datetime : :date
    rescue
      nil
    end
  end
end
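# For reference: parse_dateish("2024-05-01T10:00:00Z") #=> :datetime,
# parse_dateish("2024-05-01") #=> :date, parse_dateish("hello") #=> nil.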

def sci?(s) = s.to_s.strip.match?(/[eE]/)

Stat = Struct.new(:name, :raw, :values, :max_len, :bool, :int_ok, :num_ok, :sci, :uuid, :min_i, :max_i, :saw_date, :saw_dt, :decimals) do
  def initialize(name, raw)
    super(name, raw, 0, 0, true, true, true, false, true, nil, nil, true, true, [])
  end
end
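# Stat is a per-column accumulator: the bool/int_ok/num_ok/uuid/saw_date/saw_dt
# flags start out true and are switched off as soon as a value contradicts them.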

enum = CSV.foreach(csv_path, headers: true, return_headers: true)
headers = []
stats = []
row_counter = 0

enum.each do |row|
  if row.header_row?
    raw_headers = row.headers || []
    abort("CSV appears to have no headers.") if raw_headers.empty?
    headers = unique_headers(raw_headers)
    stats = headers.map.with_index { |h, i| Stat.new(h, raw_headers[i]) }
    next
  end
  stats.each_with_index do |st, i|
    v = row[i]
    sv = v.to_s
    st.max_len = [ st.max_len, sv.length ].max
    next if sv.strip.empty?
    st.values += 1 # count only non-blank values so the guards in infer_type stay meaningful
    st.bool &&= BOOLS.key?(sv.strip.downcase)
    st.uuid &&= UUID_RE.match?(sv)
    if INT_RE.match?(sv)
      ii = sv.to_i
      st.min_i = st.min_i ? [ st.min_i, ii ].min : ii
      st.max_i = st.max_i ? [ st.max_i, ii ].max : ii
    else
      st.int_ok = false
    end
    if NUM_RE.match?(sv)
      st.sci ||= sci?(sv)
      st.decimals << sv
    else
      st.num_ok = false
    end
    kind = parse_dateish(sv)
    if kind.nil?
      st.saw_date = false
      st.saw_dt = false
    else
      st.saw_date &&= (kind == :date || kind == :datetime)
      st.saw_dt &&= (kind == :datetime)
    end
  end
  row_counter += 1
end

abort("No data rows found.") if stats.empty?

def est_precision_scale(strings)
  max_i = 1
  max_f = 0
  strings.each do |s|
    return nil if s.downcase.include?("e")
    a, b = s.split(".", 2)
    a = a.to_s.gsub(/[^\d-]/, "").sub(/\A-/, "")
    b = b.to_s.gsub(/[^\d]/, "")
    max_i = [ max_i, a.length ].max
    max_f = [ max_f, b.length ].max
  end
  precision = [ [ max_i + max_f, 1 ].max, 38 ].min
  [ precision, max_f ]
end
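# For reference: est_precision_scale(["12.345", "-6.7"]) #=> [5, 3];
# scientific notation falls back to nil (and therefore :float below).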

def infer_type(st)
  return [ :boolean, {} ] if st.bool && st.values > 0
  return [ :string, {} ] if st.uuid
  if st.int_ok && st.min_i && st.max_i
    return (st.min_i < -2_147_483_648 || st.max_i > 2_147_483_647) ? [ :bigint, {} ] : [ :integer, {} ]
  end
  if st.num_ok
    if st.sci
      return [ :float, {} ]
    else
      ps = est_precision_scale(st.decimals)
      return ps ? [ :decimal, { precision: ps[0], scale: ps[1] } ] : [ :float, {} ]
    end
  end
  return [ :datetime, {} ] if st.saw_dt && st.values > 0
  return [ :date, {} ] if st.saw_date && st.values > 0
  st.max_len <= 255 ? [ :string, {} ] : [ :text, {} ]
end
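# For reference: a column containing only "12.50" and "3.99" infers
# [:decimal, { precision: 4, scale: 2 }]; values outside the 32-bit integer
# range promote :integer to :bigint.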

inferred = stats.map { |st| [ st.name, *infer_type(st) ] }
has_created = inferred.any? { |(n, _, _)| n == "created_at" }
has_updated = inferred.any? { |(n, _, _)| n == "updated_at" }
add_timestamps = has_created && has_updated

def class_name_for(table)
  "Create#{table.to_s.split('_').map(&:capitalize).join}FromCsv"
end

def emit_column_line(name, type_sym, opts)
  opt = opts && !opts.empty? ? ", " + opts.map { |k, v| "#{k}: #{v.is_a?(String) ? %Q(\"#{v}\") : v}" }.join(", ") : ""
  "      t.#{type_sym} :#{name}#{opt}"
end
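# For reference: emit_column_line("price", :decimal, precision: 4, scale: 2)
# #=> "      t.decimal :price, precision: 4, scale: 2".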

migration_class = class_name_for(table_name)
migration_body = +"class #{migration_class} < ActiveRecord::Migration[7.0]\n  def change\n"
migration_body << "    create_table :#{table_name} do |t|\n"
inferred.each do |name, type_sym, opts|
  # created_at/updated_at come from t.timestamps below; emitting them here too would duplicate the columns
  next if add_timestamps && %w[created_at updated_at].include?(name)
  migration_body << emit_column_line(name, type_sym, opts) << "\n"
end
migration_body << "      t.timestamps\n" if add_timestamps
migration_body << "    end\n  end\nend\n"

seed_script = <<~'RUBY'
  # db/seeds.rb — example CSV importer
  # Fast bulk load using PostgreSQL COPY via ActiveRecord's raw_connection.
  require "csv"

  def norm(h)
    s = h.to_s.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
    s = s.gsub("%", " pct ").downcase.strip
    s = s.gsub(/[^\p{Alnum}]+/, "_").gsub(/\A_+|_+\z/, "").gsub(/_{2,}/, "_")
    s.empty? ? "col" : s
  end

  def unique_headers(raw_headers)
    seen = Hash.new(0)
    raw_headers.map do |h|
      n = norm(h)
      seen[n] += 1
      seen[n] == 1 ? n : "#{n}_#{seen[n] - 1}"
    end
  end

  csv_path = "REPLACE_WITH_CSV_PATH"
  table = :REPLACE_WITH_TABLE

  raw = File.read(csv_path)
  parsed = CSV.parse(raw, headers: true)
  abort("CSV has no headers") unless parsed.headers && !parsed.headers.empty?
  norm_headers = unique_headers(parsed.headers)

  # Regenerate CSV with normalized headers that match the migration
  normalized_csv = CSV.generate do |csv|
    csv << norm_headers
    parsed.each { |row| csv << row.fields }
  end

  conn = ActiveRecord::Base.connection
  conn.transaction do
    # Optional: clear existing data first
    # conn.execute("TRUNCATE TABLE #{table} RESTART IDENTITY CASCADE")
    raw_conn = conn.raw_connection
    cols = norm_headers.map { |c| '"' + c + '"' }.join(", ")
    raw_conn.copy_data("COPY #{table} (#{cols}) FROM STDIN CSV HEADER") do
      raw_conn.put_copy_data(normalized_csv)
    end
  end
  puts "Imported #{parsed.size} rows into #{table}"
RUBY

puts "# ===== Suggested Migration (copy into db/migrate/*_#{migration_class.gsub(/([a-z])([A-Z])/, '\\1_\\2').downcase}.rb) ====="
puts migration_body
puts "# ===== Example Seeds Importer (paste into db/seeds.rb) ====="
puts seed_script