#!/usr/bin/env ruby
# csv_to_schema.rb
# Usage: ruby csv_to_schema.rb path/to/file.csv table_name
# Prints a suggested Rails migration and an example seeds.rb importer (Postgres COPY).
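#
# Example (hypothetical CSV): given a `users.csv` whose headers include
# "Email Address", "Signup Date", and "Admin?", running
#   ruby csv_to_schema.rb users.csv users
# might suggest columns such as t.string :email_address, t.date :signup_date,
# and t.boolean :admin, depending on the values in each column.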

require "csv"
require "time"
require "date"

abort("Usage: ruby #{__FILE__} path/to/file.csv table_name") unless ARGV.size == 2
csv_path, table_name = ARGV
abort("CSV not found: #{csv_path}") unless File.file?(csv_path)

UUID_RE = /\A[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}\z/i
INT_RE  = /\A[+-]?\d+\z/
NUM_RE  = /\A[+-]?(?:\d+)(?:\.\d+)?(?:[eE][+-]?\d+)?\z/
BOOLS = {
  "true" => true, "t" => true, "1" => true, "yes" => true, "y" => true,
  "false" => false, "f" => false, "0" => false, "no" => false, "n" => false
}

def norm(h)
  s = h.to_s.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
  s = s.gsub("%", " pct ").downcase.strip
  s = s.gsub(/[^\p{Alnum}]+/, "_").gsub(/\A_+|_+\z/, "").gsub(/_{2,}/, "_")
  s.empty? ? "col" : s
end
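# For reference: norm("Unit Price ($)") #=> "unit_price",
# norm("% Complete") #=> "pct_complete", norm("") #=> "col".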

def unique_headers(raw_headers)
  base = raw_headers.map { |h| norm(h) }
  seen = Hash.new(0)
  base.map do |h|
    seen[h] += 1
    seen[h] == 1 ? h : "#{h}_#{seen[h] - 1}"
  end
end
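# For reference: unique_headers(["Name", "name", "NAME"]) #=> ["name", "name_1", "name_2"].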

def parse_dateish(str)
  s = str.to_s.strip
  return nil if s.empty?
  Time.iso8601(s); :datetime
rescue
  begin
    Date.iso8601(s); :date
  rescue
    begin
      Time.parse(s)
      s =~ /\d{1,2}:\d{2}/ ? :datetime : :date
    rescue
      nil
    end
  end
end
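# For reference: parse_dateish("2024-05-01T10:00:00Z") #=> :datetime,
# parse_dateish("2024-05-01") #=> :date, parse_dateish("hello") #=> nil.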

def sci?(s) = s.to_s.strip.match?(/[eE]/)

Stat = Struct.new(:name, :raw, :values, :max_len, :bool, :int_ok, :num_ok, :sci, :uuid, :min_i, :max_i, :saw_date, :saw_dt, :decimals) do
  def initialize(name, raw)
    super(name, raw, 0, 0, true, true, true, false, true, nil, nil, true, true, [])
  end
end
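# Stat is a per-column accumulator: the bool/int_ok/num_ok/uuid/saw_date/saw_dt
# flags start out true and are switched off as soon as a value contradicts them.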

enum = CSV.foreach(csv_path, headers: true, return_headers: true)
headers = []
stats = []
row_counter = 0

enum.each do |row|
  if row.header_row?
    raw_headers = row.headers || []
    abort("CSV appears to have no headers.") if raw_headers.empty?
    headers = unique_headers(raw_headers)
    stats = headers.map.with_index { |h, i| Stat.new(h, raw_headers[i]) }
    next
  end
  stats.each_with_index do |st, i|
    v = row[i]
    sv = v.to_s
    st.max_len = [ st.max_len, sv.length ].max
    next if sv.strip.empty?
    st.values += 1 # count only non-blank values so the guards in infer_type stay meaningful
    st.bool &&= BOOLS.key?(sv.strip.downcase)
    st.uuid &&= UUID_RE.match?(sv)
    if INT_RE.match?(sv)
      ii = sv.to_i
      st.min_i = st.min_i ? [ st.min_i, ii ].min : ii
      st.max_i = st.max_i ? [ st.max_i, ii ].max : ii
    else
      st.int_ok = false
    end
    if NUM_RE.match?(sv)
      st.sci ||= sci?(sv)
      st.decimals << sv
    else
      st.num_ok = false
    end
    kind = parse_dateish(sv)
    if kind.nil?
      st.saw_date = false
      st.saw_dt = false
    else
      st.saw_date &&= (kind == :date || kind == :datetime)
      st.saw_dt &&= (kind == :datetime)
    end
  end
  row_counter += 1
end

abort("No data rows found.") if stats.empty?

def est_precision_scale(strings)
  max_i = 1
  max_f = 0
  strings.each do |s|
    return nil if s.downcase.include?("e")
    a, b = s.split(".", 2)
    a = a.to_s.gsub(/[^\d-]/, "").sub(/\A-/, "")
    b = b.to_s.gsub(/[^\d]/, "")
    max_i = [ max_i, a.length ].max
    max_f = [ max_f, b.length ].max
  end
  precision = [ [ max_i + max_f, 1 ].max, 38 ].min
  [ precision, max_f ]
end
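# For reference: est_precision_scale(["12.345", "-6.7"]) #=> [5, 3];
# scientific notation falls back to nil (and therefore :float below).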

def infer_type(st)
  return [ :boolean, {} ] if st.bool && st.values > 0
  return [ :string, {} ] if st.uuid
  if st.int_ok && st.min_i && st.max_i
    return (st.min_i < -2_147_483_648 || st.max_i > 2_147_483_647) ? [ :bigint, {} ] : [ :integer, {} ]
  end
  if st.num_ok
    if st.sci
      return [ :float, {} ]
    else
      ps = est_precision_scale(st.decimals)
      return ps ? [ :decimal, { precision: ps[0], scale: ps[1] } ] : [ :float, {} ]
    end
  end
  return [ :datetime, {} ] if st.saw_dt && st.values > 0
  return [ :date, {} ] if st.saw_date && st.values > 0
  st.max_len <= 255 ? [ :string, {} ] : [ :text, {} ]
end
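# For reference: a column containing only "12.50" and "3.99" infers
# [:decimal, { precision: 4, scale: 2 }]; values outside the 32-bit integer
# range promote :integer to :bigint.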

inferred = stats.map { |st| [ st.name, *infer_type(st) ] }
has_created = inferred.any? { |(n, _, _)| n == "created_at" }
has_updated = inferred.any? { |(n, _, _)| n == "updated_at" }
add_timestamps = has_created && has_updated

def class_name_for(table)
  "Create#{table.to_s.split('_').map(&:capitalize).join}FromCsv"
end

def emit_column_line(name, type_sym, opts)
  opt = opts && !opts.empty? ? ", " + opts.map { |k, v| "#{k}: #{v.is_a?(String) ? %Q(\"#{v}\") : v}" }.join(", ") : ""
  "      t.#{type_sym} :#{name}#{opt}"
end
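# For reference: emit_column_line("price", :decimal, precision: 4, scale: 2)
# #=> "      t.decimal :price, precision: 4, scale: 2".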

migration_class = class_name_for(table_name)
migration_body = +"class #{migration_class} < ActiveRecord::Migration[7.0]\n  def change\n"
migration_body << "    create_table :#{table_name} do |t|\n"
inferred.each do |name, type_sym, opts|
  # created_at/updated_at come from t.timestamps below; emitting them here too would duplicate the columns
  next if add_timestamps && %w[created_at updated_at].include?(name)
  migration_body << emit_column_line(name, type_sym, opts) << "\n"
end
migration_body << "      t.timestamps\n" if add_timestamps
migration_body << "    end\n  end\nend\n"

seed_script = <<~'RUBY'
  # db/seeds.rb — example CSV importer
  # Fast bulk load using PostgreSQL COPY via ActiveRecord's raw_connection.
  require "csv"

  def norm(h)
    s = h.to_s.encode("UTF-8", invalid: :replace, undef: :replace, replace: "")
    s = s.gsub("%", " pct ").downcase.strip
    s = s.gsub(/[^\p{Alnum}]+/, "_").gsub(/\A_+|_+\z/, "").gsub(/_{2,}/, "_")
    s.empty? ? "col" : s
  end

  def unique_headers(raw_headers)
    seen = Hash.new(0)
    raw_headers.map do |h|
      n = norm(h)
      seen[n] += 1
      seen[n] == 1 ? n : "#{n}_#{seen[n] - 1}"
    end
  end

  csv_path = "REPLACE_WITH_CSV_PATH"
  table = :REPLACE_WITH_TABLE

  raw = File.read(csv_path)
  parsed = CSV.parse(raw, headers: true)
  abort("CSV has no headers") unless parsed.headers && !parsed.headers.empty?
  norm_headers = unique_headers(parsed.headers)

  # Regenerate CSV with normalized headers that match the migration
  normalized_csv = CSV.generate do |csv|
    csv << norm_headers
    parsed.each { |row| csv << row.fields }
  end

  conn = ActiveRecord::Base.connection
  conn.transaction do
    # Optional: clear existing data first
    # conn.execute("TRUNCATE TABLE #{table} RESTART IDENTITY CASCADE")
    raw_conn = conn.raw_connection
    cols = norm_headers.map { |c| '"' + c + '"' }.join(", ")
    raw_conn.copy_data("COPY #{table} (#{cols}) FROM STDIN CSV HEADER") do
      raw_conn.put_copy_data(normalized_csv)
    end
  end
  puts "Imported #{parsed.size} rows into #{table}"
RUBY

puts "# ===== Suggested Migration (copy into db/migrate/*_#{migration_class.gsub(/([a-z])([A-Z])/, '\\1_\\2').downcase}.rb) ====="
puts migration_body
puts "# ===== Example Seeds Importer (paste into db/seeds.rb) ====="
puts seed_script