Counterexample to https://www.reddit.com/r/sqlite/comments/1irbr0b/column_count_mismatch_during_csv_import_increases/
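For context: when a CSV row has fewer fields than the target table has columns, the sqlite3 shell's .import fills the missing columns with NULL and prints a per-row warning on stderr. A minimal sketch of that behaviour (demo.csv, demo.db and the demo table are made-up names for illustration; the exact warning text depends on the SQLite version):

printf 'a,b,c\n1,2,3\n4,5\n' > demo.csv        # last data row is one field short
sqlite3 demo.db '.import --csv demo.csv demo'  # table is created from the header row; the short row gets c = NULL, warning goes to stderr
sqlite3 demo.db 'SELECT * FROM demo WHERE c IS NULL;'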
#!/usr/bin/env bash

cmd() {
  echo ">>> $(printf '%q ' "$@")" >&2
  time "$@"
}

sqlite=/opt/homebrew/opt/sqlite/bin/sqlite3
csvs=(perfect_0.csv perfect_1.csv imperfect_2.csv)

# Generate CSVs
for i in $(seq -f %07.0f 9999999); do
  echo "${i},This,is,line,${i},of,a,nonsense,CSV,that,exists,just,to,debunk,the,assumption,that,missing,columns,cause,performance,collapse,in,SQLite,${RANDOM}"
done > perfect_0.csv

# 2nd CSV with new PKs
sed -e 's/^/1/' < perfect_0.csv > perfect_1.csv

# 3rd CSV with new PKs and missing last column
sed -e 's/^/2/; s/,[^,]*$//' < perfect_0.csv > imperfect_2.csv

for i in "${csvs[@]}"; do
  echo "===== ${i} ($(wc -l < "${i}") rows) ====="
  head -5 "$i"
done

rm -f test.db
$sqlite test.db <<EOSQL
CREATE TABLE t(a INTEGER PRIMARY KEY, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y INTEGER);
CREATE INDEX xtra_1 ON t(a,b,c,f,k,l,q,m,g,u);
CREATE INDEX xtra_2 ON t(c,f,k,l);
EOSQL

for i in "${csvs[@]}"; do
  cmd bash -c "$sqlite test.db '.import --csv ${i} t' 2>/dev/null"
done
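Not part of the original script, but a quick sanity check of the imports afterwards (a sketch, assuming the script above completed): each CSV contributes 9,999,999 rows, and only the rows from imperfect_2.csv should have a NULL last column.

$sqlite test.db <<EOSQL
SELECT count(*) FROM t;                   -- expect 29999997 (3 x 9999999)
SELECT count(*) FROM t WHERE y IS NULL;   -- expect 9999999, all from imperfect_2.csv
EOSQL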
Second run, redirecting sqlite3 stderr to /dev/null:
>>> bash -c /opt/homebrew/opt/sqlite/bin/sqlite3\ test.db\ \'.import\ --csv\ perfect_0.csv\ t\'\ 2\>/dev/null
real 0m55.496s
user 0m52.323s
sys 0m1.664s
>>> bash -c /opt/homebrew/opt/sqlite/bin/sqlite3\ test.db\ \'.import\ --csv\ perfect_1.csv\ t\'\ 2\>/dev/null
real 0m58.025s
user 0m53.654s
sys 0m1.696s
>>> bash -c /opt/homebrew/opt/sqlite/bin/sqlite3\ test.db\ \'.import\ --csv\ imperfect_2.csv\ t\'\ 2\>/dev/null
real 1m1.873s
user 0m55.346s
sys 0m4.275s
Conclusion: the performance collapse reported by the OP is NOT due to the missing column.
The first run (output not shown) had a ~20 sec slowdown due to terminal I/O, with macOS Terminal frantically allocating memory to store 10+ million new lines of output.
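To keep the import warnings without paying that terminal-rendering cost, one option (a sketch; import_warnings.log is just an example name) is to point stderr at a log file instead of /dev/null in the import loop:

for i in "${csvs[@]}"; do
  cmd bash -c "$sqlite test.db '.import --csv ${i} t' 2>>import_warnings.log"
done
wc -l import_warnings.log   # roughly one warning per short row in imperfect_2.csv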