Created
December 19, 2020 21:16
-
-
Save vaclavdekanovsky/484e12d2052548f7fdb4b38cd38631e8 to your computer and use it in GitHub Desktop.
Julia CSV reader's type and types parameters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"# The `type` and `types` parameters of the Julia CSV parser\n", | |
"Written in [Julia](https://julialang.org/). See [CSV.jl](https://csv.juliadata.org/stable/) and [DataFrames.jl](https://dataframes.juliadata.org/stable/) for more details." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 1, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"using CSV, DataFrames" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"All examples are based on string input, which is passed to Julia's CSV reader through an `IOBuffer`." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 2, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/plain": [ | |
"\"c1|c2|c3|c4\\n\\\"1\\\"|2|c|1.5\\n\\\"C|D\\\"|16|x|2.33\\n\"" | |
] | |
}, | |
"execution_count": 2, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"data = \"\"\"c1|c2|c3|c4\n", | |
"\"1\"|2|c|1.5\n", | |
"\"C|D\"|16|x|2.33\n", | |
"\"\"\"" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"You can set the same type for all columns using the `type` parameter, e.g. `String`." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 3, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table class=\"data-frame\"><thead><tr><th></th><th>c1</th><th>c2</th><th>c3</th><th>c4</th></tr><tr><th></th><th>String</th><th>String</th><th>String</th><th>String</th></tr></thead><tbody><p>2 rows × 4 columns</p><tr><th>1</th><td>1</td><td>2</td><td>c</td><td>1.5</td></tr><tr><th>2</th><td>C|D</td><td>16</td><td>x</td><td>2.33</td></tr></tbody></table>" | |
], | |
"text/latex": [ | |
"\\begin{tabular}{r|cccc}\n", | |
"\t& c1 & c2 & c3 & c4\\\\\n", | |
"\t\\hline\n", | |
"\t& String & String & String & String\\\\\n", | |
"\t\\hline\n", | |
"\t1 & 1 & 2 & c & 1.5 \\\\\n", | |
"\t2 & C|D & 16 & x & 2.33 \\\\\n", | |
"\\end{tabular}\n" | |
], | |
"text/plain": [ | |
"2×4 DataFrame\n", | |
"│ Row │ c1 │ c2 │ c3 │ c4 │\n", | |
"│ │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │\n", | |
"├─────┼────────┼────────┼────────┼────────┤\n", | |
"│ 1 │ 1 │ 2 │ c │ 1.5 │\n", | |
"│ 2 │ C|D │ 16 │ x │ 2.33 │" | |
] | |
}, | |
"execution_count": 3, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# type turns all the columns to the same type\n", | |
"CSV.read(IOBuffer(data), DataFrame; type=String)" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Or specify the type for each column, or just for some of the columns, by passing a `Dict` to the `types` parameter. If a value cannot be parsed as the requested type, it is turned into `missing`, the equivalent of pandas' `NaN`." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 4, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"┌ Warning: thread = 1 warning: error parsing Int64 around row = 2, col = 4: \"1.5\n", | |
"│ \", error=INVALID: OK | NEWLINE | INVALID_DELIMITER \n", | |
"└ @ CSV /home/vaclav/.julia/packages/CSV/la2cd/src/file.jl:606\n", | |
"┌ Warning: thread = 1 warning: error parsing Int64 around row = 3, col = 4: \"2.33\n", | |
"│ \", error=INVALID: OK | NEWLINE | EOF | INVALID_DELIMITER \n", | |
"└ @ CSV /home/vaclav/.julia/packages/CSV/la2cd/src/file.jl:606\n" | |
] | |
}, | |
{ | |
"name": "stdout", | |
"output_type": "stream", | |
"text": [ | |
"CSV.Row:\n", | |
" :c1 \"1\"\n", | |
" :c2 \"2\"\n", | |
" :c3 \"c\"\n", | |
" :c4 missing\n", | |
"CSV.Row:\n", | |
" :c1 \"C|D\"\n", | |
" :c2 \"16\"\n", | |
" :c3 \"x\"\n", | |
" :c4 missing\n" | |
] | |
} | |
], | |
"source": [ | |
"for r in CSV.File(IOBuffer(data), types=Dict(:c2=>String, :c4=>Int64))\n", | |
" println(r)\n", | |
"end" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 5, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"name": "stderr", | |
"output_type": "stream", | |
"text": [ | |
"┌ Warning: thread = 1 warning: error parsing Int64 around row = 2, col = 4: \"1.5\n", | |
"│ \", error=INVALID: OK | NEWLINE | INVALID_DELIMITER \n", | |
"└ @ CSV /home/vaclav/.julia/packages/CSV/la2cd/src/file.jl:606\n", | |
"┌ Warning: thread = 1 warning: error parsing Int64 around row = 3, col = 4: \"2.33\n", | |
"│ \", error=INVALID: OK | NEWLINE | EOF | INVALID_DELIMITER \n", | |
"└ @ CSV /home/vaclav/.julia/packages/CSV/la2cd/src/file.jl:606\n" | |
] | |
}, | |
{ | |
"data": { | |
"text/html": [ | |
"<table class=\"data-frame\"><thead><tr><th></th><th>c1</th><th>c2</th><th>c3</th><th>c4</th></tr><tr><th></th><th>String</th><th>String</th><th>String</th><th>Int64?</th></tr></thead><tbody><p>2 rows × 4 columns</p><tr><th>1</th><td>1</td><td>2</td><td>c</td><td><em>missing</em></td></tr><tr><th>2</th><td>C|D</td><td>16</td><td>x</td><td><em>missing</em></td></tr></tbody></table>" | |
], | |
"text/latex": [ | |
"\\begin{tabular}{r|cccc}\n", | |
"\t& c1 & c2 & c3 & c4\\\\\n", | |
"\t\\hline\n", | |
"\t& String & String & String & Int64?\\\\\n", | |
"\t\\hline\n", | |
"\t1 & 1 & 2 & c & \\emph{missing} \\\\\n", | |
"\t2 & C|D & 16 & x & \\emph{missing} \\\\\n", | |
"\\end{tabular}\n" | |
], | |
"text/plain": [ | |
"2×4 DataFrame\n", | |
"│ Row │ c1 │ c2 │ c3 │ c4 │\n", | |
"│ │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mInt64?\u001b[39m │\n", | |
"├─────┼────────┼────────┼────────┼─────────┤\n", | |
"│ 1 │ 1 │ 2 │ c │ \u001b[90mmissing\u001b[39m │\n", | |
"│ 2 │ C|D │ 16 │ x │ \u001b[90mmissing\u001b[39m │" | |
] | |
}, | |
"execution_count": 5, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# specify types of the columns\n", | |
"CSV.read(IOBuffer(data), DataFrame; types=Dict(:c2=>String, :c4=>Int64))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"You can silence these warnings by passing the `silencewarnings=true` parameter." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 6, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table class=\"data-frame\"><thead><tr><th></th><th>c1</th><th>c2</th><th>c3</th><th>c4</th></tr><tr><th></th><th>String</th><th>String</th><th>String</th><th>Int64?</th></tr></thead><tbody><p>2 rows × 4 columns</p><tr><th>1</th><td>1</td><td>2</td><td>c</td><td><em>missing</em></td></tr><tr><th>2</th><td>C|D</td><td>16</td><td>x</td><td><em>missing</em></td></tr></tbody></table>" | |
], | |
"text/latex": [ | |
"\\begin{tabular}{r|cccc}\n", | |
"\t& c1 & c2 & c3 & c4\\\\\n", | |
"\t\\hline\n", | |
"\t& String & String & String & Int64?\\\\\n", | |
"\t\\hline\n", | |
"\t1 & 1 & 2 & c & \\emph{missing} \\\\\n", | |
"\t2 & C|D & 16 & x & \\emph{missing} \\\\\n", | |
"\\end{tabular}\n" | |
], | |
"text/plain": [ | |
"2×4 DataFrame\n", | |
"│ Row │ c1 │ c2 │ c3 │ c4 │\n", | |
"│ │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mInt64?\u001b[39m │\n", | |
"├─────┼────────┼────────┼────────┼─────────┤\n", | |
"│ 1 │ 1 │ 2 │ c │ \u001b[90mmissing\u001b[39m │\n", | |
"│ 2 │ C|D │ 16 │ x │ \u001b[90mmissing\u001b[39m │" | |
] | |
}, | |
"execution_count": 6, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# specify types of the columns and silence the parsing warnings\n", | |
"CSV.read(IOBuffer(data), DataFrame; types=Dict(:c2=>String, :c4=>Int64), silencewarnings=true)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 7, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table class=\"data-frame\"><thead><tr><th></th><th>c1</th><th>c2</th><th>c3</th><th>c4</th></tr><tr><th></th><th>String</th><th>String</th><th>String</th><th>Float32</th></tr></thead><tbody><p>2 rows × 4 columns</p><tr><th>1</th><td>1</td><td>2</td><td>c</td><td>1.5</td></tr><tr><th>2</th><td>C|D</td><td>16</td><td>x</td><td>2.33</td></tr></tbody></table>" | |
], | |
"text/latex": [ | |
"\\begin{tabular}{r|cccc}\n", | |
"\t& c1 & c2 & c3 & c4\\\\\n", | |
"\t\\hline\n", | |
"\t& String & String & String & Float32\\\\\n", | |
"\t\\hline\n", | |
"\t1 & 1 & 2 & c & 1.5 \\\\\n", | |
"\t2 & C|D & 16 & x & 2.33 \\\\\n", | |
"\\end{tabular}\n" | |
], | |
"text/plain": [ | |
"2×4 DataFrame\n", | |
"│ Row │ c1 │ c2 │ c3 │ c4 │\n", | |
"│ │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mFloat32\u001b[39m │\n", | |
"├─────┼────────┼────────┼────────┼─────────┤\n", | |
"│ 1 │ 1 │ 2 │ c │ 1.5 │\n", | |
"│ 2 │ C|D │ 16 │ x │ 2.33 │" | |
] | |
}, | |
"execution_count": 7, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"# specify valid type for columns\n", | |
"CSV.read(IOBuffer(data), DataFrame; types=Dict(:c2=>String, :c4=>Float32))" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Or specify the types of all columns, one per column and in column order, using a **Vector**." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 8, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table class=\"data-frame\"><thead><tr><th></th><th>c1</th><th>c2</th><th>c3</th><th>c4</th></tr><tr><th></th><th>String</th><th>Int64</th><th>String</th><th>Float64</th></tr></thead><tbody><p>2 rows × 4 columns</p><tr><th>1</th><td>1</td><td>2</td><td>c</td><td>1.5</td></tr><tr><th>2</th><td>C|D</td><td>16</td><td>x</td><td>2.33</td></tr></tbody></table>" | |
], | |
"text/latex": [ | |
"\\begin{tabular}{r|cccc}\n", | |
"\t& c1 & c2 & c3 & c4\\\\\n", | |
"\t\\hline\n", | |
"\t& String & Int64 & String & Float64\\\\\n", | |
"\t\\hline\n", | |
"\t1 & 1 & 2 & c & 1.5 \\\\\n", | |
"\t2 & C|D & 16 & x & 2.33 \\\\\n", | |
"\\end{tabular}\n" | |
], | |
"text/plain": [ | |
"2×4 DataFrame\n", | |
"│ Row │ c1 │ c2 │ c3 │ c4 │\n", | |
"│ │ \u001b[90mString\u001b[39m │ \u001b[90mInt64\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mFloat64\u001b[39m │\n", | |
"├─────┼────────┼───────┼────────┼─────────┤\n", | |
"│ 1 │ 1 │ 2 │ c │ 1.5 │\n", | |
"│ 2 │ C|D │ 16 │ x │ 2.33 │" | |
] | |
}, | |
"execution_count": 8, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"types = Array{DataType,1}([String, Int, String, Float64])\n", | |
"CSV.read(IOBuffer(data), DataFrame; types=types)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": 9, | |
"metadata": {}, | |
"outputs": [ | |
{ | |
"data": { | |
"text/html": [ | |
"<table class=\"data-frame\"><thead><tr><th></th><th>c1</th><th>c2</th><th>c3</th><th>c4</th></tr><tr><th></th><th>String</th><th>Int32</th><th>String</th><th>Float32</th></tr></thead><tbody><p>2 rows × 4 columns</p><tr><th>1</th><td>1</td><td>2</td><td>c</td><td>1.5</td></tr><tr><th>2</th><td>C|D</td><td>16</td><td>x</td><td>2.33</td></tr></tbody></table>" | |
], | |
"text/latex": [ | |
"\\begin{tabular}{r|cccc}\n", | |
"\t& c1 & c2 & c3 & c4\\\\\n", | |
"\t\\hline\n", | |
"\t& String & Int32 & String & Float32\\\\\n", | |
"\t\\hline\n", | |
"\t1 & 1 & 2 & c & 1.5 \\\\\n", | |
"\t2 & C|D & 16 & x & 2.33 \\\\\n", | |
"\\end{tabular}\n" | |
], | |
"text/plain": [ | |
"2×4 DataFrame\n", | |
"│ Row │ c1 │ c2 │ c3 │ c4 │\n", | |
"│ │ \u001b[90mString\u001b[39m │ \u001b[90mInt32\u001b[39m │ \u001b[90mString\u001b[39m │ \u001b[90mFloat32\u001b[39m │\n", | |
"├─────┼────────┼───────┼────────┼─────────┤\n", | |
"│ 1 │ 1 │ 2 │ c │ 1.5 │\n", | |
"│ 2 │ C|D │ 16 │ x │ 2.33 │" | |
] | |
}, | |
"execution_count": 9, | |
"metadata": {}, | |
"output_type": "execute_result" | |
} | |
], | |
"source": [ | |
"CSV.read(IOBuffer(data), DataFrame; types=[String, Int32, String, Float32])" | |
] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "Julia 1.4.1", | |
"language": "julia", | |
"name": "julia-1.4" | |
}, | |
"language_info": { | |
"file_extension": ".jl", | |
"mimetype": "application/julia", | |
"name": "julia", | |
"version": "1.4.1" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment