Skip to content

Instantly share code, notes, and snippets.

@ingted
Last active April 8, 2025 06:07
Show Gist options
  • Save ingted/5d8e8b29f0cb2d5fe80172652ddc2930 to your computer and use it in GitHub Desktop.
Save ingted/5d8e8b29f0cb2d5fe80172652ddc2930 to your computer and use it in GitHub Desktop.
Parallel ultra lite F# CSV line parser
#if INTERACTIVE
#r "nuget: Unquote, 7.0.0"
#endif
open Microsoft.FSharp.Quotations.Patterns
open Swensen.Unquote
open Swensen
open Microsoft.FSharp.Quotations
open System.Collections.Generic
type ParsingState = {
mutable inQuotes : bool
mutable wasEscaped : bool
mutable fieldStarted : bool
mutable fieldNoQuoteNow : bool
result : System.Collections.Generic.List<string>
}
with
static member Identity () = {
inQuotes = false
wasEscaped = false
fieldStarted = true
fieldNoQuoteNow = true
result = System.Collections.Generic.List<string>()
}
override this.ToString () =
$"inQ:{this.inQuotes}, escaped:{this.wasEscaped}, fStart:{this.fieldStarted}, noQNow:{this.fieldNoQuoteNow}"
let parseLine (delimiters: char[]) (quoteChar: char option) (escapeCharOpt:char option) (ifMergeDelimiters: bool) (inputLine: string) : string[] =
let escapeChar = if escapeCharOpt.IsNone then '\\' else escapeCharOpt.Value
let s = ParsingState.Identity ()
let currentField = new System.Text.StringBuilder()
let seekQuote = new System.Text.StringBuilder()
let append (c:char) = currentField.Append c |> ignore
let appendStr (s:string) = currentField.Append s |> ignore
let appendEmpty () = currentField.Append "" |> ignore
let seek (c:char) = seekQuote.Append c |> ignore
let seekNotEmpty () = seekQuote.ToString().Trim() <> ""
let clear () = currentField.Clear() |> ignore
let ifSpaceDelimiter = Array.contains ' ' delimiters
//let line = seq {for i in 0..(inputLine.Length - 1) do yield inputLine.[i] } |> Seq.toArray
//let lineQ = new Queue<char>(line)
//let next() =
// if lineQ.Count = 0 then
// None
// else
// Some (lineQ.Dequeue())
//------------------------------v.0
let charSeq = seq {
for c in inputLine do
yield c
}
// 創建一個迭代器,以按需訪問序列中的元素
let charEnumerator = charSeq.GetEnumerator()
// 定義 next 函數來返回序列中的下一個元素,或在沒有更多元素時返回 None
let next() =
if charEnumerator.MoveNext() then
Some(charEnumerator.Current)
else
None
//------------------------------v.1
let reset () =
s.fieldStarted <- true
s.inQuotes <- false
s.fieldNoQuoteNow <- true
s.wasEscaped <- false
let add () =
let sqStr =
if s.fieldNoQuoteNow then
let sqStr = seekQuote.ToString()
//appendStr sqStr
sqStr
else
""
seekQuote.Clear () |> ignore
let curF = currentField.ToString()
let cfStr = sqStr + curF
#if DEBUG
printfn "add: cfStr = '%s'+ '%s'" sqStr curF
printfn "add: '%s' result.Add -> currentField.clear -> reset" cfStr
#endif
s.result.Add cfStr |> ignore
currentField.Clear() |> ignore
reset ()
let emptyNotIgnored () =
s.result.Add (seekQuote.ToString()) |> ignore
let addEmpty () =
s.result.Add "" |> ignore
currentField.Clear() |> ignore
reset ()
//line |> Seq.iter (fun c -> lineQ.Enqueue c)
let rec inQuote (co:char option) =
if co.IsNone then
failwith "quote not finished"
else
let c = co.Value
match c, s.fieldStarted, s.wasEscaped with
| _, _, false when c = escapeChar -> //escape
#if DEBUG
//#else
printfn "%O inQuote %c inQuoteEscape" s c
#endif
s.wasEscaped <- true
inQuoteEscape (next())
| _, _, false when Some c = quoteChar ->
#if DEBUG
//#else
printfn "%O inQuote %c append -> add -> clear -> exitQuote" s c
#endif
//append c
add ()
clear ()
s.fieldNoQuoteNow <- true
s.fieldStarted <- false
exitQuote (next())
| _, _, _ when Array.exists ((=) c) delimiters ->
#if DEBUG
//#else
printfn "%O inQuote %c append -> inQuote" s c
#endif
append c
inQuote (next())
| _, true, _ -> //quoted field begin
#if DEBUG
//#else
printfn "%O inQuote %c clear -> append -> inQuote" s c
#endif
clear ()
append c
s.fieldStarted <- false
inQuote (next())
| _, _, _ ->
#if DEBUG
//#else
printfn "%O inQuote %c append -> inQuote" s c
#endif
append c
inQuote (next())
and notInQuote (co:char option) =
if co.IsNone then
add ()
#if DEBUG
//#else
printfn "notInQuote, co.IsNone"
#endif
else
let c = co.Value
match c, s.fieldStarted, s.fieldNoQuoteNow with
| _, _, _ when c = escapeChar ->
#if DEBUG
//#else
printfn "%O notInQuote %c escape" s c
#endif
s.fieldStarted <- false
escape (next())
| _, true, _ when Array.exists ((=) c) delimiters -> //field開頭立馬結束
#if DEBUG
//#else
printfn "%O notInQuote %c delimiters, fieldStarted: true" s c
#endif
add ()
//notInQuote (next())
if ifMergeDelimiters && ifSpaceDelimiter && c = ' ' then
let rec skipSpaces (co: char option) =
match co with
| Some ' ' ->
#if DEBUG
//#else
printfn "cur is ' ' and if next is ' ' then skip it"
#endif
skipSpaces (next())
| _ -> co
notInQuote (skipSpaces (next()))
else
notInQuote (next())
| _, false, _ when Array.exists ((=) c) delimiters -> //field結束
#if DEBUG
//#else
printfn "%O notInQuote %c delimiters, fieldStarted: false, %A" s c s.result
#endif
add ()
if ifMergeDelimiters && ifSpaceDelimiter && c = ' ' then
let rec skipSpaces (co: char option) =
match co with
| Some ' ' -> skipSpaces (next())
| _ -> co
notInQuote (skipSpaces (next()))
else
notInQuote (next())
| ' ', true, _ ->
#if DEBUG
//#else
printfn "%O notInQuote %c empty character, %A" s c s.result
#endif
seek ' '
s.fieldStarted <- false
notInQuote (next())
| _, _, true when Some c = quoteChar ->
if seekNotEmpty () then
//printfn $"seekNotEmpty {inputLine}"
failwith "Invalid quote occurance"
else
#if DEBUG
//#else
printfn "%O notInQuote %c quote character -> inQuote" s c
#endif
s.fieldStarted <- true
s.fieldNoQuoteNow <- false
s.inQuotes <- true
inQuote (next())
| _, true, true ->
#if DEBUG
//#else
printfn "%O notInQuote %c -> seek -> notInQuote, fieldStarted: true" s c
#endif
seek c
s.fieldStarted <- false
notInQuote (next())
| _, false, true ->
#if DEBUG
//#else
printfn "%O notInQuote %c -> seek -> notInQuote, fieldStarted: false" s c
#endif
seek c
notInQuote (next())
| _, false, _ ->
s.fieldStarted <- false
#if DEBUG
//#else
printfn "%O notInQuote %c -> added -> notInQuote" s c
#endif
add ()
notInQuote (next())
and escape (co:char option) =
if co.IsNone then
failwith "escape: escape character should paired with escaped character"
else
let c = co.Value
match c, s.fieldStarted, s.fieldNoQuoteNow with
| _, _, true when c = escapeChar ->
#if DEBUG
//#else
printfn "%O escape %c seek" s c
#endif
s.wasEscaped <- false
seek c
notInQuote (next())
| _, _, false when c = escapeChar ->
#if DEBUG
//#else
printfn "%O escape %c append" s c
#endif
s.wasEscaped <- false
append c
notInQuote (next())
| _, _, true when Array.exists ((=) c) delimiters ->
#if DEBUG
//#else
printfn "%O escape %c seek" s c
#endif
s.wasEscaped <- false
seek c
notInQuote (next())
| _, _, false when Array.exists ((=) c) delimiters ->
#if DEBUG
//#else
printfn "%O escape %c append" s c
#endif
s.wasEscaped <- false
append c
notInQuote (next())
| _, _, true ->
#if DEBUG
//#else
printfn "%O escape %c seek, not delimiter" s c
#endif
s.wasEscaped <- false
if Some c <> quoteChar then
seek escapeChar
seek c
notInQuote (next())
| _, _, _ ->
#if DEBUG
//#else
printfn "%O escape %c append, not delimiter" s c
#endif
s.wasEscaped <- false
if Some c <> quoteChar then
append escapeChar
append c
notInQuote (next())
//| _ ->
// failwithf "invalid escape %c %O" c s
and inQuoteEscape (co:char option) =
if co.IsNone then
failwith "inQuoteEscape: escape character should paired with escaped character"
else
let c = co.Value
match c, s.fieldStarted, s.fieldNoQuoteNow with
//| _, false, _ when Array.exists ((=) c) delimiters ->
| _, false, _ when Some c = quoteChar ->
#if DEBUG
//#else
printfn "%O inQuoteEscape %c append" s c
#endif
s.wasEscaped <- false
append c
inQuote (next())
| _, _, _ when c = escapeChar ->
#if DEBUG
//#else
printfn "%O inQuoteEscape %c append" s c
#endif
s.wasEscaped <- false
append c
inQuote (next())
| 'r', _, _ ->
#if DEBUG
//#else
printfn "%O inQuoteEscape \%c append" s c
#endif
s.wasEscaped <- false
append '\r'
inQuote (next())
| 'n', _, _ ->
#if DEBUG
//#else
printfn "%O inQuoteEscape \%c append" s c
#endif
s.wasEscaped <- false
append '\n'
inQuote (next())
| _ ->
#if DEBUG
//#else
printfn "%O invalid inQuoteEscape %c " s c
#endif
failwith "Invalid inQuoteEscape"
and exitQuote (co:char option) = //leaveQuote
if co.IsNone then
//emptyNotIgnored ()
//printfn $"exitQuote emptyNotIgnored"
#if DEBUG
//#else
printfn $"exitQuote"
#endif
()
else
let c = co.Value
match c, s.fieldStarted, s.fieldNoQuoteNow with
| _, true, _
| _, _, false ->
failwithf "%O exitQuote should be end of a field %c" s c
| ' ', _, _ when Array.contains ' ' delimiters ->
#if DEBUG
//#else
printfn "%O exitQuote '%c' exitQuote" s c
#endif
reset ()
notInQuote (next())
| ' ', _, _ ->
#if DEBUG
//#else
printfn "%O exitQuote '%c' exitQuote" s c
#endif
exitQuote (next())
| _, _, _ when Array.exists ((=) c) delimiters ->
#if DEBUG
//#else
printfn "%O exitQuote %c add -> reset -> notInQuote, %A" s c s.result
#endif
//add ()
reset ()
notInQuote (next())
| _ ->
failwith "Field already closed"
notInQuote (next())
s.result |> Seq.toArray
let q2str (q:Expr) =
(Operators.unquote q).DecompiledReductions.[0]
let testBody f = (q2str f).Substring(10)
let inline test<'T when 'T: equality> ifForcePrint (f: Expr<unit -> 'T>) (subject:'T) =
try
let o = (Operators.eval f) ()
let diff = o = subject
if not diff || ifForcePrint then
printfn "[%b] %s | %A == %A" diff (testBody f) o subject
diff, f.Raw
with
| exn ->
let diff = exn.InnerException.Message = subject.ToString()
if not diff || ifForcePrint then
printfn "[%b] %s | %s == %A" diff (testBody f) exn.InnerException.Message subject
diff, f.Raw
let forcePrint = true
[
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, , """).[1] @>) " "
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, , """).[2] @>) " "
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456", """).[1] @>) "456"
test forcePrint (<@ fun () -> (parseLine [|','|] None None false """ffff, "456", """).[1] @>) " \"456\""
test forcePrint (<@ fun () -> (parseLine [|','|] None None false """ffff, "456", """).[2] @>) " "
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456"", """).[1] @>) "Field already closed"
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\"", """).[1] @>) "456\""
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"", """).[1] @>) "456\\\""
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"", """).[1] @>) "456\\\""
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"" , """).[1] @>) "456\\\""
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"" 1, """).[1] @>) "Field already closed"
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"", """).[2] @>) " "
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"", c""").[2] @>) " c"
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"", \c""").[2] @>) " \c"
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"",""").[2] @>) ""
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"" """).[2] @>) "Index was outside the bounds of the array."
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"" """).[0] @>) "ffff"
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ ffff , "456\\\"" """).[0] @>) " ffff "
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ ff\,ff , "456\\\"" """).[0] @>) " ff,ff "
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """\ ff\,ff , "456\\\"" """).[0] @>) "\ ff,ff "
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """\" ff\,\"ff , "456\\\"" """).[0] @>) "\" ff,\"ff "
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \" ff\,\"ff , "456\\\"" """).[0] @>) " \" ff,\"ff "
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \\\" ff\,\"ff , "456\\\"" """).[0]
@>) " \\\" ff,\"ff "
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \\" ff\,\"ff , "456\\\"" """).[0] @>) "Invalid quote occurance"
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \\\" ff\,\"ff , "456\\\"" """).Length @>) 2
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \\\" ff\,\"ff , "456\\\"" 1""").[0] @>) "Field already closed"
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \\\" ff\,\"ff , "456\\\"" \"1\" """).[0] @>) "Field already closed"
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """,,,,""").Length @>) 5
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \r\n """).[0] @>) """ \r\n """
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \r\n\\ """).[0] @>) """ \r\n\ """
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ "\r\n" """).[0] @>) "\r\n"
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ "\r\n\s" """).[0] @>) "Invalid inQuoteEscape"
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ "\r\n\\" """).[0] @>) "\r\n\\"
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ 123 456 """).[0] @>) ""
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ 123 456 """).[1] @>) "123"
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ 123 456 """).[2] @>) "456"
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ 123 456 """).[3] @>) ""
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ 123 456 " " """).[3] @>) " "
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ 123 456 " " 1 """).[3] @>) " "
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ "\r\n\\" 1""").[1] @>) "\r\n\\"
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ "\r\n\\" 1""").[2] @>) "1"
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ "\r\n\\" 1""").Length @>) 3
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ --test 123 "" \"\" " " """).Length @>) 7
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ --test 123 "" \"\" " " """).Length @>) 8
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None true """ --test 123 "" \"\" " " """).Length @>) 7
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None true """ --test 123 "" \" \" " " """).Length @>) 8
test forcePrint (<@ fun () -> (parseLine [|' '; ','|] (Some '"') None false """ \\\" ff\,\"ff , "456\\\"" \"1\" """).Length @>) 8
test forcePrint (<@ fun () -> (parseLine [|' '; ','|] (Some '"') None false """ \\ " ff\," gg , " """).[0] @>) "Invalid inQuoteEscape"
//test (<@ fun () -> (parseLine [|','|] (Some '"') None """ \\\" ff\,\"ff , "456\\\"" """).Length @>) 2
]
|> fun l ->
let ll = l |> List.mapi (fun i v -> i, fst v, snd v)
let failed = ll |> List.tryFind (fun (i, r, f) -> not r)
if failed.IsSome then
ll
|> List.find (fun (i,r,f) ->
let (j ,_, _) = failed.Value
j = i
)
|> fun (i,r,f) ->
printfn "================================="
testBody f |> printfn "%s"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment