Last active
April 8, 2025 06:07
-
-
Save ingted/5d8e8b29f0cb2d5fe80172652ddc2930 to your computer and use it in GitHub Desktop.
Parallel ultra lite F# CSV line parser
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#if INTERACTIVE | |
#r "nuget: Unquote, 7.0.0" | |
#endif | |
open Microsoft.FSharp.Quotations.Patterns | |
open Swensen.Unquote | |
open Swensen | |
open Microsoft.FSharp.Quotations | |
open System.Collections.Generic | |
type ParsingState = { | |
mutable inQuotes : bool | |
mutable wasEscaped : bool | |
mutable fieldStarted : bool | |
mutable fieldNoQuoteNow : bool | |
result : System.Collections.Generic.List<string> | |
} | |
with | |
static member Identity () = { | |
inQuotes = false | |
wasEscaped = false | |
fieldStarted = true | |
fieldNoQuoteNow = true | |
result = System.Collections.Generic.List<string>() | |
} | |
override this.ToString () = | |
$"inQ:{this.inQuotes}, escaped:{this.wasEscaped}, fStart:{this.fieldStarted}, noQNow:{this.fieldNoQuoteNow}" | |
let parseLine (delimiters: char[]) (quoteChar: char option) (escapeCharOpt:char option) (ifMergeDelimiters: bool) (inputLine: string) : string[] = | |
let escapeChar = if escapeCharOpt.IsNone then '\\' else escapeCharOpt.Value | |
let s = ParsingState.Identity () | |
let currentField = new System.Text.StringBuilder() | |
let seekQuote = new System.Text.StringBuilder() | |
let append (c:char) = currentField.Append c |> ignore | |
let appendStr (s:string) = currentField.Append s |> ignore | |
let appendEmpty () = currentField.Append "" |> ignore | |
let seek (c:char) = seekQuote.Append c |> ignore | |
let seekNotEmpty () = seekQuote.ToString().Trim() <> "" | |
let clear () = currentField.Clear() |> ignore | |
let ifSpaceDelimiter = Array.contains ' ' delimiters | |
//let line = seq {for i in 0..(inputLine.Length - 1) do yield inputLine.[i] } |> Seq.toArray | |
//let lineQ = new Queue<char>(line) | |
//let next() = | |
// if lineQ.Count = 0 then | |
// None | |
// else | |
// Some (lineQ.Dequeue()) | |
//------------------------------v.0 | |
let charSeq = seq { | |
for c in inputLine do | |
yield c | |
} | |
// 創建一個迭代器,以按需訪問序列中的元素 | |
let charEnumerator = charSeq.GetEnumerator() | |
// 定義 next 函數來返回序列中的下一個元素,或在沒有更多元素時返回 None | |
let next() = | |
if charEnumerator.MoveNext() then | |
Some(charEnumerator.Current) | |
else | |
None | |
//------------------------------v.1 | |
let reset () = | |
s.fieldStarted <- true | |
s.inQuotes <- false | |
s.fieldNoQuoteNow <- true | |
s.wasEscaped <- false | |
let add () = | |
let sqStr = | |
if s.fieldNoQuoteNow then | |
let sqStr = seekQuote.ToString() | |
//appendStr sqStr | |
sqStr | |
else | |
"" | |
seekQuote.Clear () |> ignore | |
let curF = currentField.ToString() | |
let cfStr = sqStr + curF | |
#if DEBUG | |
printfn "add: cfStr = '%s'+ '%s'" sqStr curF | |
printfn "add: '%s' result.Add -> currentField.clear -> reset" cfStr | |
#endif | |
s.result.Add cfStr |> ignore | |
currentField.Clear() |> ignore | |
reset () | |
let emptyNotIgnored () = | |
s.result.Add (seekQuote.ToString()) |> ignore | |
let addEmpty () = | |
s.result.Add "" |> ignore | |
currentField.Clear() |> ignore | |
reset () | |
//line |> Seq.iter (fun c -> lineQ.Enqueue c) | |
let rec inQuote (co:char option) = | |
if co.IsNone then | |
failwith "quote not finished" | |
else | |
let c = co.Value | |
match c, s.fieldStarted, s.wasEscaped with | |
| _, _, false when c = escapeChar -> //escape | |
#if DEBUG | |
//#else | |
printfn "%O inQuote %c inQuoteEscape" s c | |
#endif | |
s.wasEscaped <- true | |
inQuoteEscape (next()) | |
| _, _, false when Some c = quoteChar -> | |
#if DEBUG | |
//#else | |
printfn "%O inQuote %c append -> add -> clear -> exitQuote" s c | |
#endif | |
//append c | |
add () | |
clear () | |
s.fieldNoQuoteNow <- true | |
s.fieldStarted <- false | |
exitQuote (next()) | |
| _, _, _ when Array.exists ((=) c) delimiters -> | |
#if DEBUG | |
//#else | |
printfn "%O inQuote %c append -> inQuote" s c | |
#endif | |
append c | |
inQuote (next()) | |
| _, true, _ -> //quoted field begin | |
#if DEBUG | |
//#else | |
printfn "%O inQuote %c clear -> append -> inQuote" s c | |
#endif | |
clear () | |
append c | |
s.fieldStarted <- false | |
inQuote (next()) | |
| _, _, _ -> | |
#if DEBUG | |
//#else | |
printfn "%O inQuote %c append -> inQuote" s c | |
#endif | |
append c | |
inQuote (next()) | |
and notInQuote (co:char option) = | |
if co.IsNone then | |
add () | |
#if DEBUG | |
//#else | |
printfn "notInQuote, co.IsNone" | |
#endif | |
else | |
let c = co.Value | |
match c, s.fieldStarted, s.fieldNoQuoteNow with | |
| _, _, _ when c = escapeChar -> | |
#if DEBUG | |
//#else | |
printfn "%O notInQuote %c escape" s c | |
#endif | |
s.fieldStarted <- false | |
escape (next()) | |
| _, true, _ when Array.exists ((=) c) delimiters -> //field開頭立馬結束 | |
#if DEBUG | |
//#else | |
printfn "%O notInQuote %c delimiters, fieldStarted: true" s c | |
#endif | |
add () | |
//notInQuote (next()) | |
if ifMergeDelimiters && ifSpaceDelimiter && c = ' ' then | |
let rec skipSpaces (co: char option) = | |
match co with | |
| Some ' ' -> | |
#if DEBUG | |
//#else | |
printfn "cur is ' ' and if next is ' ' then skip it" | |
#endif | |
skipSpaces (next()) | |
| _ -> co | |
notInQuote (skipSpaces (next())) | |
else | |
notInQuote (next()) | |
| _, false, _ when Array.exists ((=) c) delimiters -> //field結束 | |
#if DEBUG | |
//#else | |
printfn "%O notInQuote %c delimiters, fieldStarted: false, %A" s c s.result | |
#endif | |
add () | |
if ifMergeDelimiters && ifSpaceDelimiter && c = ' ' then | |
let rec skipSpaces (co: char option) = | |
match co with | |
| Some ' ' -> skipSpaces (next()) | |
| _ -> co | |
notInQuote (skipSpaces (next())) | |
else | |
notInQuote (next()) | |
| ' ', true, _ -> | |
#if DEBUG | |
//#else | |
printfn "%O notInQuote %c empty character, %A" s c s.result | |
#endif | |
seek ' ' | |
s.fieldStarted <- false | |
notInQuote (next()) | |
| _, _, true when Some c = quoteChar -> | |
if seekNotEmpty () then | |
//printfn $"seekNotEmpty {inputLine}" | |
failwith "Invalid quote occurance" | |
else | |
#if DEBUG | |
//#else | |
printfn "%O notInQuote %c quote character -> inQuote" s c | |
#endif | |
s.fieldStarted <- true | |
s.fieldNoQuoteNow <- false | |
s.inQuotes <- true | |
inQuote (next()) | |
| _, true, true -> | |
#if DEBUG | |
//#else | |
printfn "%O notInQuote %c -> seek -> notInQuote, fieldStarted: true" s c | |
#endif | |
seek c | |
s.fieldStarted <- false | |
notInQuote (next()) | |
| _, false, true -> | |
#if DEBUG | |
//#else | |
printfn "%O notInQuote %c -> seek -> notInQuote, fieldStarted: false" s c | |
#endif | |
seek c | |
notInQuote (next()) | |
| _, false, _ -> | |
s.fieldStarted <- false | |
#if DEBUG | |
//#else | |
printfn "%O notInQuote %c -> added -> notInQuote" s c | |
#endif | |
add () | |
notInQuote (next()) | |
and escape (co:char option) = | |
if co.IsNone then | |
failwith "escape: escape character should paired with escaped character" | |
else | |
let c = co.Value | |
match c, s.fieldStarted, s.fieldNoQuoteNow with | |
| _, _, true when c = escapeChar -> | |
#if DEBUG | |
//#else | |
printfn "%O escape %c seek" s c | |
#endif | |
s.wasEscaped <- false | |
seek c | |
notInQuote (next()) | |
| _, _, false when c = escapeChar -> | |
#if DEBUG | |
//#else | |
printfn "%O escape %c append" s c | |
#endif | |
s.wasEscaped <- false | |
append c | |
notInQuote (next()) | |
| _, _, true when Array.exists ((=) c) delimiters -> | |
#if DEBUG | |
//#else | |
printfn "%O escape %c seek" s c | |
#endif | |
s.wasEscaped <- false | |
seek c | |
notInQuote (next()) | |
| _, _, false when Array.exists ((=) c) delimiters -> | |
#if DEBUG | |
//#else | |
printfn "%O escape %c append" s c | |
#endif | |
s.wasEscaped <- false | |
append c | |
notInQuote (next()) | |
| _, _, true -> | |
#if DEBUG | |
//#else | |
printfn "%O escape %c seek, not delimiter" s c | |
#endif | |
s.wasEscaped <- false | |
if Some c <> quoteChar then | |
seek escapeChar | |
seek c | |
notInQuote (next()) | |
| _, _, _ -> | |
#if DEBUG | |
//#else | |
printfn "%O escape %c append, not delimiter" s c | |
#endif | |
s.wasEscaped <- false | |
if Some c <> quoteChar then | |
append escapeChar | |
append c | |
notInQuote (next()) | |
//| _ -> | |
// failwithf "invalid escape %c %O" c s | |
and inQuoteEscape (co:char option) = | |
if co.IsNone then | |
failwith "inQuoteEscape: escape character should paired with escaped character" | |
else | |
let c = co.Value | |
match c, s.fieldStarted, s.fieldNoQuoteNow with | |
//| _, false, _ when Array.exists ((=) c) delimiters -> | |
| _, false, _ when Some c = quoteChar -> | |
#if DEBUG | |
//#else | |
printfn "%O inQuoteEscape %c append" s c | |
#endif | |
s.wasEscaped <- false | |
append c | |
inQuote (next()) | |
| _, _, _ when c = escapeChar -> | |
#if DEBUG | |
//#else | |
printfn "%O inQuoteEscape %c append" s c | |
#endif | |
s.wasEscaped <- false | |
append c | |
inQuote (next()) | |
| 'r', _, _ -> | |
#if DEBUG | |
//#else | |
printfn "%O inQuoteEscape \%c append" s c | |
#endif | |
s.wasEscaped <- false | |
append '\r' | |
inQuote (next()) | |
| 'n', _, _ -> | |
#if DEBUG | |
//#else | |
printfn "%O inQuoteEscape \%c append" s c | |
#endif | |
s.wasEscaped <- false | |
append '\n' | |
inQuote (next()) | |
| _ -> | |
#if DEBUG | |
//#else | |
printfn "%O invalid inQuoteEscape %c " s c | |
#endif | |
failwith "Invalid inQuoteEscape" | |
and exitQuote (co:char option) = //leaveQuote | |
if co.IsNone then | |
//emptyNotIgnored () | |
//printfn $"exitQuote emptyNotIgnored" | |
#if DEBUG | |
//#else | |
printfn $"exitQuote" | |
#endif | |
() | |
else | |
let c = co.Value | |
match c, s.fieldStarted, s.fieldNoQuoteNow with | |
| _, true, _ | |
| _, _, false -> | |
failwithf "%O exitQuote should be end of a field %c" s c | |
| ' ', _, _ when Array.contains ' ' delimiters -> | |
#if DEBUG | |
//#else | |
printfn "%O exitQuote '%c' exitQuote" s c | |
#endif | |
reset () | |
notInQuote (next()) | |
| ' ', _, _ -> | |
#if DEBUG | |
//#else | |
printfn "%O exitQuote '%c' exitQuote" s c | |
#endif | |
exitQuote (next()) | |
| _, _, _ when Array.exists ((=) c) delimiters -> | |
#if DEBUG | |
//#else | |
printfn "%O exitQuote %c add -> reset -> notInQuote, %A" s c s.result | |
#endif | |
//add () | |
reset () | |
notInQuote (next()) | |
| _ -> | |
failwith "Field already closed" | |
notInQuote (next()) | |
s.result |> Seq.toArray | |
let q2str (q:Expr) = | |
(Operators.unquote q).DecompiledReductions.[0] | |
let testBody f = (q2str f).Substring(10) | |
let inline test<'T when 'T: equality> ifForcePrint (f: Expr<unit -> 'T>) (subject:'T) = | |
try | |
let o = (Operators.eval f) () | |
let diff = o = subject | |
if not diff || ifForcePrint then | |
printfn "[%b] %s | %A == %A" diff (testBody f) o subject | |
diff, f.Raw | |
with | |
| exn -> | |
let diff = exn.InnerException.Message = subject.ToString() | |
if not diff || ifForcePrint then | |
printfn "[%b] %s | %s == %A" diff (testBody f) exn.InnerException.Message subject | |
diff, f.Raw | |
let forcePrint = true | |
[ | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, , """).[1] @>) " " | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, , """).[2] @>) " " | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456", """).[1] @>) "456" | |
test forcePrint (<@ fun () -> (parseLine [|','|] None None false """ffff, "456", """).[1] @>) " \"456\"" | |
test forcePrint (<@ fun () -> (parseLine [|','|] None None false """ffff, "456", """).[2] @>) " " | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456"", """).[1] @>) "Field already closed" | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\"", """).[1] @>) "456\"" | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"", """).[1] @>) "456\\\"" | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"", """).[1] @>) "456\\\"" | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"" , """).[1] @>) "456\\\"" | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"" 1, """).[1] @>) "Field already closed" | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"", """).[2] @>) " " | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"", c""").[2] @>) " c" | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"", \c""").[2] @>) " \c" | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"",""").[2] @>) "" | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"" """).[2] @>) "Index was outside the bounds of the array." | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ffff, "456\\\"" """).[0] @>) "ffff" | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ ffff , "456\\\"" """).[0] @>) " ffff " | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ ff\,ff , "456\\\"" """).[0] @>) " ff,ff " | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """\ ff\,ff , "456\\\"" """).[0] @>) "\ ff,ff " | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """\" ff\,\"ff , "456\\\"" """).[0] @>) "\" ff,\"ff " | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \" ff\,\"ff , "456\\\"" """).[0] @>) " \" ff,\"ff " | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \\\" ff\,\"ff , "456\\\"" """).[0] | |
@>) " \\\" ff,\"ff " | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \\" ff\,\"ff , "456\\\"" """).[0] @>) "Invalid quote occurance" | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \\\" ff\,\"ff , "456\\\"" """).Length @>) 2 | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \\\" ff\,\"ff , "456\\\"" 1""").[0] @>) "Field already closed" | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \\\" ff\,\"ff , "456\\\"" \"1\" """).[0] @>) "Field already closed" | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """,,,,""").Length @>) 5 | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \r\n """).[0] @>) """ \r\n """ | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ \r\n\\ """).[0] @>) """ \r\n\ """ | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ "\r\n" """).[0] @>) "\r\n" | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ "\r\n\s" """).[0] @>) "Invalid inQuoteEscape" | |
test forcePrint (<@ fun () -> (parseLine [|','|] (Some '"') None false """ "\r\n\\" """).[0] @>) "\r\n\\" | |
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ 123 456 """).[0] @>) "" | |
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ 123 456 """).[1] @>) "123" | |
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ 123 456 """).[2] @>) "456" | |
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ 123 456 """).[3] @>) "" | |
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ 123 456 " " """).[3] @>) " " | |
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ 123 456 " " 1 """).[3] @>) " " | |
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ "\r\n\\" 1""").[1] @>) "\r\n\\" | |
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ "\r\n\\" 1""").[2] @>) "1" | |
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ "\r\n\\" 1""").Length @>) 3 | |
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ --test 123 "" \"\" " " """).Length @>) 7 | |
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None false """ --test 123 "" \"\" " " """).Length @>) 8 | |
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None true """ --test 123 "" \"\" " " """).Length @>) 7 | |
test forcePrint (<@ fun () -> (parseLine [|' '|] (Some '"') None true """ --test 123 "" \" \" " " """).Length @>) 8 | |
test forcePrint (<@ fun () -> (parseLine [|' '; ','|] (Some '"') None false """ \\\" ff\,\"ff , "456\\\"" \"1\" """).Length @>) 8 | |
test forcePrint (<@ fun () -> (parseLine [|' '; ','|] (Some '"') None false """ \\ " ff\," gg , " """).[0] @>) "Invalid inQuoteEscape" | |
//test (<@ fun () -> (parseLine [|','|] (Some '"') None """ \\\" ff\,\"ff , "456\\\"" """).Length @>) 2 | |
] | |
|> fun l -> | |
let ll = l |> List.mapi (fun i v -> i, fst v, snd v) | |
let failed = ll |> List.tryFind (fun (i, r, f) -> not r) | |
if failed.IsSome then | |
ll | |
|> List.find (fun (i,r,f) -> | |
let (j ,_, _) = failed.Value | |
j = i | |
) | |
|> fun (i,r,f) -> | |
printfn "=================================" | |
testBody f |> printfn "%s" | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment