Skip to content

Instantly share code, notes, and snippets.

@ImaginaryDevelopment
Created October 5, 2023 17:42

Revisions

  1. ImaginaryDevelopment created this gist Oct 5, 2023.
    73 changes: 73 additions & 0 deletions scrape.fsx
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,73 @@
    // Walk two sets of data and remove the overlap.

    /// Lower-cases `x` using the invariant culture.
    let toLower (x:string) : string =
        x.ToLowerInvariant ()
    /// Returns the part of `value` that follows the last occurrence of `delimiter`.
    /// When `delimiter` does not occur in `value`, `value` is returned unchanged.
    let afterLast (delimiter:string) (value:string) =
        // Ordinal search: delimiters such as "\\" are matched character-for-character,
        // independent of the current culture.
        let idx = value.LastIndexOf(delimiter, System.StringComparison.Ordinal)
        if idx < 0 then
            value
        else
            // Skip the whole delimiter, not just one character, so multi-character
            // delimiters do not leak into the result.
            value.Substring(idx + delimiter.Length)

    // Photo entries: every line after the first two, with surrounding double quotes
    // trimmed and lower-cased, limited to the first 1,000 entries.
    let photos =
        File.ReadAllLines @"C:\Users\B\Documents\lancephotos.csv"
        |> Array.skip 2
        |> Array.map (fun line -> toLower (line.Trim '"'))
        |> Array.truncate 1_000

    // Student entries: every line after the first two, with surrounding double quotes
    // trimmed, lower-cased, and reduced to the text after the last backslash.
    let students =
        File.ReadAllLines @"C:\Users\B\Documents\lancestudents.csv"
        |> Array.skip 2
        |> Array.map (fun line -> line.Trim '"' |> toLower |> afterLast "\\")
        // |> Array.truncate 5

    //(photos,students).Dump()

    /// Converts a millisecond count to whole seconds using integer division,
    /// so any fractional second is dropped.
    let msToSeconds (ms: int64) = ms / 1000L

    // The raw data had quotes in it because it was not read as a CSV, which made the data bad.
    // Dump whether any photo / any student entry still contains a double quote.
    (photos |> Array.exists (fun p -> p.Contains "\""),
     students |> Array.exists (fun s -> s.Contains "\"")).Dump()
    printfn "%i photos, %i students" (Array.length photos) (Array.length students)

    // Total number of photos; the denominator for all progress calculations.
    let maxI = Array.length photos

    /// Formats `x` with the "N0" numeric format string (no decimal places).
    let commaChameleon (x:float) =
        x.ToString "N0"

    /// Builds the progress line shown while a comparison runs.
    ///   andMyFriends - total milliseconds elapsed so far
    ///   i            - number of items processed; divided by elapsed seconds for the rate
    ///   v            - count that is displayed and turned into a percentage of maxI
    let rateMe (andMyFriends:int64) i v =
        let seconds = msToSeconds andMyFriends
        // No rate is reported until a whole second has elapsed; this also avoids dividing by zero.
        let rate = if seconds > 0L then int64 i / seconds else 0L
        // Scale the fraction to a percentage, since the format string appends a '%' sign.
        let percent = 100.0 * float v / float maxI
        sprintf "Finished %s(%.2f%%) %A per second in %i seconds" (commaChameleon v) percent rate seconds

    /// Runs one timed filtering pass over `photos` and reports progress while it runs.
    ///   title      - label dumped ahead of the progress container and returned in the result
    ///   fStudents  - converts the outer `students` array into the lookup structure given to fPredicate
    ///   fPredicate - called as `fPredicate lookup photo`; photos for which it returns true are kept
    /// Returns (title, elapsed milliseconds, number of photos kept).
    let genericComparer (title:string) fStudents fPredicate =
    // The stopwatch starts before the lookup is built, so that conversion is part of the measured time.
    let timer = System.Diagnostics.Stopwatch.StartNew()
    // Number of photos the filter has visited so far.
    let mutable i = 0
    // Creates the DumpContainer once, dumps the title, and returns a function that
    // replaces the container's content with a fresh rateMe progress line.
    let dumpProgress =
    let dc = DumpContainer()
    dc.Dump(title)
    fun (v:int) -> dc.Content <- rateMe timer.ElapsedMilliseconds i v

    // Shadows the outer `students` with the converted lookup structure.
    let students = fStudents students
    let items =
    photos
    |> Seq.filter(fun photo ->
    // NOTE(review): indentation was lost in this copy. Presumably the two progress
    // lines belong to the `if` (an update every 500 photos) and the increment runs
    // for every photo; confirm against the original script.
    if i % 500 = 0 then
    Util.Progress <- i * 100 / maxI
    dumpProgress i
    i <- i + 1

    fPredicate students photo
    )
    // Seq.filter is lazy; materialising the array is what actually runs the pass.
    |> Array.ofSeq

    timer.Stop()
    // Final progress line with the finished count.
    dumpProgress i
    title, timer.ElapsedMilliseconds, items.Length

    /// Substring variant: builds a Set of the students and keeps a photo only when
    /// no student entry contains the photo text.
    let hashContains () =
        let noStudentContains (lookup: Set<string>) (photo: string) =
            not (Set.exists (fun (entry: string) -> entry.Contains photo) lookup)
        genericComparer "hashC" Set.ofArray noStudentContains

    /// Exact-match variant: builds a Set of the students and keeps a photo only when
    /// it is not a member of that set.
    let hash2 () =
        let isMissing (lookup: Set<string>) (photo: string) =
            not (Set.contains photo lookup)
        genericComparer "hash2" Set.ofArray isMissing

    /// Map variant: builds a Map keyed (and valued) by student entry and keeps a photo
    /// only when it is not a key of that map.
    let dic () =
        let toMap (roster: string[]) =
            seq { for entry in roster -> entry, entry } |> Map.ofSeq
        let isMissing lookup photo =
            not (Map.containsKey photo lookup)
        genericComparer "dic" toMap isMissing
    // Wrap each enabled variant in an async computation, run them all in parallel,
    // and block until every one has finished.
    // Disabled variants:
    //seqVersion
    //seqVersion2
    //hashVersion
    [ for comparer in [ hashContains; hash2; dic ] -> async { return comparer () } ]
    |> Async.Parallel
    |> Async.RunSynchronously