Skip to content

Instantly share code, notes, and snippets.

@pguyot
Last active February 27, 2026 06:41
Show Gist options
  • Select an option

  • Save pguyot/da327972f1ecdb7041c97addd4e76bb5 to your computer and use it in GitHub Desktop.

Select an option

Save pguyot/da327972f1ecdb7041c97addd4e76bb5 to your computer and use it in GitHub Desktop.
Script that scans top GitHub repositories and Hex packages to collect statistics on Erlang/OTP function usage (Module:Function/Arity call counts).
#!/usr/bin/env escript
%% -*- erlang -*-
%%! +A 4
-mode(compile).
-include_lib("kernel/include/file.hrl").
%% ============================================================================
%% beam_stats.escript — Scan Erlang/OTP Function Usage
%%
%% Scans top GitHub Erlang repos and Hex packages to find the most commonly
%% called Module:Function/Arity patterns across the ecosystem.
%% ============================================================================
-define(STATE_FILE, "beam_stats_state.bin").
-define(SAVE_INTERVAL, 10).
-define(GITHUB_PER_PAGE, 100).
-define(GITHUB_MAX_PER_QUERY, 1000). %% GitHub search API hard limit
-define(HEX_PER_PAGE, 100).
-define(HEX_MAX_PAGES, 100). %% up to 10000 hex packages
%% Entry point.
%%
%% Child mode ("--scan Dir ResultFile"): invoked by scan_directory/2 in a
%% fresh VM. Scans Dir for .erl files and writes the resulting stats map to
%% ResultFile as a compressed external term, then halts.
main(["--scan", Dir, ResultFile]) ->
%% Child VM mode: scan directory, write results to file, exit.
%% Each invocation gets its own atom table.
%% compiler/syntax_tools provide epp_dodger and erl_syntax used by the
%% scanning code below.
application:ensure_started(compiler),
application:ensure_started(syntax_tools),
Stats = do_scan_directory(Dir),
file:write_file(ResultFile, term_to_binary(Stats, [compressed])),
halt(0);
%% Parent mode: parse CLI options and run the full scan; exits 1 on bad args.
main(Args) ->
case parse_args(Args) of
{error, Msg} ->
io:format(standard_error, "Error: ~s~n", [Msg]),
usage(),
halt(1);
Opts ->
run(Opts)
end.
%% ============================================================================
%% Arg parsing
%% ============================================================================
%% Parse command-line arguments into an options map, starting from the
%% defaults. Returns the map, {error, Msg}, or halts (for --help).
parse_args(Args) ->
parse_args(Args, default_opts()).
%% Default option map used as the seed for CLI parsing.
%% limit => infinity means "no cap on repos/packages scanned".
default_opts() ->
    maps:from_list([
        {workers, 4},
        {github, true},
        {hex, true},
        {output, "beam_stats.csv"},
        {top, 100},
        {limit, infinity},
        {resume, false}
    ]).
%% Fold command-line tokens into the options map.
%% Returns the final map, or {error, Msg} for unknown options or malformed
%% integer values; --help prints usage and halts.
parse_args([], Opts) ->
    Opts;
parse_args(["--workers", N | Rest], Opts) ->
    case parse_pos_int(N) of
        {ok, V} -> parse_args(Rest, Opts#{workers => V});
        error -> {error, "Invalid --workers value: " ++ N}
    end;
parse_args(["--github-only" | Rest], Opts) ->
    parse_args(Rest, Opts#{hex => false});
parse_args(["--hex-only" | Rest], Opts) ->
    parse_args(Rest, Opts#{github => false});
parse_args(["--output", File | Rest], Opts) ->
    parse_args(Rest, Opts#{output => File});
parse_args(["--top", N | Rest], Opts) ->
    case parse_pos_int(N) of
        {ok, V} -> parse_args(Rest, Opts#{top => V});
        error -> {error, "Invalid --top value: " ++ N}
    end;
parse_args(["--limit", N | Rest], Opts) ->
    case parse_pos_int(N) of
        {ok, V} -> parse_args(Rest, Opts#{limit => V});
        error -> {error, "Invalid --limit value: " ++ N}
    end;
parse_args(["--resume" | Rest], Opts) ->
    parse_args(Rest, Opts#{resume => true});
parse_args(["--help" | _], _Opts) ->
    usage(),
    halt(0);
parse_args([Unknown | _], _Opts) ->
    {error, "Unknown option: " ++ Unknown}.

%% Parse a strictly positive integer from a string: {ok, V} | error.
%% Replaces the original old-style `catch list_to_integer(N)` expression,
%% which conflates throws/exits with errors and loses the stacktrace;
%% try/catch on error:badarg is the idiomatic form.
parse_pos_int(S) ->
    try list_to_integer(S) of
        V when V > 0 -> {ok, V};
        _ -> error
    catch
        error:badarg -> error
    end.
%% Print CLI usage to standard output. Keep the option list in sync with
%% the parse_args/2 clauses above.
usage() ->
io:format(
"Usage: beam_stats.escript [OPTIONS]~n"
"~n"
"Options:~n"
" --workers N Number of parallel workers (default: 4)~n"
" --github-only Only scan GitHub repos~n"
" --hex-only Only scan Hex packages~n"
" --output FILE Output CSV file (default: beam_stats.csv)~n"
" --top N Show top N results in terminal (default: 100)~n"
" --limit N Max number of repos/packages to scan~n"
" --resume Resume from saved state~n"
" --help Show this help~n"
).
%% ============================================================================
%% Main run
%% ============================================================================
%% Main driver: start HTTP apps, load or initialize resume state, fetch the
%% work list from GitHub and Hex, dedupe it, run the worker pool, then
%% write CSV output and print a terminal summary.
run(Opts) ->
start_applications(),
io:format("beam_stats: Scanning Erlang/OTP function usage~n"),
io:format("Workers: ~p~n~n", [maps:get(workers, Opts)]),
%% Load or initialize state: {set of scanned work keys, stats map, count}.
{Scanned, Stats, TotalProcessed} = case maps:get(resume, Opts) of
true -> load_state();
false -> {sets:new([{version, 2}]), #{}, 0}
end,
%% Fetch work items. GitHub is fetched first so the remaining --limit
%% budget can be given to Hex.
Limit = maps:get(limit, Opts),
GithubRepos = case maps:get(github, Opts) of
true -> fetch_github_repos(Limit);
false -> []
end,
HexLeft = case Limit of
infinity -> infinity;
_ -> max(0, Limit - length(GithubRepos))
end,
HexPackages = case maps:get(hex, Opts) of
true -> fetch_hex_packages(HexLeft);
false -> []
end,
%% Deduplicate: drop Hex packages whose source repo is already in the
%% GitHub list.
{Repos, Packages} = deduplicate(GithubRepos, HexPackages),
io:format("~nWork items: ~p GitHub repos, ~p Hex packages~n",
[length(Repos), length(Packages)]),
%% Build work list: [{Type, Item}, ...]
Work0 = [{github, R} || R <- Repos] ++ [{hex, P} || P <- Packages],
%% Filter already scanned items (resume support).
Work = lists:filter(fun({Type, Item}) ->
Key = work_key(Type, Item),
not sets:is_element(Key, Scanned)
end, Work0),
io:format("Items to scan: ~p (skipping ~p already scanned)~n~n",
[length(Work), length(Work0) - length(Work)]),
%% Run coordinator + worker pool; blocks until all work is done.
FinalStats = run_coordinator(Work, Scanned, Stats, TotalProcessed, Opts),
%% Output
OutputFile = maps:get(output, Opts),
write_csv(OutputFile, FinalStats),
Top = maps:get(top, Opts),
print_summary(Top, FinalStats),
io:format("~nResults written to ~s~n", [OutputFile]).
%% ============================================================================
%% Applications
%% ============================================================================
%% Start the HTTP client stack (inets + ssl) and tune httpc connection
%% pooling. Start failures are deliberately ignored — the original matched
%% both {ok, _} and anything else to ok — so e.g. a GitHub-only scan (which
%% shells out to `gh` rather than using httpc) still works.
start_applications() ->
    _ = application:ensure_all_started(inets),
    _ = application:ensure_all_started(ssl),
    %% Configure httpc
    httpc:set_options([{max_sessions, 8}, {max_keep_alive_length, 16}]),
    ok.
%% ============================================================================
%% GitHub repo fetching via gh CLI
%% ============================================================================
%% Fetch up to Limit Erlang repos from GitHub, most-starred first.
%% Requires the `gh` CLI to be installed and authenticated (see
%% fetch_github_pages/5, which shells out to `gh api`).
fetch_github_repos(Limit) ->
io:format("Fetching GitHub repos...~n"),
%% Cursor-based pagination using star ranges. Fetch the top 1000 by stars,
%% then use the lowest star count as the upper bound for the next batch.
%% This bypasses GitHub's 1000-result-per-query limit with minimal API calls.
Max = case Limit of
infinity -> ?GITHUB_MAX_PER_QUERY * 15; %% ~15000 repos max
_ -> Limit
end,
Repos = fetch_github_cursor(infinity, [], Max),
io:format(" Total: ~p GitHub repos~n", [length(Repos)]),
Repos.
%% Star-range cursor loop: repeatedly query "stars:1..HiStars" and lower
%% HiStars below the minimum star count seen, until Max repos are collected,
%% results run out, or no progress is made.
%% NOTE(review): length(Acc) in a guard is O(n) per clause check; fine at
%% this scale (<= ~15000 items) but a count accumulator would be cheaper.
fetch_github_cursor(_HiStars, Acc, Max) when length(Acc) >= Max ->
lists:sublist(Acc, Max);
fetch_github_cursor(HiStars, Acc, Max) ->
%% infinity means "no upper bound yet" (first batch).
Range = case HiStars of
infinity -> ">=1";
N -> lists:flatten(io_lib:format("1..~p", [N]))
end,
Remaining = Max - length(Acc),
Fetch = min(Remaining, ?GITHUB_MAX_PER_QUERY),
io:format(" stars:~s ...", [Range]),
{Repos, TotalCount} = fetch_github_query(Range, Fetch),
io:format(" ~p repos (of ~p available)~n", [length(Repos), TotalCount]),
case Repos of
[] ->
Acc;
_ ->
NewAcc = Acc ++ Repos,
%% Next cursor: strictly below the lowest star count in this batch.
MinStars = lists:min([maps:get(stars, R) || R <- Repos]),
NextHi = MinStars - 1,
case length(NewAcc) >= Max of
true ->
lists:sublist(NewAcc, Max);
false when NextHi < 1 ->
%% Can't go lower than 1 star
NewAcc;
false when HiStars =/= infinity, NextHi >= HiStars ->
%% No progress (all repos have same star count), stop
NewAcc;
false ->
fetch_github_cursor(NextHi, NewAcc, Max)
end
end.
%% Fetch up to Max repos for a single query (max 1000).
%% Returns {Repos, TotalCount}.
%% StarRange is a GitHub search qualifier value such as ">=1" or "1..500".
fetch_github_query(StarRange, Max) ->
Query = lists:flatten("language:Erlang stars:" ++ StarRange),
Limit = min(Max, ?GITHUB_MAX_PER_QUERY),
fetch_github_pages(Query, 1, [], Limit, 0).
%% Page through a single GitHub search query via `gh api`, accumulating
%% repo maps (full_name/clone_url/html_url/stars). TC carries the API's
%% reported total_count. Stops at Max results, at GitHub's hard 1000-result
%% page ceiling, on empty output from gh, or on undecodable JSON.
fetch_github_pages(_Query, _Page, Acc, Max, TC) when length(Acc) >= Max ->
{lists:sublist(lists:reverse(Acc), Max), TC};
fetch_github_pages(_Query, Page, Acc, _Max, TC) when Page > (?GITHUB_MAX_PER_QUERY div ?GITHUB_PER_PAGE) ->
{lists:reverse(Acc), TC};
fetch_github_pages(Query, Page, Acc, Max, TC) ->
%% Shell out to the gh CLI; stderr is discarded so auth/rate-limit noise
%% does not pollute the parsed output.
Cmd = io_lib:format(
"gh api search/repositories "
"-X GET "
"-f q='~s' "
"-f sort='stars' "
"-f order='desc' "
"-f per_page='~p' "
"-f page='~p' "
"2>/dev/null",
[Query, ?GITHUB_PER_PAGE, Page]
),
case os:cmd(lists:flatten(Cmd)) of
[] ->
{lists:reverse(Acc), TC};
Output ->
%% NOTE(review): old-style `catch` swallows all error classes from
%% json:decode; a try/catch on error:_ would be the idiomatic form.
case catch json:decode(unicode:characters_to_binary(Output)) of
#{<<"total_count">> := NewTC, <<"items">> := Items} when is_list(Items), length(Items) > 0 ->
Repos = lists:map(fun(Item) ->
#{
full_name => binary_to_list(maps:get(<<"full_name">>, Item)),
clone_url => binary_to_list(maps:get(<<"clone_url">>, Item)),
html_url => binary_to_list(maps:get(<<"html_url">>, Item)),
stars => maps:get(<<"stargazers_count">>, Item, 0)
}
end, Items),
%% Acc is built newest-first; reverse each page before prepending
%% so the final lists:reverse/1 restores API order.
fetch_github_pages(Query, Page + 1, lists:reverse(Repos) ++ Acc, Max, NewTC);
_ ->
{lists:reverse(Acc), TC}
end
end.
%% ============================================================================
%% Hex package fetching via httpc
%% ============================================================================
%% Fetch up to Limit packages from the Hex API, most-downloaded first,
%% capped at ?HEX_MAX_PAGES * ?HEX_PER_PAGE overall.
fetch_hex_packages(Limit) ->
Max = case Limit of
infinity -> ?HEX_MAX_PAGES * ?HEX_PER_PAGE;
_ -> min(Limit, ?HEX_MAX_PAGES * ?HEX_PER_PAGE)
end,
io:format("Fetching Hex packages (up to ~p)...~n", [Max]),
fetch_hex_pages(1, [], Max).
%% Page through https://hex.pm/api/packages, turning each JSON item into a
%% map with name/version/github_url/downloads. Packages without a release
%% version are dropped. Stops at the page cap, at Max collected packages,
%% on an empty page, or on any HTTP/JSON error (returning what was gathered).
fetch_hex_pages(Page, Acc, Max) when Page > ?HEX_MAX_PAGES; length(Acc) >= Max ->
Packages = lists:sublist(lists:reverse(Acc), Max),
io:format(" Found ~p Hex packages~n", [length(Packages)]),
Packages;
fetch_hex_pages(Page, Acc, Max) ->
Url = io_lib:format(
"https://hex.pm/api/packages?sort=total_downloads&per_page=~p&page=~p",
[?HEX_PER_PAGE, Page]
),
case http_get(lists:flatten(Url)) of
{ok, Body} ->
%% NOTE(review): old-style `catch` around json:decode — a try/catch
%% on error:_ would be the idiomatic form.
case catch json:decode(Body) of
Items when is_list(Items), length(Items) > 0 ->
Packages = lists:filtermap(fun(Item) ->
Name = binary_to_list(maps:get(<<"name">>, Item, <<>>)),
Meta = maps:get(<<"meta">>, Item, #{}),
Links = maps:get(<<"links">>, Meta, #{}),
GithubUrl = find_github_link(Links),
%% The API lists releases newest-first; take the head as latest.
%% TODO confirm ordering guarantee against the Hex API docs.
Releases = maps:get(<<"releases">>, Item, []),
LatestVersion = case Releases of
[#{<<"version">> := V} | _] -> binary_to_list(V);
_ -> ""
end,
Downloads = maps:get(<<"downloads">>, Item, #{}),
TotalDownloads = case Downloads of
#{<<"all">> := D} -> D;
_ -> 0
end,
case LatestVersion of
"" -> false;
_ ->
{true, #{
name => Name,
version => LatestVersion,
github_url => GithubUrl,
downloads => TotalDownloads
}}
end
end, Items),
io:format(" Page ~p: ~p packages~n", [Page, length(Packages)]),
fetch_hex_pages(Page + 1, lists:reverse(Packages) ++ Acc, Max);
Items when is_list(Items), Items =:= [] ->
io:format(" Page ~p: no more results~n", [Page]),
lists:reverse(Acc);
Error ->
io:format(" Page ~p: JSON decode error: ~p~n", [Page, Error]),
lists:reverse(Acc)
end;
{error, Reason} ->
io:format(" Page ~p: HTTP error: ~p~n", [Page, Reason]),
lists:reverse(Acc)
end.
%% Pick a GitHub URL out of a Hex "links" map (label => URL binary).
%% Returns the URL as a string, or "" when no value mentions "github.com"
%% or the argument is not a map. Map fold order is undefined, so with
%% several GitHub links any one of them may be returned.
find_github_link(Links) when is_map(Links) ->
    Pick = fun
        (_Label, UrlBin, "") ->
            Url = binary_to_list(UrlBin),
            case string:find(Url, "github.com") of
                nomatch -> "";
                _Found -> Url
            end;
        (_Label, _UrlBin, AlreadyFound) ->
            %% A match was already found on an earlier key; keep it.
            AlreadyFound
    end,
    maps:fold(Pick, "", Links);
find_github_link(_NotAMap) ->
    "".
%% ============================================================================
%% Deduplication
%% ============================================================================
%% Remove Hex packages whose GitHub link points at a repo that is already
%% in the GitHub work list, so no project is scanned twice.
%% Returns {GithubRepos unchanged, filtered HexPackages}.
deduplicate(GithubRepos, HexPackages) ->
    %% Set of normalized GitHub URLs for everything fetched from GitHub.
    KnownUrls = sets:from_list(
        [normalize_github_url(maps:get(html_url, Repo)) || Repo <- GithubRepos],
        [{version, 2}]),
    %% Keep a package when it has no GitHub link, or its link is unknown.
    KeepPackage = fun(Pkg) ->
        case maps:get(github_url, Pkg) of
            "" -> true;
            Url -> not sets:is_element(normalize_github_url(Url), KnownUrls)
        end
    end,
    {GithubRepos, lists:filter(KeepPackage, HexPackages)}.
%% Canonicalize a GitHub URL for comparison: drop the http(s) scheme,
%% a trailing ".git", trailing slashes, and lowercase the rest.
normalize_github_url(Url) ->
    WithoutScheme = re:replace(Url, "^https?://", "", [{return, list}]),
    WithoutGitSuffix = re:replace(WithoutScheme, "\\.git$", "", [{return, list}]),
    string:lowercase(string:trim(WithoutGitSuffix, trailing, "/")).
%% ============================================================================
%% Coordinator + Worker Pool
%% ============================================================================
%% Spawn one coordinator process plus NumWorkers worker processes, then
%% block until the coordinator reports the merged final stats.
%% All processes are linked to the caller, so a crash anywhere takes the
%% whole script down (acceptable for a batch tool with resumable state).
run_coordinator(Work, Scanned, Stats, TotalProcessed, Opts) ->
NumWorkers = maps:get(workers, Opts),
%% Total including already-processed items, for the progress display.
TotalWork = length(Work) + TotalProcessed,
Self = self(),
CoordPid = spawn_link(fun() ->
coordinator_loop(#{
work => Work,
scanned => Scanned,
stats => Stats,
total_processed => TotalProcessed,
total_work => TotalWork,
since_save => 0,
active_workers => NumWorkers,
parent => Self
})
end),
%% Spawn workers
lists:foreach(fun(_) ->
spawn_link(fun() -> worker_loop(CoordPid) end)
end, lists:seq(1, NumWorkers)),
%% Wait for coordinator to finish
receive
{coordinator_done, FinalStats} -> FinalStats
end.
%% Coordinator process: hands out work items on {get_work, Pid}, merges
%% {result, Key, Stats} messages into the global state (saving it every
%% ?SAVE_INTERVAL results), and counts workers out via {worker_done, Pid}.
%% When the last worker exits, it saves state and sends
%% {coordinator_done, Stats} to the parent.
coordinator_loop(State) ->
receive
{get_work, WorkerPid} ->
case maps:get(work, State) of
[] ->
WorkerPid ! no_more_work,
coordinator_loop(State);
[Item | Rest] ->
WorkerPid ! {work, Item},
coordinator_loop(State#{work => Rest})
end;
{result, Key, RepoStats} ->
#{
scanned := Scanned,
stats := Stats,
total_processed := TP,
total_work := TW,
since_save := SS
} = State,
NewScanned = sets:add_element(Key, Scanned),
NewStats = merge_repo_stats(RepoStats, Stats),
NewTP = TP + 1,
NewSS = SS + 1,
%% \r rewrites the same terminal line to animate progress in place.
io:format("\r Progress: ~p/~p (~.1f%) ",
[NewTP, TW, NewTP / max(1, TW) * 100]),
%% Save state periodically so --resume loses at most ?SAVE_INTERVAL
%% items of work.
NewSS2 = case NewSS >= ?SAVE_INTERVAL of
true ->
save_state(NewScanned, NewStats, NewTP),
0;
false ->
NewSS
end,
coordinator_loop(State#{
scanned => NewScanned,
stats => NewStats,
total_processed => NewTP,
since_save => NewSS2
});
{worker_done, _WorkerPid} ->
#{
active_workers := AW,
stats := Stats,
scanned := Scanned,
total_processed := TP,
parent := Parent
} = State,
NewAW = AW - 1,
case NewAW of
0 ->
%% All workers done — save final state and report
save_state(Scanned, Stats, TP),
io:format("~n"),
Parent ! {coordinator_done, Stats};
_ ->
coordinator_loop(State#{active_workers => NewAW})
end
end.
%% Worker process: pull one item from the coordinator, process it (GitHub
%% clone or Hex tarball), report the per-repo stats, and repeat until
%% no_more_work. Any exception during processing is logged and reported
%% as an empty stats map so one bad repo cannot kill the pool.
worker_loop(CoordPid) ->
CoordPid ! {get_work, self()},
receive
{work, {Type, Item}} ->
Key = work_key(Type, Item),
RepoStats = try
case Type of
github -> process_github_repo(Item);
hex -> process_hex_package(Item)
end
catch
%% _:Reason catches errors, throws and exits alike.
_:Reason ->
io:format("~n Error processing ~s: ~p~n", [Key, Reason]),
#{}
end,
CoordPid ! {result, Key, RepoStats},
worker_loop(CoordPid);
no_more_work ->
CoordPid ! {worker_done, self()},
ok
end.
%% Stable string identity for a work item ("github:Owner/Repo" or
%% "hex:package"), used for dedup and resume bookkeeping.
work_key(github, #{full_name := RepoFullName}) ->
    lists:append("github:", RepoFullName);
work_key(hex, #{name := PackageName}) ->
    lists:append("hex:", PackageName).
%% ============================================================================
%% Process GitHub repo
%% ============================================================================
%% Shallow-clone a GitHub repo into a temp dir, scan it for call stats,
%% and always clean the temp dir up afterwards. Returns the stats map
%% (#{} on clone failure).
process_github_repo(Repo) ->
    CloneUrl = maps:get(clone_url, Repo),
    FullName = maps:get(full_name, Repo),
    TmpDir = make_temp_dir("gh_"),
    try
        %% Shallow clone; stderr is discarded.
        %% NOTE(review): CloneUrl comes from the GitHub API and is
        %% single-quoted here — a URL containing a quote would break the
        %% shell command; confirm whether that is a concern.
        Cmd = io_lib:format(
            "git clone --depth 1 --quiet '~s' '~s' 2>/dev/null",
            [CloneUrl, TmpDir]
        ),
        %% The command output is irrelevant; success is judged by the
        %% directory check below. The original wrapped this in a
        %% one-branch `case ... of _ ->`, which is a no-op construct —
        %% `_ = os:cmd(...)` says the same thing directly.
        _ = os:cmd(lists:flatten(Cmd)),
        case filelib:is_dir(TmpDir) of
            true ->
                scan_directory(TmpDir, FullName);
            false ->
                #{}
        end
    after
        rm_rf(TmpDir)
    end.
%% ============================================================================
%% Process Hex package
%% ============================================================================
%% Download a Hex package tarball (Name-Version.tar from repo.hex.pm) and
%% scan its sources. Returns the stats map, or #{} on download failure.
process_hex_package(Package) ->
Name = maps:get(name, Package),
Version = maps:get(version, Package),
Url = lists:flatten(io_lib:format(
"https://repo.hex.pm/tarballs/~s-~s.tar",
[Name, Version]
)),
case http_get_binary(Url) of
{ok, TarBin} ->
process_hex_tarball(TarBin, Name);
{error, Reason} ->
io:format("~n Failed to download ~s-~s: ~p~n", [Name, Version, Reason]),
#{}
end.
%% Unpack a Hex package tarball in memory. Hex tarballs are an outer tar
%% whose "contents.tar.gz" entry holds the actual sources; the table of
%% contents is inspected first so packages with no .erl files (pure Elixir,
%% docs-only, etc.) are skipped without touching the filesystem.
%% Returns the stats map, or #{} when there is nothing to scan or
%% extraction fails.
process_hex_tarball(TarBin, Name) ->
%% Extract outer tar from binary
case erl_tar:extract({binary, TarBin}, [memory]) of
{ok, OuterFiles} ->
%% Find contents.tar.gz
case lists:keyfind("contents.tar.gz", 1, OuterFiles) of
{"contents.tar.gz", ContentsTarGz} ->
%% Check if there are .erl files before extracting
case erl_tar:table({binary, ContentsTarGz}, [compressed]) of
{ok, FileList} ->
HasErl = lists:any(fun(F) ->
filename:extension(F) =:= ".erl"
end, FileList),
case HasErl of
true ->
extract_and_scan_hex(ContentsTarGz, Name);
false ->
#{}
end;
_ ->
#{}
end;
false ->
#{}
end;
{error, Reason} ->
io:format("~n Failed to extract ~s tarball: ~p~n", [Name, Reason]),
#{}
end.
%% Extract a package's contents.tar.gz into a temp dir, scan it, and
%% always clean up the temp dir. Returns the stats map (#{} on failure).
extract_and_scan_hex(ContentsTarGz, Name) ->
TmpDir = make_temp_dir("hex_"),
try
case erl_tar:extract({binary, ContentsTarGz}, [{cwd, TmpDir}, compressed]) of
ok ->
scan_directory(TmpDir, "hex:" ++ Name);
{error, Reason} ->
io:format("~n Failed to extract ~s contents: ~p~n", [Name, Reason]),
#{}
end
after
rm_rf(TmpDir)
end.
%% ============================================================================
%% Directory scanning — spawns a separate VM to avoid atom table exhaustion
%% ============================================================================
%% Scan Dir for call statistics by re-invoking this very escript in a
%% child VM (main/1's "--scan" clause). The child writes its result as a
%% compressed term file inside Dir, which is read back and decoded here.
%% Running each scan in a fresh VM bounds atom-table growth from parsing
%% arbitrary third-party sources. Returns #{} on any failure.
scan_directory(Dir, _Label) ->
Script = escript:script_name(),
%% Result file lives inside Dir (it has a .bin extension, so the child's
%% .erl scan never picks it up) and is deleted after reading.
ResultFile = filename:join(Dir, ".beam_stats_result.bin"),
Cmd = lists:flatten(io_lib:format(
"escript '~s' --scan '~s' '~s'",
[Script, Dir, ResultFile]
)),
os:cmd(Cmd),
case file:read_file(ResultFile) of
{ok, Bin} ->
file:delete(ResultFile),
%% The file was written by our own child VM; decode failures just
%% yield an empty result rather than crashing the worker.
try binary_to_term(Bin)
catch _:_ -> #{}
end;
{error, _} ->
#{}
end.
%% ============================================================================
%% Scanning code (runs in child VM via --scan)
%% ============================================================================
%% Child-VM entry: fold every .erl file under Dir into a single map of
%% {Module, Function, Arity} => call count. Files that fail to parse are
%% silently skipped.
do_scan_directory(Dir) ->
    Accumulate = fun(File, Acc) ->
        case parse_file(File) of
            {ok, Calls} -> merge_file_calls(Calls, Acc);
            {error, _Reason} -> Acc
        end
    end,
    lists:foldl(Accumulate, #{}, find_erl_files(Dir)).
%% filelib:fold_files follows symlinks and can loop forever on repos
%% with circular symlinks (e.g. _checkouts/dep -> ../../..).
%% This version skips symlinks.
%% Recursively collect every regular *.erl file under Dir.
%% Uses file:read_link_info/1 (which does NOT follow links), so symlinks —
%% including circular ones like _checkouts/dep -> ../../.. — are skipped,
%% as are unreadable entries.
find_erl_files(RootDir) ->
    find_erl_files(RootDir, []).

find_erl_files(Dir, Acc0) ->
    case file:list_dir(Dir) of
        {error, _Reason} ->
            %% Unreadable directory: keep whatever was gathered so far.
            Acc0;
        {ok, Entries} ->
            lists:foldl(
                fun(Entry, Acc) ->
                    collect_entry(filename:join(Dir, Entry), Acc)
                end,
                Acc0, Entries)
    end.

%% Dispatch on the (non-followed) file type of a single directory entry.
collect_entry(Path, Acc) ->
    case file:read_link_info(Path) of
        {ok, #file_info{type = directory}} ->
            find_erl_files(Path, Acc);
        {ok, #file_info{type = regular}} ->
            case filename:extension(Path) of
                ".erl" -> [Path | Acc];
                _ -> Acc
            end;
        _SymlinkOrError ->
            %% symlink, device, or stat error — skip
            Acc
    end.
%% Parse one .erl file with epp_dodger (tolerant of unexpandable macros,
%% unlike epp) and extract its remote-call counts.
%% Returns {ok, #{MFA => Count}} or {error, Reason}. The outer try converts
%% any crash while walking the syntax tree into {error, _} so one broken
%% file cannot abort the whole directory scan.
parse_file(File) ->
try
case epp_dodger:parse_file(File) of
{ok, Forms} ->
Calls = lists:foldl(fun(Form, Acc) ->
extract_calls(Form, Acc)
end, #{}, Forms),
{ok, Calls};
{error, Reason} ->
{error, Reason}
end
catch
_:Err ->
{error, Err}
end.
%% Walk one top-level form, accumulating counts for the two node types
%% that name a remote function: applications (m:f(Args), or f(Args) for
%% BIFs) and implicit funs (fun m:f/A).
extract_calls(Form, Acc) ->
erl_syntax_lib:fold(fun(Node, A) ->
case erl_syntax:type(Node) of
application ->
extract_application_call(Node, A);
implicit_fun ->
extract_implicit_fun(Node, A);
_ ->
A
end
end, Acc, Form).
%% Count a function application node.
%% Mod:Fun(Args) with literal atoms counts as {Mod, Fun, Arity}.
%% A bare Fun(Args) counts as {erlang, Fun, Arity} only when it is an
%% auto-imported BIF (per erl_internal:bif/2); other local calls are
%% ignored, as are dynamic calls (variable module or function).
extract_application_call(Node, Acc) ->
Op = erl_syntax:application_operator(Node),
Args = erl_syntax:application_arguments(Node),
Arity = length(Args),
case erl_syntax:type(Op) of
module_qualifier ->
ModNode = erl_syntax:module_qualifier_argument(Op),
FunNode = erl_syntax:module_qualifier_body(Op),
case {erl_syntax:type(ModNode), erl_syntax:type(FunNode)} of
{atom, atom} ->
Mod = erl_syntax:atom_value(ModNode),
Fun = erl_syntax:atom_value(FunNode),
Key = {Mod, Fun, Arity},
maps:update_with(Key, fun(V) -> V + 1 end, 1, Acc);
_ ->
%% Dynamic module or function — not statically countable.
Acc
end;
atom ->
Fun = erl_syntax:atom_value(Op),
case erl_internal:bif(Fun, Arity) of
true ->
Key = {erlang, Fun, Arity},
maps:update_with(Key, fun(V) -> V + 1 end, 1, Acc);
false ->
%% NOTE: a local function that shadows a BIF name would be
%% miscounted as erlang:F/A by the clause above — acceptable
%% noise for ecosystem-wide statistics.
Acc
end;
_ ->
Acc
end.
%% Count an implicit fun node of the form `fun Mod:Fun/Arity` with literal
%% module, function and arity. Local funs (`fun f/1`) and dynamic forms
%% (`fun M:F/A` with variables) are ignored.
extract_implicit_fun(Node, Acc) ->
Name = erl_syntax:implicit_fun_name(Node),
case erl_syntax:type(Name) of
module_qualifier ->
ModNode = erl_syntax:module_qualifier_argument(Name),
Body = erl_syntax:module_qualifier_body(Name),
case erl_syntax:type(Body) of
arity_qualifier ->
FunNode = erl_syntax:arity_qualifier_body(Body),
ArityNode = erl_syntax:arity_qualifier_argument(Body),
case {erl_syntax:type(ModNode), erl_syntax:type(FunNode), erl_syntax:type(ArityNode)} of
{atom, atom, integer} ->
Mod = erl_syntax:atom_value(ModNode),
Fun = erl_syntax:atom_value(FunNode),
Arity = erl_syntax:integer_value(ArityNode),
Key = {Mod, Fun, Arity},
maps:update_with(Key, fun(V) -> V + 1 end, 1, Acc);
_ ->
Acc
end;
_ ->
Acc
end;
_ ->
Acc
end.
%% Add one file's per-MFA call counts into the repo-wide accumulator map.
merge_file_calls(FileCalls, RepoAcc) ->
    Add = fun(MFA, Count, Acc) ->
        Acc#{MFA => maps:get(MFA, Acc, 0) + Count}
    end,
    maps:fold(Add, RepoAcc, FileCalls).
%% ============================================================================
%% Stats merging
%% ============================================================================
%% Merge one repo's call counts into the global stats map.
%% Global values are {TotalCalls, RepoCount}: each MFA seen in the repo
%% adds its call count to TotalCalls and bumps RepoCount by exactly one.
merge_repo_stats(RepoStats, GlobalStats) ->
    Bump = fun(MFA, CallCount, Acc) ->
        case Acc of
            #{MFA := {TotalCalls, RepoCount}} ->
                Acc#{MFA := {TotalCalls + CallCount, RepoCount + 1}};
            _ ->
                Acc#{MFA => {CallCount, 1}}
        end
    end,
    maps:fold(Bump, GlobalStats, RepoStats).
%% ============================================================================
%% State persistence
%% ============================================================================
%% Persist resume state atomically: write to a temp file first, then
%% rename over ?STATE_FILE so a crash mid-write never leaves a truncated
%% state file. The tuple is versioned (beam_stats_v1) so load_state/0 can
%% reject incompatible files.
save_state(Scanned, Stats, TotalProcessed) ->
State = {beam_stats_v1, Scanned, Stats, TotalProcessed},
TmpFile = ?STATE_FILE ++ ".tmp",
ok = file:write_file(TmpFile, term_to_binary(State, [compressed])),
ok = file:rename(TmpFile, ?STATE_FILE).
%% Load resume state saved by save_state/3.
%% Returns {ScannedSet, StatsMap, TotalProcessed}; any missing, unreadable,
%% corrupt, or wrong-version state file falls back to a fresh empty state
%% (with a message explaining why).
%% The original used the old-style `catch binary_to_term(Bin)` expression,
%% which conflates all exception classes and loses the stacktrace; this
%% version catches only the error:badarg that a corrupt term raises.
load_state() ->
    case file:read_file(?STATE_FILE) of
        {ok, Bin} ->
            Decoded =
                try binary_to_term(Bin)
                catch error:badarg -> corrupt
                end,
            case Decoded of
                {beam_stats_v1, Scanned, Stats, TotalProcessed} ->
                    io:format("Resumed state: ~p items already scanned~n", [TotalProcessed]),
                    {Scanned, Stats, TotalProcessed};
                _ ->
                    io:format("Warning: Invalid state file, starting fresh~n"),
                    fresh_state()
            end;
        {error, enoent} ->
            io:format("No state file found, starting fresh~n"),
            fresh_state();
        {error, Reason} ->
            io:format("Warning: Could not read state file (~p), starting fresh~n", [Reason]),
            fresh_state()
    end.

%% Empty resume state: no scanned keys, no stats, zero processed.
fresh_state() ->
    {sets:new([{version, 2}]), #{}, 0}.
%% ============================================================================
%% CSV output
%% ============================================================================
%% Write the full stats map to File as CSV (sorted by repo count, see
%% sort_stats/1), one row per MFA plus a header. Module and function are
%% atoms printed with ~ts; counts with ~p. Returns file:close/1's result.
write_csv(File, Stats) ->
Sorted = sort_stats(Stats),
{ok, Fd} = file:open(File, [write, {encoding, utf8}]),
io:format(Fd, "module,function,arity,total_calls,repo_count~n", []),
lists:foreach(fun({{Mod, Fun, Arity}, {TotalCalls, RepoCount}}) ->
io:format(Fd, "~ts,~ts,~p,~p,~p~n", [Mod, Fun, Arity, TotalCalls, RepoCount])
end, Sorted),
file:close(Fd).
%% ============================================================================
%% Terminal summary
%% ============================================================================
%% Print a ranked table of the Top most-used MFAs to the terminal,
%% ordered like the CSV (by repo count, via sort_stats/1).
print_summary(Top, Stats) ->
Sorted = sort_stats(Stats),
TopN = lists:sublist(Sorted, Top),
io:format("~n"),
io:format("~s~n", [string:copies("=", 78)]),
%% min/2 handles the case of fewer results than requested.
io:format(" Top ~p Most Used Erlang/OTP Functions~n", [min(Top, length(TopN))]),
io:format("~s~n", [string:copies("=", 78)]),
io:format("~4s ~-40s ~10s ~10s~n", ["#", "Module:Function/Arity", "Repos", "Calls"]),
io:format("~s~n", [string:copies("-", 78)]),
%% foldl threads the 1-based rank index through the rows.
lists:foldl(fun({{Mod, Fun, Arity}, {TotalCalls, RepoCount}}, Idx) ->
MFA = io_lib:format("~ts:~ts/~p", [Mod, Fun, Arity]),
io:format("~4p ~-40ts ~10p ~10p~n", [Idx, lists:flatten(MFA), RepoCount, TotalCalls]),
Idx + 1
end, 1, TopN),
io:format("~s~n", [string:copies("=", 78)]),
io:format("Total unique MFAs: ~p~n", [maps:size(Stats)]).
%% Sort {MFA, {TotalCalls, RepoCount}} entries by repo count, descending,
%% with total calls (descending) as a deterministic tie-break.
%% The original comparator used strict `RC1 > RC2`, which violates the
%% lists:sort/2 contract (the ordering fun must return true when A =< B,
%% i.e. be reflexive) and left the order of equal-count entries
%% nondeterministic across runs.
sort_stats(Stats) ->
    Compare = fun({_, {TC1, RC1}}, {_, {TC2, RC2}}) ->
        {RC1, TC1} >= {RC2, TC2}
    end,
    lists:sort(Compare, maps:to_list(Stats)).
%% ============================================================================
%% HTTP helpers
%% ============================================================================
%% HTTP GET with a 30s request / 10s connect timeout.
%% Returns {ok, BodyBinary} on 200, {error, {http_status, Code}} on any
%% other status, or {error, Reason} on transport failure.
%% NOTE(review): {verify, verify_none} disables TLS certificate
%% verification — acceptable for pulling public stats, but confirm this
%% is intentional before reusing the helper elsewhere.
http_get(Url) ->
case httpc:request(get, {Url, [{"user-agent", "beam_stats/1.0"}]},
[{timeout, 30000}, {connect_timeout, 10000},
{ssl, [{verify, verify_none}]}],
[{body_format, binary}]) of
{ok, {{_, 200, _}, _, Body}} ->
{ok, Body};
{ok, {{_, Code, _}, _, _}} ->
{error, {http_status, Code}};
{error, Reason} ->
{error, Reason}
end.
%% Alias for http_get/1 (the body is already binary via body_format);
%% kept as a separate name to document intent at tarball-download sites.
http_get_binary(Url) ->
http_get(Url).
%% ============================================================================
%% Temp dir / cleanup helpers
%% ============================================================================
%% Create (and return the path of) a unique scratch directory under the
%% user cache dir, e.g. ~/.cache/beam_stats/gh_123 on Linux.
%% erlang:unique_integer/1 guarantees per-VM uniqueness of the suffix.
make_temp_dir(Prefix) ->
    Suffix = integer_to_list(erlang:unique_integer([positive])),
    Dir = filename:join(
        filename:basedir(user_cache, "beam_stats"),
        Prefix ++ Suffix
    ),
    %% ensure_dir/1 creates all parents of its argument, so joining a
    %% dummy leaf makes it create Dir itself.
    ok = filelib:ensure_dir(filename:join(Dir, "dummy")),
    Dir.
%% Recursively delete Dir; a missing directory is not an error.
%% The original shelled out to `rm -rf '...'` via os:cmd, which spawns a
%% shell per cleanup and is quote-injectable should the path ever contain
%% a single quote. file:del_dir_r/1 (OTP 23+; this file already relies on
%% OTP 27 features such as the json module) does the same job in-VM.
%% All callers invoke this in `after` clauses and ignore the result, so
%% changing the return from os:cmd's output string to ok | {error, Reason}
%% is safe.
rm_rf(Dir) ->
    case file:del_dir_r(Dir) of
        ok -> ok;
        {error, enoent} -> ok;
        {error, _} = Error -> Error
    end.
#!/usr/bin/env escript
%% -*- erlang -*-
-mode(compile).
%% Entry point of the filter script: beam_stats_filter.escript CSV MIN.
%% Validates that MIN is a positive integer, then runs the report;
%% exits 1 with usage on any argument error.
main([CsvFile, MinReposStr]) ->
case catch list_to_integer(MinReposStr) of
MinRepos when is_integer(MinRepos), MinRepos > 0 ->
run(CsvFile, MinRepos);
_ ->
io:format(standard_error, "Error: invalid min_repos value: ~s~n", [MinReposStr]),
usage(),
halt(1)
end;
main(_) ->
usage(),
halt(1).
%% Print CLI usage for the filter script to standard output.
usage() ->
io:format(
"Usage: beam_stats_filter.escript <csv_file> <min_repos>~n"
"~n"
"Extracts functions by OTP module used by at least <min_repos> repos.~n"
"Only includes modules that exist in the running OTP installation.~n"
).
%% Read the beam_stats CSV, keep rows with repo_count >= MinRepos whose
%% module ships with the local OTP installation, group them by module, and
%% print a per-module report (functions sorted by repo count, descending).
run(CsvFile, MinRepos) ->
{ok, Bin} = file:read_file(CsvFile),
%% Drop the "module,function,arity,..." header row.
[_Header | DataLines] = string:split(binary_to_list(Bin), "\n", all),
Rows = parse_rows(DataLines),
%% Filter: repo_count >= MinRepos and module is an OTP module
Filtered = lists:filter(fun({Mod, _Fun, _Arity, _Calls, RepoCount}) ->
RepoCount >= MinRepos andalso is_otp_module(Mod)
end, Rows),
%% Group by module
ByModule = lists:foldl(fun({Mod, Fun, Arity, Calls, RC}, Acc) ->
maps:update_with(Mod, fun(L) -> [{Fun, Arity, Calls, RC} | L] end,
[{Fun, Arity, Calls, RC}], Acc)
end, #{}, Filtered),
%% Sort modules alphabetically
Modules = lists:sort(maps:to_list(ByModule)),
%% Print
TotalFuns = lists:sum([length(Funs) || {_, Funs} <- Modules]),
io:format("OTP functions used by >= ~p repos: ~p functions across ~p modules~n~n",
[MinRepos, TotalFuns, length(Modules)]),
lists:foreach(fun({Mod, Funs}) ->
%% Sort functions by repo_count descending
Sorted = lists:sort(fun({_, _, _, RC1}, {_, _, _, RC2}) -> RC1 > RC2 end, Funs),
io:format("~ts (~p functions):~n", [Mod, length(Sorted)]),
lists:foreach(fun({Fun, Arity, _Calls, RC}) ->
io:format(" ~ts/~p (~p repos)~n", [Fun, Arity, RC])
end, Sorted),
io:format("~n")
end, Modules).
%% Parse CSV data lines into {Module, Function, Arity, Calls, RepoCount}
%% tuples (module/function as strings, the rest as integers).
%% Blank lines and rows without exactly five comma-separated fields are
%% skipped. Rows whose numeric fields fail to parse are now also skipped:
%% the original let list_to_integer/1 raise badarg, so one malformed line
%% aborted the whole run.
%% NOTE(review): fields are split on every comma, so a module or function
%% name containing a comma would be dropped as malformed — confirm the
%% generating script cannot produce such names.
parse_rows(Lines) ->
    lists:filtermap(fun parse_row/1, Lines).

%% Parse one line: {true, Row} | false.
parse_row(Line) ->
    case string:trim(Line) of
        "" ->
            false;
        Trimmed ->
            case string:split(Trimmed, ",", all) of
                [ModStr, FunStr, ArityStr, CallsStr, RCStr] ->
                    try
                        {true, {ModStr, FunStr,
                                list_to_integer(ArityStr),
                                list_to_integer(CallsStr),
                                list_to_integer(RCStr)}}
                    catch
                        error:badarg -> false
                    end;
                _ ->
                    false
            end
    end.
%% True when ModStr names a module available in the running OTP
%% installation: preloaded, cover-compiled, or resolved to a .beam path
%% under code:root_dir(). Uses list_to_existing_atom/1 so arbitrary CSV
%% strings never grow the atom table; badarg there means the module was
%% never referenced by this VM, hence not an OTP module.
is_otp_module(ModStr) ->
    try list_to_existing_atom(ModStr) of
        Mod ->
            case code:which(Mod) of
                non_existing -> false;
                preloaded -> true;
                cover_compiled -> true;
                BeamPath when is_list(BeamPath) ->
                    %% Must live under the OTP installation root.
                    lists:prefix(code:root_dir(), BeamPath)
            end
    catch
        error:badarg ->
            false
    end.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment