Last active
February 27, 2026 06:41
-
-
Save pguyot/da327972f1ecdb7041c97addd4e76bb5 to your computer and use it in GitHub Desktop.
Script to scan GitHub and Hex packages for statistics on function usage
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env escript | |
| %% -*- erlang -*- | |
| %%! +A 4 | |
| -mode(compile). | |
| -include_lib("kernel/include/file.hrl"). | |
| %% ============================================================================ | |
| %% beam_stats.escript — Scan Erlang/OTP Function Usage | |
| %% | |
| %% Scans top GitHub Erlang repos and Hex packages to find the most commonly | |
| %% called Module:Function/Arity patterns across the ecosystem. | |
| %% ============================================================================ | |
| -define(STATE_FILE, "beam_stats_state.bin"). | |
| -define(SAVE_INTERVAL, 10). | |
| -define(GITHUB_PER_PAGE, 100). | |
| -define(GITHUB_MAX_PER_QUERY, 1000). %% GitHub search API hard limit | |
| -define(HEX_PER_PAGE, 100). | |
| -define(HEX_MAX_PAGES, 100). %% up to 10000 hex packages | |
%% Entry point with two modes:
%%  * "--scan Dir ResultFile": child-VM mode, used by scan_directory/2 so
%%    every repo is parsed in a throwaway VM (fresh atom table). Writes the
%%    MFA-count map to ResultFile and halts.
%%  * anything else: normal CLI mode — parse options and run the full scan.
main(["--scan", Dir, ResultFile]) ->
    %% Child VM mode: scan directory, write results to file, exit.
    %% Each invocation gets its own atom table.
    %% compiler/syntax_tools are needed by epp_dodger/erl_syntax.
    application:ensure_started(compiler),
    application:ensure_started(syntax_tools),
    Stats = do_scan_directory(Dir),
    file:write_file(ResultFile, term_to_binary(Stats, [compressed])),
    halt(0);
main(Args) ->
    case parse_args(Args) of
        {error, Msg} ->
            io:format(standard_error, "Error: ~s~n", [Msg]),
            usage(),
            halt(1);
        Opts ->
            run(Opts)
    end.
| %% ============================================================================ | |
| %% Arg parsing | |
| %% ============================================================================ | |
%% Public arg-parsing entry: fold the raw argv list over the defaults.
%% Returns an options map, or {error, Msg} on a bad flag/value.
parse_args(Args) ->
    parse_args(Args, default_opts()).

%% Baseline options; each CLI flag overrides exactly one key.
default_opts() ->
    #{
        workers => 4,               %% parallel scan workers
        github => true,             %% include GitHub search results
        hex => true,                %% include hex.pm packages
        output => "beam_stats.csv", %% CSV output path
        top => 100,                 %% rows shown in terminal summary
        limit => infinity,          %% max work items overall
        resume => false             %% reload saved state before starting
    }.
%% Recursive option parser. One clause per flag; returns the completed
%% options map, or {error, Msg} for a bad value / unknown flag.
%% "--help" prints usage and halts the VM directly.
parse_args([], Opts) ->
    Opts;
parse_args(["--workers", N | Rest], Opts) ->
    case parse_pos_int(N) of
        {ok, V} -> parse_args(Rest, Opts#{workers => V});
        error -> {error, "Invalid --workers value: " ++ N}
    end;
parse_args(["--github-only" | Rest], Opts) ->
    parse_args(Rest, Opts#{hex => false});
parse_args(["--hex-only" | Rest], Opts) ->
    parse_args(Rest, Opts#{github => false});
parse_args(["--output", File | Rest], Opts) ->
    parse_args(Rest, Opts#{output => File});
parse_args(["--top", N | Rest], Opts) ->
    case parse_pos_int(N) of
        {ok, V} -> parse_args(Rest, Opts#{top => V});
        error -> {error, "Invalid --top value: " ++ N}
    end;
parse_args(["--limit", N | Rest], Opts) ->
    case parse_pos_int(N) of
        {ok, V} -> parse_args(Rest, Opts#{limit => V});
        error -> {error, "Invalid --limit value: " ++ N}
    end;
parse_args(["--resume" | Rest], Opts) ->
    parse_args(Rest, Opts#{resume => true});
parse_args(["--help" | _], _Opts) ->
    usage(),
    halt(0);
parse_args([Unknown | _], _Opts) ->
    {error, "Unknown option: " ++ Unknown}.

%% Parse a strictly positive integer: {ok, Int} | error.
%% Replaces the old-style `catch list_to_integer(N)` (which conflates
%% throws/errors/exits and loses the stacktrace) with a scoped try that
%% only catches the badarg a non-numeric string raises.
parse_pos_int(Str) ->
    try list_to_integer(Str) of
        V when V > 0 -> {ok, V};
        _ -> error
    catch
        error:badarg -> error
    end.
%% Print CLI help to stdout.
usage() ->
    io:format(
        "Usage: beam_stats.escript [OPTIONS]~n"
        "~n"
        "Options:~n"
        " --workers N Number of parallel workers (default: 4)~n"
        " --github-only Only scan GitHub repos~n"
        " --hex-only Only scan Hex packages~n"
        " --output FILE Output CSV file (default: beam_stats.csv)~n"
        " --top N Show top N results in terminal (default: 100)~n"
        " --limit N Max number of repos/packages to scan~n"
        " --resume Resume from saved state~n"
        " --help Show this help~n"
    ).
| %% ============================================================================ | |
| %% Main run | |
| %% ============================================================================ | |
%% Top-level orchestration: start HTTP apps, load (or init) resume state,
%% build the deduplicated work list, fan it out to the worker pool, then
%% write the CSV and print the terminal summary.
run(Opts) ->
    start_applications(),
    io:format("beam_stats: Scanning Erlang/OTP function usage~n"),
    io:format("Workers: ~p~n~n", [maps:get(workers, Opts)]),
    %% Load or initialize state
    {Scanned, Stats, TotalProcessed} = case maps:get(resume, Opts) of
        true -> load_state();
        false -> {sets:new([{version, 2}]), #{}, 0}
    end,
    %% Fetch work items
    Limit = maps:get(limit, Opts),
    GithubRepos = case maps:get(github, Opts) of
        true -> fetch_github_repos(Limit);
        false -> []
    end,
    %% A finite --limit is a global budget: hex only gets what GitHub left.
    HexLeft = case Limit of
        infinity -> infinity;
        _ -> max(0, Limit - length(GithubRepos))
    end,
    HexPackages = case maps:get(hex, Opts) of
        true -> fetch_hex_packages(HexLeft);
        false -> []
    end,
    %% Deduplicate (drop hex packages whose repo is already in the GitHub list)
    {Repos, Packages} = deduplicate(GithubRepos, HexPackages),
    io:format("~nWork items: ~p GitHub repos, ~p Hex packages~n",
        [length(Repos), length(Packages)]),
    %% Build work list: [{Type, Item}, ...]
    Work0 = [{github, R} || R <- Repos] ++ [{hex, P} || P <- Packages],
    %% Filter items already scanned in a previous (resumed) run
    Work = lists:filter(fun({Type, Item}) ->
        Key = work_key(Type, Item),
        not sets:is_element(Key, Scanned)
    end, Work0),
    io:format("Items to scan: ~p (skipping ~p already scanned)~n~n",
        [length(Work), length(Work0) - length(Work)]),
    %% Run coordinator (blocks until all workers have drained the queue)
    FinalStats = run_coordinator(Work, Scanned, Stats, TotalProcessed, Opts),
    %% Output
    OutputFile = maps:get(output, Opts),
    write_csv(OutputFile, FinalStats),
    Top = maps:get(top, Opts),
    print_summary(Top, FinalStats),
    io:format("~nResults written to ~s~n", [OutputFile]).
| %% ============================================================================ | |
| %% Applications | |
| %% ============================================================================ | |
%% Bring up the HTTP client stack (inets + ssl). Start failures are
%% deliberately ignored here — if the stack is unusable, the first httpc
%% request will surface the problem where it can be reported per-item.
start_applications() ->
    _ = application:ensure_all_started(inets),
    _ = application:ensure_all_started(ssl),
    %% Configure httpc: a few keep-alive sessions make the repeated
    %% hex.pm requests cheap.
    httpc:set_options([{max_sessions, 8}, {max_keep_alive_length, 16}]),
    ok.
| %% ============================================================================ | |
| %% GitHub repo fetching via gh CLI | |
| %% ============================================================================ | |
%% Fetch the most-starred Erlang repos from GitHub via the `gh` CLI.
%% Returns a list of repo maps (full_name/clone_url/html_url/stars).
fetch_github_repos(Limit) ->
    io:format("Fetching GitHub repos...~n"),
    %% Cursor-based pagination using star ranges. Fetch the top 1000 by stars,
    %% then use the lowest star count as the upper bound for the next batch.
    %% This bypasses GitHub's 1000-result-per-query limit with minimal API calls.
    Max = case Limit of
        infinity -> ?GITHUB_MAX_PER_QUERY * 15; %% ~15000 repos max
        _ -> Limit
    end,
    Repos = fetch_github_cursor(infinity, [], Max),
    io:format(" Total: ~p GitHub repos~n", [length(Repos)]),
    Repos.
%% Star-range "cursor" loop: each iteration queries stars in 1..HiStars
%% (or >=1 on the first pass), then lowers HiStars below the smallest star
%% count seen. Stops at Max repos, at 1 star, or when no progress is made
%% (a whole batch sharing one star count would otherwise loop forever).
fetch_github_cursor(_HiStars, Acc, Max) when length(Acc) >= Max ->
    lists:sublist(Acc, Max);
fetch_github_cursor(HiStars, Acc, Max) ->
    Range = case HiStars of
        infinity -> ">=1";
        N -> lists:flatten(io_lib:format("1..~p", [N]))
    end,
    Remaining = Max - length(Acc),
    Fetch = min(Remaining, ?GITHUB_MAX_PER_QUERY),
    io:format(" stars:~s ...", [Range]),
    {Repos, TotalCount} = fetch_github_query(Range, Fetch),
    io:format(" ~p repos (of ~p available)~n", [length(Repos), TotalCount]),
    case Repos of
        [] ->
            Acc;
        _ ->
            NewAcc = Acc ++ Repos,
            %% Next window ends just below the least-starred repo we got.
            MinStars = lists:min([maps:get(stars, R) || R <- Repos]),
            NextHi = MinStars - 1,
            case length(NewAcc) >= Max of
                true ->
                    lists:sublist(NewAcc, Max);
                false when NextHi < 1 ->
                    %% Can't go lower than 1 star
                    NewAcc;
                false when HiStars =/= infinity, NextHi >= HiStars ->
                    %% No progress (all repos have same star count), stop
                    NewAcc;
                false ->
                    fetch_github_cursor(NextHi, NewAcc, Max)
            end
    end.
%% Run a single GitHub search (one star range), paging until Max repos,
%% capped at the API's 1000-results-per-query hard limit.
%% Returns {Repos, TotalCount}.
fetch_github_query(StarRange, Max) ->
    SearchQuery = lists:flatten(["language:Erlang stars:", StarRange]),
    PageBudget = min(Max, ?GITHUB_MAX_PER_QUERY),
    fetch_github_pages(SearchQuery, 1, [], PageBudget, 0).
%% Page through one search query via `gh api`, newest page appended
%% (reversed) onto Acc so a final reverse restores API order.
%% Stops at Max results, at the API's page ceiling, on empty gh output,
%% or on a JSON decode failure. TC carries the last seen total_count.
fetch_github_pages(_Query, _Page, Acc, Max, TC) when length(Acc) >= Max ->
    {lists:sublist(lists:reverse(Acc), Max), TC};
fetch_github_pages(_Query, Page, Acc, _Max, TC) when Page > (?GITHUB_MAX_PER_QUERY div ?GITHUB_PER_PAGE) ->
    {lists:reverse(Acc), TC};
fetch_github_pages(Query, Page, Acc, Max, TC) ->
    %% Shell out to the gh CLI; stderr is discarded so auth/rate-limit
    %% noise doesn't pollute the JSON we parse.
    Cmd = io_lib:format(
        "gh api search/repositories "
        "-X GET "
        "-f q='~s' "
        "-f sort='stars' "
        "-f order='desc' "
        "-f per_page='~p' "
        "-f page='~p' "
        "2>/dev/null",
        [Query, ?GITHUB_PER_PAGE, Page]
    ),
    case os:cmd(lists:flatten(Cmd)) of
        [] ->
            %% gh produced nothing (not installed / not authed) — stop here.
            {lists:reverse(Acc), TC};
        Output ->
            case catch json:decode(unicode:characters_to_binary(Output)) of
                #{<<"total_count">> := NewTC, <<"items">> := Items} when is_list(Items), length(Items) > 0 ->
                    %% Keep only the fields the rest of the pipeline uses.
                    Repos = lists:map(fun(Item) ->
                        #{
                            full_name => binary_to_list(maps:get(<<"full_name">>, Item)),
                            clone_url => binary_to_list(maps:get(<<"clone_url">>, Item)),
                            html_url => binary_to_list(maps:get(<<"html_url">>, Item)),
                            stars => maps:get(<<"stargazers_count">>, Item, 0)
                        }
                    end, Items),
                    fetch_github_pages(Query, Page + 1, lists:reverse(Repos) ++ Acc, Max, NewTC);
                _ ->
                    %% Empty page, decode error, or API error payload — stop.
                    {lists:reverse(Acc), TC}
            end
    end.
| %% ============================================================================ | |
| %% Hex package fetching via httpc | |
| %% ============================================================================ | |
%% Fetch hex.pm package metadata, most-downloaded first, up to Limit
%% packages (or the configured page cap when Limit is infinity).
fetch_hex_packages(Limit) ->
    Cap = ?HEX_MAX_PAGES * ?HEX_PER_PAGE,
    Max =
        case Limit of
            infinity -> Cap;
            _ -> min(Limit, Cap)
        end,
    io:format("Fetching Hex packages (up to ~p)...~n", [Max]),
    fetch_hex_pages(1, [], Max).
%% Page through the hex.pm packages API. Each kept package is a map with
%% name/version/github_url/downloads; packages without any release are
%% dropped. Stops at the page cap, at Max packages, on an empty page, or
%% on any HTTP/JSON error (returning what was collected so far).
fetch_hex_pages(Page, Acc, Max) when Page > ?HEX_MAX_PAGES; length(Acc) >= Max ->
    Packages = lists:sublist(lists:reverse(Acc), Max),
    io:format(" Found ~p Hex packages~n", [length(Packages)]),
    Packages;
fetch_hex_pages(Page, Acc, Max) ->
    Url = io_lib:format(
        "https://hex.pm/api/packages?sort=total_downloads&per_page=~p&page=~p",
        [?HEX_PER_PAGE, Page]
    ),
    case http_get(lists:flatten(Url)) of
        {ok, Body} ->
            case catch json:decode(Body) of
                Items when is_list(Items), length(Items) > 0 ->
                    Packages = lists:filtermap(fun(Item) ->
                        Name = binary_to_list(maps:get(<<"name">>, Item, <<>>)),
                        Meta = maps:get(<<"meta">>, Item, #{}),
                        Links = maps:get(<<"links">>, Meta, #{}),
                        GithubUrl = find_github_link(Links),
                        %% First release in the list is taken as "latest"
                        %% — assumes hex.pm orders releases newest-first;
                        %% TODO confirm against the API docs.
                        Releases = maps:get(<<"releases">>, Item, []),
                        LatestVersion = case Releases of
                            [#{<<"version">> := V} | _] -> binary_to_list(V);
                            _ -> ""
                        end,
                        Downloads = maps:get(<<"downloads">>, Item, #{}),
                        TotalDownloads = case Downloads of
                            #{<<"all">> := D} -> D;
                            _ -> 0
                        end,
                        case LatestVersion of
                            "" -> false;
                            _ ->
                                {true, #{
                                    name => Name,
                                    version => LatestVersion,
                                    github_url => GithubUrl,
                                    downloads => TotalDownloads
                                }}
                        end
                    end, Items),
                    io:format(" Page ~p: ~p packages~n", [Page, length(Packages)]),
                    fetch_hex_pages(Page + 1, lists:reverse(Packages) ++ Acc, Max);
                Items when is_list(Items), Items =:= [] ->
                    io:format(" Page ~p: no more results~n", [Page]),
                    lists:reverse(Acc);
                Error ->
                    io:format(" Page ~p: JSON decode error: ~p~n", [Page, Error]),
                    lists:reverse(Acc)
            end;
        {error, Reason} ->
            io:format(" Page ~p: HTTP error: ~p~n", [Page, Reason]),
            lists:reverse(Acc)
    end.
%% Pick a GitHub URL out of a hex package's links map; "" if none.
%% The links are visited in sorted-key order so the choice is
%% deterministic — the previous maps:fold walked the map in its
%% undefined iteration order, giving a nondeterministic result when
%% several links pointed at github.com.
find_github_link(Links) when is_map(Links) ->
    first_github_url([maps:get(K, Links) || K <- lists:sort(maps:keys(Links))]);
find_github_link(_) ->
    "".

%% First value (a binary URL) containing "github.com", or "".
first_github_url([]) ->
    "";
first_github_url([Value | Rest]) ->
    Url = binary_to_list(Value),
    case string:find(Url, "github.com") of
        nomatch -> first_github_url(Rest);
        _ -> Url
    end.
| %% ============================================================================ | |
| %% Deduplication | |
| %% ============================================================================ | |
%% Remove hex packages whose source repository is already in the GitHub
%% work list, comparing normalized GitHub URLs. Packages without a GitHub
%% link are always kept. Returns {GithubRepos, FilteredHexPackages}.
deduplicate(GithubRepos, HexPackages) ->
    Known = sets:from_list(
        [normalize_github_url(maps:get(html_url, Repo)) || Repo <- GithubRepos],
        [{version, 2}]),
    Keep = fun(Pkg) ->
        case maps:get(github_url, Pkg) of
            "" -> true;
            Url -> not sets:is_element(normalize_github_url(Url), Known)
        end
    end,
    {GithubRepos, [Pkg || Pkg <- HexPackages, Keep(Pkg)]}.
%% Canonical form of a GitHub URL for dedup comparisons:
%% scheme stripped, trailing ".git" and "/" removed, lowercased.
normalize_github_url(Url) ->
    WithoutScheme = re:replace(Url, "^https?://", "", [{return, list}]),
    WithoutGitSuffix = re:replace(WithoutScheme, "\\.git$", "", [{return, list}]),
    string:lowercase(string:trim(WithoutGitSuffix, trailing, "/")).
| %% ============================================================================ | |
| %% Coordinator + Worker Pool | |
| %% ============================================================================ | |
%% Spawn one coordinator process holding all mutable state plus NumWorkers
%% worker processes that pull items from it. Blocks until the coordinator
%% reports completion and returns the final stats map.
run_coordinator(Work, Scanned, Stats, TotalProcessed, Opts) ->
    NumWorkers = maps:get(workers, Opts),
    %% total_work counts previously-processed items too, so the progress
    %% percentage is meaningful on a resumed run.
    TotalWork = length(Work) + TotalProcessed,
    Self = self(),
    CoordPid = spawn_link(fun() ->
        coordinator_loop(#{
            work => Work,
            scanned => Scanned,
            stats => Stats,
            total_processed => TotalProcessed,
            total_work => TotalWork,
            since_save => 0,
            active_workers => NumWorkers,
            parent => Self
        })
    end),
    %% Spawn workers (linked: a worker crash takes the whole run down)
    lists:foreach(fun(_) ->
        spawn_link(fun() -> worker_loop(CoordPid) end)
    end, lists:seq(1, NumWorkers)),
    %% Wait for coordinator to finish
    receive
        {coordinator_done, FinalStats} -> FinalStats
    end.
%% Coordinator process loop. Serializes all state mutation; handles three
%% messages: {get_work, Pid} (hand out the next item or no_more_work),
%% {result, Key, Stats} (merge a finished item, maybe checkpoint), and
%% {worker_done, Pid} (count down; on zero, save and notify the parent).
coordinator_loop(State) ->
    receive
        {get_work, WorkerPid} ->
            case maps:get(work, State) of
                [] ->
                    WorkerPid ! no_more_work,
                    coordinator_loop(State);
                [Item | Rest] ->
                    WorkerPid ! {work, Item},
                    coordinator_loop(State#{work => Rest})
            end;
        {result, Key, RepoStats} ->
            #{
                scanned := Scanned,
                stats := Stats,
                total_processed := TP,
                total_work := TW,
                since_save := SS
            } = State,
            NewScanned = sets:add_element(Key, Scanned),
            NewStats = merge_repo_stats(RepoStats, Stats),
            NewTP = TP + 1,
            NewSS = SS + 1,
            %% "\r" rewrites the same terminal line for a live progress bar.
            io:format("\r Progress: ~p/~p (~.1f%) ",
                [NewTP, TW, NewTP / max(1, TW) * 100]),
            %% Save state periodically (every ?SAVE_INTERVAL results)
            NewSS2 = case NewSS >= ?SAVE_INTERVAL of
                true ->
                    save_state(NewScanned, NewStats, NewTP),
                    0;
                false ->
                    NewSS
            end,
            coordinator_loop(State#{
                scanned => NewScanned,
                stats => NewStats,
                total_processed => NewTP,
                since_save => NewSS2
            });
        {worker_done, _WorkerPid} ->
            #{
                active_workers := AW,
                stats := Stats,
                scanned := Scanned,
                total_processed := TP,
                parent := Parent
            } = State,
            NewAW = AW - 1,
            case NewAW of
                0 ->
                    %% All workers done — save final state and report
                    save_state(Scanned, Stats, TP),
                    io:format("~n"),
                    Parent ! {coordinator_done, Stats};
                _ ->
                    coordinator_loop(State#{active_workers => NewAW})
            end
    end.
%% Worker process loop: request an item, process it, report the result,
%% repeat; announce worker_done and exit when the queue is drained.
%% Any crash while processing one item is caught and reported as an empty
%% stats map so a single bad repo can't kill the whole run.
worker_loop(CoordPid) ->
    CoordPid ! {get_work, self()},
    receive
        {work, {Type, Item}} ->
            Key = work_key(Type, Item),
            RepoStats = try
                case Type of
                    github -> process_github_repo(Item);
                    hex -> process_hex_package(Item)
                end
            catch
                _:Reason ->
                    io:format("~n Error processing ~s: ~p~n", [Key, Reason]),
                    #{}
            end,
            CoordPid ! {result, Key, RepoStats},
            worker_loop(CoordPid);
        no_more_work ->
            CoordPid ! {worker_done, self()},
            ok
    end.
%% Stable string identifier ("<type>:<name>") used for resume bookkeeping
%% in the scanned-set and for error reporting.
work_key(github, Repo) ->
    lists:append("github:", maps:get(full_name, Repo));
work_key(hex, Package) ->
    lists:append("hex:", maps:get(name, Package)).
| %% ============================================================================ | |
| %% Process GitHub repo | |
| %% ============================================================================ | |
%% Shallow-clone one GitHub repo into a scratch dir, scan its .erl files,
%% and always remove the checkout afterwards. Returns an MFA -> count map
%% (empty if the clone failed).
process_github_repo(Repo) ->
    CloneUrl = maps:get(clone_url, Repo),
    FullName = maps:get(full_name, Repo),
    TmpDir = make_temp_dir("gh_"),
    try
        %% Shallow clone; stderr discarded so auth prompts/errors stay quiet.
        Cmd = io_lib:format(
            "git clone --depth 1 --quiet '~s' '~s' 2>/dev/null",
            [CloneUrl, TmpDir]
        ),
        %% git's stdout is irrelevant: success is judged by the target
        %% directory existing afterwards. (The previous single-branch
        %% `case os:cmd(...) of _ -> ...` matched everything and was dead
        %% weight — replaced with a plain discarded call.)
        _ = os:cmd(lists:flatten(Cmd)),
        case filelib:is_dir(TmpDir) of
            true ->
                scan_directory(TmpDir, FullName);
            false ->
                #{}
        end
    after
        rm_rf(TmpDir)
    end.
| %% ============================================================================ | |
| %% Process Hex package | |
| %% ============================================================================ | |
%% Download one package tarball from repo.hex.pm and scan its contents.
%% Returns an MFA -> count map; download failures are logged and yield #{}.
process_hex_package(Package) ->
    PkgName = maps:get(name, Package),
    PkgVsn = maps:get(version, Package),
    TarballUrl = lists:flatten(
        io_lib:format("https://repo.hex.pm/tarballs/~s-~s.tar", [PkgName, PkgVsn])),
    case http_get_binary(TarballUrl) of
        {ok, TarBin} ->
            process_hex_tarball(TarBin, PkgName);
        {error, Reason} ->
            io:format("~n Failed to download ~s-~s: ~p~n", [PkgName, PkgVsn, Reason]),
            #{}
    end.
%% A hex tarball is an outer (uncompressed) tar whose "contents.tar.gz"
%% entry holds the actual source tree. Extract the outer tar in memory,
%% peek at the inner archive's file table, and only unpack to disk when
%% it actually contains .erl files. Returns an MFA -> count map.
process_hex_tarball(TarBin, Name) ->
    %% Extract outer tar from binary
    case erl_tar:extract({binary, TarBin}, [memory]) of
        {ok, OuterFiles} ->
            %% Find contents.tar.gz
            case lists:keyfind("contents.tar.gz", 1, OuterFiles) of
                {"contents.tar.gz", ContentsTarGz} ->
                    %% Check if there are .erl files before extracting
                    %% (skips the disk round-trip for Elixir-only packages).
                    case erl_tar:table({binary, ContentsTarGz}, [compressed]) of
                        {ok, FileList} ->
                            HasErl = lists:any(fun(F) ->
                                filename:extension(F) =:= ".erl"
                            end, FileList),
                            case HasErl of
                                true ->
                                    extract_and_scan_hex(ContentsTarGz, Name);
                                false ->
                                    #{}
                            end;
                        _ ->
                            %% Unreadable inner archive — skip quietly.
                            #{}
                    end;
                false ->
                    %% No contents entry — not a normal hex tarball.
                    #{}
            end;
        {error, Reason} ->
            io:format("~n Failed to extract ~s tarball: ~p~n", [Name, Reason]),
            #{}
    end.
%% Unpack a package's contents.tar.gz into a fresh scratch dir, scan it,
%% and always clean the scratch dir up afterwards.
extract_and_scan_hex(ContentsTarGz, Name) ->
    TmpDir = make_temp_dir("hex_"),
    try
        Result = erl_tar:extract({binary, ContentsTarGz}, [{cwd, TmpDir}, compressed]),
        case Result of
            ok ->
                scan_directory(TmpDir, "hex:" ++ Name);
            {error, Reason} ->
                io:format("~n Failed to extract ~s contents: ~p~n", [Name, Reason]),
                #{}
        end
    after
        rm_rf(TmpDir)
    end.
| %% ============================================================================ | |
| %% Directory scanning — spawns a separate VM to avoid atom table exhaustion | |
| %% ============================================================================ | |
%% Scan a source tree by re-invoking this same escript in "--scan" mode in
%% a child VM: every repo parses thousands of atoms via epp_dodger, and a
%% throwaway VM per repo keeps this VM's atom table from filling up.
%% Results come back through a temp file inside Dir.
scan_directory(Dir, _Label) ->
    Script = escript:script_name(),
    ResultFile = filename:join(Dir, ".beam_stats_result.bin"),
    Cmd = lists:flatten(io_lib:format(
        "escript '~s' --scan '~s' '~s'",
        [Script, Dir, ResultFile]
    )),
    os:cmd(Cmd),
    case file:read_file(ResultFile) of
        {ok, Bin} ->
            file:delete(ResultFile),
            %% Corrupt/truncated result files are treated as empty stats.
            try binary_to_term(Bin)
            catch _:_ -> #{}
            end;
        {error, _} ->
            %% Child VM crashed or produced nothing.
            #{}
    end.
| %% ============================================================================ | |
| %% Scanning code (runs in child VM via --scan) | |
| %% ============================================================================ | |
%% Child-VM entry point: parse every .erl file under Dir and merge the
%% per-file call counts into a single MFA -> count map. Files that fail
%% to parse are silently skipped.
do_scan_directory(Dir) ->
    MergeOne = fun(File, Acc) ->
        case parse_file(File) of
            {ok, Calls} -> merge_file_calls(Calls, Acc);
            {error, _} -> Acc
        end
    end,
    lists:foldl(MergeOne, #{}, find_erl_files(Dir)).
%% filelib:fold_files follows symlinks and can loop forever on repos
%% with circular symlinks (e.g. _checkouts/dep -> ../../..).
%% This version skips symlinks.
%% Returns the accumulated list of .erl file paths (order unspecified).
find_erl_files(Dir) ->
    find_erl_files(Dir, []).

find_erl_files(Dir, Acc) ->
    case file:list_dir(Dir) of
        {ok, Entries} ->
            lists:foldl(fun(Entry, A) ->
                Path = filename:join(Dir, Entry),
                %% read_link_info (unlike read_file_info) does NOT follow
                %% symlinks, so a link shows up as type=symlink and is
                %% skipped by the catch-all clause below.
                case file:read_link_info(Path) of
                    {ok, #file_info{type = directory}} ->
                        find_erl_files(Path, A);
                    {ok, #file_info{type = regular}} ->
                        case filename:extension(Entry) of
                            ".erl" -> [Path | A];
                            _ -> A
                        end;
                    _ ->
                        %% symlink or other — skip
                        A
                end
            end, Acc, Entries);
        {error, _} ->
            %% Unreadable directory — ignore it.
            Acc
    end.
%% Parse one .erl file with epp_dodger (tolerant of unexpanded macros)
%% and collect its call counts. Any exception raised while parsing or
%% walking the forms is converted to {error, Err}.
parse_file(File) ->
    try
        case epp_dodger:parse_file(File) of
            {ok, Forms} ->
                {ok, lists:foldl(fun extract_calls/2, #{}, Forms)};
            {error, Reason} ->
                {error, Reason}
        end
    catch
        _:Err -> {error, Err}
    end.
%% Walk one parsed form, counting remote calls (M:F(...)) and fun
%% references (fun M:F/A) into the accumulator map of {M,F,A} -> count.
extract_calls(Form, Acc) ->
    erl_syntax_lib:fold(fun(Node, A) ->
        case erl_syntax:type(Node) of
            application ->
                extract_application_call(Node, A);
            implicit_fun ->
                extract_implicit_fun(Node, A);
            _ ->
                A
        end
    end, Acc, Form).
%% Count one function application node.
%%  * Mod:Fun(Args) with literal atoms on both sides -> count {Mod,Fun,A}.
%%  * Bare Fun(Args) is counted only when Fun/A is an auto-imported BIF
%%    (attributed to module `erlang`); local calls are ignored.
%%  * Anything dynamic (variables, expressions) is skipped.
extract_application_call(Node, Acc) ->
    Op = erl_syntax:application_operator(Node),
    Args = erl_syntax:application_arguments(Node),
    Arity = length(Args),
    case erl_syntax:type(Op) of
        module_qualifier ->
            ModNode = erl_syntax:module_qualifier_argument(Op),
            FunNode = erl_syntax:module_qualifier_body(Op),
            case {erl_syntax:type(ModNode), erl_syntax:type(FunNode)} of
                {atom, atom} ->
                    Mod = erl_syntax:atom_value(ModNode),
                    Fun = erl_syntax:atom_value(FunNode),
                    Key = {Mod, Fun, Arity},
                    maps:update_with(Key, fun(V) -> V + 1 end, 1, Acc);
                _ ->
                    Acc
            end;
        atom ->
            Fun = erl_syntax:atom_value(Op),
            case erl_internal:bif(Fun, Arity) of
                true ->
                    Key = {erlang, Fun, Arity},
                    maps:update_with(Key, fun(V) -> V + 1 end, 1, Acc);
                false ->
                    Acc
            end;
        _ ->
            Acc
    end.
%% Count a `fun Mod:Fun/Arity` reference when module, function and arity
%% are all literal; local `fun f/1` and dynamic forms are ignored.
extract_implicit_fun(Node, Acc) ->
    Name = erl_syntax:implicit_fun_name(Node),
    case erl_syntax:type(Name) of
        module_qualifier ->
            ModNode = erl_syntax:module_qualifier_argument(Name),
            Body = erl_syntax:module_qualifier_body(Name),
            case erl_syntax:type(Body) of
                arity_qualifier ->
                    FunNode = erl_syntax:arity_qualifier_body(Body),
                    ArityNode = erl_syntax:arity_qualifier_argument(Body),
                    case {erl_syntax:type(ModNode), erl_syntax:type(FunNode), erl_syntax:type(ArityNode)} of
                        {atom, atom, integer} ->
                            Mod = erl_syntax:atom_value(ModNode),
                            Fun = erl_syntax:atom_value(FunNode),
                            Arity = erl_syntax:integer_value(ArityNode),
                            Key = {Mod, Fun, Arity},
                            maps:update_with(Key, fun(V) -> V + 1 end, 1, Acc);
                        _ ->
                            Acc
                    end;
                _ ->
                    Acc
            end;
        _ ->
            Acc
    end.
%% Add one file's MFA -> count map into the repo accumulator, summing
%% counts for MFAs present in both.
merge_file_calls(FileCalls, RepoAcc) ->
    maps:merge_with(fun(_MFA, A, B) -> A + B end, RepoAcc, FileCalls).
| %% ============================================================================ | |
| %% Stats merging | |
| %% ============================================================================ | |
%% Fold one repo's MFA -> CallCount map into the global map of
%% MFA -> {TotalCalls, RepoCount}: every MFA seen in this repo adds its
%% call count to the running total and bumps the repo counter by one.
merge_repo_stats(RepoStats, GlobalStats) ->
    Bump = fun(Calls) ->
        fun({Total, Repos}) -> {Total + Calls, Repos + 1} end
    end,
    maps:fold(
        fun(MFA, Calls, Global) ->
            maps:update_with(MFA, Bump(Calls), {Calls, 1}, Global)
        end,
        GlobalStats,
        RepoStats).
| %% ============================================================================ | |
| %% State persistence | |
| %% ============================================================================ | |
%% Persist {scanned-set, stats, processed-count} atomically: write to a
%% sibling temp file first, then rename over the real state file so a
%% crash mid-write never leaves a truncated state file behind.
save_state(Scanned, Stats, TotalProcessed) ->
    Payload = term_to_binary({beam_stats_v1, Scanned, Stats, TotalProcessed}, [compressed]),
    TmpFile = ?STATE_FILE ++ ".tmp",
    ok = file:write_file(TmpFile, Payload),
    ok = file:rename(TmpFile, ?STATE_FILE).
%% Load resume state written by save_state/3. Any problem (missing file,
%% unreadable file, unrecognized term) falls back to a fresh empty state
%% with a warning, never an error.
load_state() ->
    case file:read_file(?STATE_FILE) of
        {ok, Bin} ->
            case catch binary_to_term(Bin) of
                {beam_stats_v1, Scanned, Stats, TotalProcessed} ->
                    io:format("Resumed state: ~p items already scanned~n", [TotalProcessed]),
                    {Scanned, Stats, TotalProcessed};
                _ ->
                    io:format("Warning: Invalid state file, starting fresh~n"),
                    {sets:new([{version, 2}]), #{}, 0}
            end;
        {error, enoent} ->
            io:format("No state file found, starting fresh~n"),
            {sets:new([{version, 2}]), #{}, 0};
        {error, Reason} ->
            io:format("Warning: Could not read state file (~p), starting fresh~n", [Reason]),
            {sets:new([{version, 2}]), #{}, 0}
    end.
| %% ============================================================================ | |
| %% CSV output | |
| %% ============================================================================ | |
%% Write the full sorted stats to File as UTF-8 CSV with the header
%% "module,function,arity,total_calls,repo_count".
write_csv(File, Stats) ->
    Rows = sort_stats(Stats),
    {ok, Fd} = file:open(File, [write, {encoding, utf8}]),
    io:format(Fd, "module,function,arity,total_calls,repo_count~n", []),
    EmitRow = fun({{Mod, Fun, Arity}, {TotalCalls, RepoCount}}) ->
        io:format(Fd, "~ts,~ts,~p,~p,~p~n", [Mod, Fun, Arity, TotalCalls, RepoCount])
    end,
    lists:foreach(EmitRow, Rows),
    file:close(Fd).
| %% ============================================================================ | |
| %% Terminal summary | |
| %% ============================================================================ | |
%% Print the top-N table to the terminal: rank, M:F/A, repo count, call
%% count, followed by the total number of unique MFAs.
print_summary(Top, Stats) ->
    Sorted = sort_stats(Stats),
    TopN = lists:sublist(Sorted, Top),
    io:format("~n"),
    io:format("~s~n", [string:copies("=", 78)]),
    io:format(" Top ~p Most Used Erlang/OTP Functions~n", [min(Top, length(TopN))]),
    io:format("~s~n", [string:copies("=", 78)]),
    io:format("~4s ~-40s ~10s ~10s~n", ["#", "Module:Function/Arity", "Repos", "Calls"]),
    io:format("~s~n", [string:copies("-", 78)]),
    %% foldl doubles as an enumerate: the accumulator is the 1-based rank.
    lists:foldl(fun({{Mod, Fun, Arity}, {TotalCalls, RepoCount}}, Idx) ->
        MFA = io_lib:format("~ts:~ts/~p", [Mod, Fun, Arity]),
        io:format("~4p ~-40ts ~10p ~10p~n", [Idx, lists:flatten(MFA), RepoCount, TotalCalls]),
        Idx + 1
    end, 1, TopN),
    io:format("~s~n", [string:copies("=", 78)]),
    io:format("Total unique MFAs: ~p~n", [maps:size(Stats)]).
%% Sort the stats map into a list ordered by repo count (descending),
%% breaking ties by total call count (descending) and finally by MFA.
%% The previous comparator (`RC1 > RC2`) had no tie-break, so entries
%% with equal repo counts came out in undefined map-iteration order and
%% the CSV/summary output was nondeterministic between runs.
sort_stats(Stats) ->
    SortKey = fun({MFA, {TotalCalls, RepoCount}}) ->
        %% Negate the integer fields so an ascending tuple sort yields
        %% descending counts while MFA still sorts ascending.
        {-RepoCount, -TotalCalls, MFA}
    end,
    lists:sort(fun(A, B) -> SortKey(A) =< SortKey(B) end, maps:to_list(Stats)).
| %% ============================================================================ | |
| %% HTTP helpers | |
| %% ============================================================================ | |
%% GET Url with a 30s request / 10s connect timeout.
%% Returns {ok, BodyBinary} only for HTTP 200; any other status becomes
%% {error, {http_status, Code}}.
%% SECURITY NOTE(review): {verify, verify_none} disables TLS certificate
%% verification, so downloads are open to man-in-the-middle tampering.
%% Presumably chosen to avoid CA-store setup in an escript — consider
%% verify_peer with public_key:cacerts_get/0 (OTP 25+) instead.
http_get(Url) ->
    case httpc:request(get, {Url, [{"user-agent", "beam_stats/1.0"}]},
            [{timeout, 30000}, {connect_timeout, 10000},
             {ssl, [{verify, verify_none}]}],
            [{body_format, binary}]) of
        {ok, {{_, 200, _}, _, Body}} ->
            {ok, Body};
        {ok, {{_, Code, _}, _, _}} ->
            {error, {http_status, Code}};
        {error, Reason} ->
            {error, Reason}
    end.
%% Alias of http_get/1, kept as a separate name to signal intent at call
%% sites expecting raw tarball bytes ({body_format, binary} in http_get/1
%% already makes the body a binary).
http_get_binary(Url) ->
    http_get(Url).
| %% ============================================================================ | |
| %% Temp dir / cleanup helpers | |
| %% ============================================================================ | |
%% Create and return a unique scratch directory under the per-user cache
%% dir (e.g. ~/.cache/beam_stats/gh_123). erlang:unique_integer/1 makes
%% the name unique within this VM.
make_temp_dir(Prefix) ->
    Unique = integer_to_list(erlang:unique_integer([positive])),
    Dir = filename:join(filename:basedir(user_cache, "beam_stats"), Prefix ++ Unique),
    %% ensure_dir/1 creates the parents of its argument, hence the dummy
    %% leaf component to get Dir itself created.
    ok = filelib:ensure_dir(filename:join(Dir, "dummy")),
    Dir.
%% Recursively delete Dir by shelling out to rm -rf.
%% NOTE(review): the '~s' single-quoting breaks if Dir ever contains a
%% single quote. All callers pass paths from make_temp_dir/1 (digits and
%% a fixed prefix), so this is safe today, but file:del_dir_r/1 would
%% avoid the shell entirely if arbitrary paths are ever passed.
rm_rf(Dir) ->
    os:cmd(lists:flatten(io_lib:format("rm -rf '~s'", [Dir]))).
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env escript | |
| %% -*- erlang -*- | |
| -mode(compile). | |
%% Filter-script entry: expects exactly <csv_file> <min_repos>, where
%% min_repos must parse as a strictly positive integer; anything else
%% prints usage and exits 1.
main([CsvFile, MinReposStr]) ->
    case catch list_to_integer(MinReposStr) of
        MinRepos when is_integer(MinRepos), MinRepos > 0 ->
            run(CsvFile, MinRepos);
        _ ->
            io:format(standard_error, "Error: invalid min_repos value: ~s~n", [MinReposStr]),
            usage(),
            halt(1)
    end;
main(_) ->
    usage(),
    halt(1).
%% Print CLI help for the filter script to stdout.
usage() ->
    io:format(
        "Usage: beam_stats_filter.escript <csv_file> <min_repos>~n"
        "~n"
        "Extracts functions by OTP module used by at least <min_repos> repos.~n"
        "Only includes modules that exist in the running OTP installation.~n"
    ).
%% Read the stats CSV produced by beam_stats.escript, keep rows whose
%% repo_count >= MinRepos and whose module belongs to this OTP install,
%% group by module, and print each module's functions sorted by repo
%% count descending.
run(CsvFile, MinRepos) ->
    {ok, Bin} = file:read_file(CsvFile),
    %% First line is the CSV header; the rest are data rows.
    [_Header | DataLines] = string:split(binary_to_list(Bin), "\n", all),
    Rows = parse_rows(DataLines),
    %% Filter: repo_count >= MinRepos and module is an OTP module
    Filtered = lists:filter(fun({Mod, _Fun, _Arity, _Calls, RepoCount}) ->
        RepoCount >= MinRepos andalso is_otp_module(Mod)
    end, Rows),
    %% Group by module
    ByModule = lists:foldl(fun({Mod, Fun, Arity, Calls, RC}, Acc) ->
        maps:update_with(Mod, fun(L) -> [{Fun, Arity, Calls, RC} | L] end,
            [{Fun, Arity, Calls, RC}], Acc)
    end, #{}, Filtered),
    %% Sort modules alphabetically
    Modules = lists:sort(maps:to_list(ByModule)),
    TotalFuns = lists:sum([length(Funs) || {_, Funs} <- Modules]),
    io:format("OTP functions used by >= ~p repos: ~p functions across ~p modules~n~n",
        [MinRepos, TotalFuns, length(Modules)]),
    lists:foreach(fun({Mod, Funs}) ->
        %% Sort functions by repo_count descending
        Sorted = lists:sort(fun({_, _, _, RC1}, {_, _, _, RC2}) -> RC1 > RC2 end, Funs),
        io:format("~ts (~p functions):~n", [Mod, length(Sorted)]),
        lists:foreach(fun({Fun, Arity, _Calls, RC}) ->
            io:format(" ~ts/~p (~p repos)~n", [Fun, Arity, RC])
        end, Sorted),
        io:format("~n")
    end, Modules).
%% Parse CSV data lines into {Module, Function, Arity, Calls, RepoCount}
%% tuples (module/function as strings, the rest as integers).
%% Blank lines and rows with the wrong number of fields are dropped, and
%% — unlike the previous version, where one non-numeric arity/calls/repo
%% field crashed the whole run with badarg — rows with malformed numeric
%% fields are now dropped too.
parse_rows(Lines) ->
    lists:filtermap(fun parse_row/1, Lines).

%% One CSV line -> {true, Row} | false.
parse_row(Line) ->
    case string:trim(Line) of
        "" ->
            false;
        Trimmed ->
            case string:split(Trimmed, ",", all) of
                [ModStr, FunStr, ArityStr, CallsStr, RCStr] ->
                    case {to_int(ArityStr), to_int(CallsStr), to_int(RCStr)} of
                        {{ok, Arity}, {ok, Calls}, {ok, RC}} ->
                            {true, {ModStr, FunStr, Arity, Calls, RC}};
                        _ ->
                            false
                    end;
                _ ->
                    false
            end
    end.

%% Safe integer parse: {ok, Int} | error.
to_int(Str) ->
    try
        {ok, list_to_integer(Str)}
    catch
        error:badarg -> error
    end.
%% True if ModStr names a module shipped with the running OTP install.
%% Uses list_to_existing_atom/1 deliberately: it avoids filling the atom
%% table from arbitrary CSV input, and a module whose atom doesn't exist
%% in this VM can't be a loaded/known module anyway.
is_otp_module(ModStr) ->
    try
        Mod = list_to_existing_atom(ModStr),
        case code:which(Mod) of
            non_existing -> false;
            preloaded -> true;
            cover_compiled -> true;
            Path when is_list(Path) ->
                %% Check it's under the OTP lib dir (excludes user code
                %% that happens to be on the code path).
                OtpRoot = code:root_dir(),
                lists:prefix(OtpRoot, Path)
        end
    catch
        error:badarg ->
            %% Atom doesn't exist — module was never loaded/referenced
            false
    end.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment