Skip to content

Instantly share code, notes, and snippets.

@pguyot
Last active February 27, 2026 06:41
Show Gist options
  • Select an option

  • Save pguyot/da327972f1ecdb7041c97addd4e76bb5 to your computer and use it in GitHub Desktop.

Select an option

Save pguyot/da327972f1ecdb7041c97addd4e76bb5 to your computer and use it in GitHub Desktop.
Script that scans top GitHub repositories and Hex packages to collect statistics on Erlang/OTP function usage (Module:Function/Arity call counts).
#!/usr/bin/env escript
%% -*- erlang -*-
%%! +A 4
-mode(compile).
-include_lib("kernel/include/file.hrl").
%% ============================================================================
%% beam_stats.escript — Scan Erlang/OTP Function Usage
%%
%% Scans top GitHub Erlang repos and Hex packages to find the most commonly
%% called Module:Function/Arity patterns across the ecosystem.
%% ============================================================================
-define(STATE_FILE, "beam_stats_state.bin").
-define(SAVE_INTERVAL, 10).
-define(GITHUB_PER_PAGE, 100).
-define(GITHUB_MAX_PER_QUERY, 1000). %% GitHub search API hard limit
-define(HEX_PER_PAGE, 100).
-define(HEX_MAX_PAGES, 100). %% up to 10000 hex packages
%% Entry point.
%%
%% Child mode ("--scan Dir ResultFile"): invoked by scan_directory/2 in a
%% fresh VM. Scans Dir for .erl files and writes the resulting stats map to
%% ResultFile as a compressed external term, then halts.
main(["--scan", Dir, ResultFile]) ->
%% Child VM mode: scan directory, write results to file, exit.
%% Each invocation gets its own atom table.
%% compiler/syntax_tools provide epp_dodger and erl_syntax used by the
%% scanning code below.
application:ensure_started(compiler),
application:ensure_started(syntax_tools),
Stats = do_scan_directory(Dir),
file:write_file(ResultFile, term_to_binary(Stats, [compressed])),
halt(0);
%% Parent mode: parse CLI options and run the full scan; exits 1 on bad args.
main(Args) ->
case parse_args(Args) of
{error, Msg} ->
io:format(standard_error, "Error: ~s~n", [Msg]),
usage(),
halt(1);
Opts ->
run(Opts)
end.
%% ============================================================================
%% Arg parsing
%% ============================================================================
%% Parse command-line arguments into an options map, starting from the
%% defaults. Returns the map, {error, Msg}, or halts (for --help).
parse_args(Args) ->
parse_args(Args, default_opts()).
%% Default option map used as the seed for CLI parsing.
%% limit => infinity means "no cap on repos/packages scanned".
default_opts() ->
    maps:from_list([
        {workers, 4},
        {github, true},
        {hex, true},
        {output, "beam_stats.csv"},
        {top, 100},
        {limit, infinity},
        {resume, false}
    ]).
%% Fold command-line tokens into the options map.
%% Returns the final map, or {error, Msg} for unknown options or malformed
%% integer values; --help prints usage and halts.
parse_args([], Opts) ->
    Opts;
parse_args(["--workers", N | Rest], Opts) ->
    case parse_pos_int(N) of
        {ok, V} -> parse_args(Rest, Opts#{workers => V});
        error -> {error, "Invalid --workers value: " ++ N}
    end;
parse_args(["--github-only" | Rest], Opts) ->
    parse_args(Rest, Opts#{hex => false});
parse_args(["--hex-only" | Rest], Opts) ->
    parse_args(Rest, Opts#{github => false});
parse_args(["--output", File | Rest], Opts) ->
    parse_args(Rest, Opts#{output => File});
parse_args(["--top", N | Rest], Opts) ->
    case parse_pos_int(N) of
        {ok, V} -> parse_args(Rest, Opts#{top => V});
        error -> {error, "Invalid --top value: " ++ N}
    end;
parse_args(["--limit", N | Rest], Opts) ->
    case parse_pos_int(N) of
        {ok, V} -> parse_args(Rest, Opts#{limit => V});
        error -> {error, "Invalid --limit value: " ++ N}
    end;
parse_args(["--resume" | Rest], Opts) ->
    parse_args(Rest, Opts#{resume => true});
parse_args(["--help" | _], _Opts) ->
    usage(),
    halt(0);
parse_args([Unknown | _], _Opts) ->
    {error, "Unknown option: " ++ Unknown}.

%% Parse a strictly positive integer from a string: {ok, V} | error.
%% Replaces the original old-style `catch list_to_integer(N)` expression,
%% which conflates throws/exits with errors and loses the stacktrace;
%% try/catch on error:badarg is the idiomatic form.
parse_pos_int(S) ->
    try list_to_integer(S) of
        V when V > 0 -> {ok, V};
        _ -> error
    catch
        error:badarg -> error
    end.
%% Print CLI usage to standard output. Keep the option list in sync with
%% the parse_args/2 clauses above.
usage() ->
io:format(
"Usage: beam_stats.escript [OPTIONS]~n"
"~n"
"Options:~n"
" --workers N Number of parallel workers (default: 4)~n"
" --github-only Only scan GitHub repos~n"
" --hex-only Only scan Hex packages~n"
" --output FILE Output CSV file (default: beam_stats.csv)~n"
" --top N Show top N results in terminal (default: 100)~n"
" --limit N Max number of repos/packages to scan~n"
" --resume Resume from saved state~n"
" --help Show this help~n"
).
%% ============================================================================
%% Main run
%% ============================================================================
%% Main driver: start HTTP apps, load or initialize resume state, fetch the
%% work list from GitHub and Hex, dedupe it, run the worker pool, then
%% write CSV output and print a terminal summary.
run(Opts) ->
start_applications(),
io:format("beam_stats: Scanning Erlang/OTP function usage~n"),
io:format("Workers: ~p~n~n", [maps:get(workers, Opts)]),
%% Load or initialize state: {set of scanned work keys, stats map, count}.
{Scanned, Stats, TotalProcessed} = case maps:get(resume, Opts) of
true -> load_state();
false -> {sets:new([{version, 2}]), #{}, 0}
end,
%% Fetch work items. GitHub is fetched first so the remaining --limit
%% budget can be given to Hex.
Limit = maps:get(limit, Opts),
GithubRepos = case maps:get(github, Opts) of
true -> fetch_github_repos(Limit);
false -> []
end,
HexLeft = case Limit of
infinity -> infinity;
_ -> max(0, Limit - length(GithubRepos))
end,
HexPackages = case maps:get(hex, Opts) of
true -> fetch_hex_packages(HexLeft);
false -> []
end,
%% Deduplicate: drop Hex packages whose source repo is already in the
%% GitHub list.
{Repos, Packages} = deduplicate(GithubRepos, HexPackages),
io:format("~nWork items: ~p GitHub repos, ~p Hex packages~n",
[length(Repos), length(Packages)]),
%% Build work list: [{Type, Item}, ...]
Work0 = [{github, R} || R <- Repos] ++ [{hex, P} || P <- Packages],
%% Filter already scanned items (resume support).
Work = lists:filter(fun({Type, Item}) ->
Key = work_key(Type, Item),
not sets:is_element(Key, Scanned)
end, Work0),
io:format("Items to scan: ~p (skipping ~p already scanned)~n~n",
[length(Work), length(Work0) - length(Work)]),
%% Run coordinator + worker pool; blocks until all work is done.
FinalStats = run_coordinator(Work, Scanned, Stats, TotalProcessed, Opts),
%% Output
OutputFile = maps:get(output, Opts),
write_csv(OutputFile, FinalStats),
Top = maps:get(top, Opts),
print_summary(Top, FinalStats),
io:format("~nResults written to ~s~n", [OutputFile]).
%% ============================================================================
%% Applications
%% ============================================================================
%% Start the HTTP client stack (inets + ssl) and tune httpc connection
%% pooling. Start failures are deliberately ignored — the original matched
%% both {ok, _} and anything else to ok — so e.g. a GitHub-only scan (which
%% shells out to `gh` rather than using httpc) still works.
start_applications() ->
    _ = application:ensure_all_started(inets),
    _ = application:ensure_all_started(ssl),
    %% Configure httpc
    httpc:set_options([{max_sessions, 8}, {max_keep_alive_length, 16}]),
    ok.
%% ============================================================================
%% GitHub repo fetching via gh CLI
%% ============================================================================
%% Fetch up to Limit Erlang repos from GitHub, most-starred first.
%% Requires the `gh` CLI to be installed and authenticated (see
%% fetch_github_pages/5, which shells out to `gh api`).
fetch_github_repos(Limit) ->
io:format("Fetching GitHub repos...~n"),
%% Cursor-based pagination using star ranges. Fetch the top 1000 by stars,
%% then use the lowest star count as the upper bound for the next batch.
%% This bypasses GitHub's 1000-result-per-query limit with minimal API calls.
Max = case Limit of
infinity -> ?GITHUB_MAX_PER_QUERY * 15; %% ~15000 repos max
_ -> Limit
end,
Repos = fetch_github_cursor(infinity, [], Max),
io:format(" Total: ~p GitHub repos~n", [length(Repos)]),
Repos.
%% Star-range cursor loop: repeatedly query "stars:1..HiStars" and lower
%% HiStars below the minimum star count seen, until Max repos are collected,
%% results run out, or no progress is made.
%% NOTE(review): length(Acc) in a guard is O(n) per clause check; fine at
%% this scale (<= ~15000 items) but a count accumulator would be cheaper.
fetch_github_cursor(_HiStars, Acc, Max) when length(Acc) >= Max ->
lists:sublist(Acc, Max);
fetch_github_cursor(HiStars, Acc, Max) ->
%% infinity means "no upper bound yet" (first batch).
Range = case HiStars of
infinity -> ">=1";
N -> lists:flatten(io_lib:format("1..~p", [N]))
end,
Remaining = Max - length(Acc),
Fetch = min(Remaining, ?GITHUB_MAX_PER_QUERY),
io:format(" stars:~s ...", [Range]),
{Repos, TotalCount} = fetch_github_query(Range, Fetch),
io:format(" ~p repos (of ~p available)~n", [length(Repos), TotalCount]),
case Repos of
[] ->
Acc;
_ ->
NewAcc = Acc ++ Repos,
%% Next cursor: strictly below the lowest star count in this batch.
MinStars = lists:min([maps:get(stars, R) || R <- Repos]),
NextHi = MinStars - 1,
case length(NewAcc) >= Max of
true ->
lists:sublist(NewAcc, Max);
false when NextHi < 1 ->
%% Can't go lower than 1 star
NewAcc;
false when HiStars =/= infinity, NextHi >= HiStars ->
%% No progress (all repos have same star count), stop
NewAcc;
false ->
fetch_github_cursor(NextHi, NewAcc, Max)
end
end.
%% Fetch up to Max repos for a single query (max 1000).
%% Returns {Repos, TotalCount}.
%% StarRange is a GitHub search qualifier value such as ">=1" or "1..500".
fetch_github_query(StarRange, Max) ->
Query = lists:flatten("language:Erlang stars:" ++ StarRange),
Limit = min(Max, ?GITHUB_MAX_PER_QUERY),
fetch_github_pages(Query, 1, [], Limit, 0).
%% Page through a single GitHub search query via `gh api`, accumulating
%% repo maps (full_name/clone_url/html_url/stars). TC carries the API's
%% reported total_count. Stops at Max results, at GitHub's hard 1000-result
%% page ceiling, on empty output from gh, or on undecodable JSON.
fetch_github_pages(_Query, _Page, Acc, Max, TC) when length(Acc) >= Max ->
{lists:sublist(lists:reverse(Acc), Max), TC};
fetch_github_pages(_Query, Page, Acc, _Max, TC) when Page > (?GITHUB_MAX_PER_QUERY div ?GITHUB_PER_PAGE) ->
{lists:reverse(Acc), TC};
fetch_github_pages(Query, Page, Acc, Max, TC) ->
%% Shell out to the gh CLI; stderr is discarded so auth/rate-limit noise
%% does not pollute the parsed output.
Cmd = io_lib:format(
"gh api search/repositories "
"-X GET "
"-f q='~s' "
"-f sort='stars' "
"-f order='desc' "
"-f per_page='~p' "
"-f page='~p' "
"2>/dev/null",
[Query, ?GITHUB_PER_PAGE, Page]
),
case os:cmd(lists:flatten(Cmd)) of
[] ->
{lists:reverse(Acc), TC};
Output ->
%% NOTE(review): old-style `catch` swallows all error classes from
%% json:decode; a try/catch on error:_ would be the idiomatic form.
case catch json:decode(unicode:characters_to_binary(Output)) of
#{<<"total_count">> := NewTC, <<"items">> := Items} when is_list(Items), length(Items) > 0 ->
Repos = lists:map(fun(Item) ->
#{
full_name => binary_to_list(maps:get(<<"full_name">>, Item)),
clone_url => binary_to_list(maps:get(<<"clone_url">>, Item)),
html_url => binary_to_list(maps:get(<<"html_url">>, Item)),
stars => maps:get(<<"stargazers_count">>, Item, 0)
}
end, Items),
%% Acc is built newest-first; reverse each page before prepending
%% so the final lists:reverse/1 restores API order.
fetch_github_pages(Query, Page + 1, lists:reverse(Repos) ++ Acc, Max, NewTC);
_ ->
{lists:reverse(Acc), TC}
end
end.
%% ============================================================================
%% Hex package fetching via httpc
%% ============================================================================
%% Fetch up to Limit packages from the Hex API, most-downloaded first,
%% capped at ?HEX_MAX_PAGES * ?HEX_PER_PAGE overall.
fetch_hex_packages(Limit) ->
Max = case Limit of
infinity -> ?HEX_MAX_PAGES * ?HEX_PER_PAGE;
_ -> min(Limit, ?HEX_MAX_PAGES * ?HEX_PER_PAGE)
end,
io:format("Fetching Hex packages (up to ~p)...~n", [Max]),
fetch_hex_pages(1, [], Max).
%% Page through https://hex.pm/api/packages, turning each JSON item into a
%% map with name/version/github_url/downloads. Packages without a release
%% version are dropped. Stops at the page cap, at Max collected packages,
%% on an empty page, or on any HTTP/JSON error (returning what was gathered).
fetch_hex_pages(Page, Acc, Max) when Page > ?HEX_MAX_PAGES; length(Acc) >= Max ->
Packages = lists:sublist(lists:reverse(Acc), Max),
io:format(" Found ~p Hex packages~n", [length(Packages)]),
Packages;
fetch_hex_pages(Page, Acc, Max) ->
Url = io_lib:format(
"https://hex.pm/api/packages?sort=total_downloads&per_page=~p&page=~p",
[?HEX_PER_PAGE, Page]
),
case http_get(lists:flatten(Url)) of
{ok, Body} ->
%% NOTE(review): old-style `catch` around json:decode — a try/catch
%% on error:_ would be the idiomatic form.
case catch json:decode(Body) of
Items when is_list(Items), length(Items) > 0 ->
Packages = lists:filtermap(fun(Item) ->
Name = binary_to_list(maps:get(<<"name">>, Item, <<>>)),
Meta = maps:get(<<"meta">>, Item, #{}),
Links = maps:get(<<"links">>, Meta, #{}),
GithubUrl = find_github_link(Links),
%% The API lists releases newest-first; take the head as latest.
%% TODO confirm ordering guarantee against the Hex API docs.
Releases = maps:get(<<"releases">>, Item, []),
LatestVersion = case Releases of
[#{<<"version">> := V} | _] -> binary_to_list(V);
_ -> ""
end,
Downloads = maps:get(<<"downloads">>, Item, #{}),
TotalDownloads = case Downloads of
#{<<"all">> := D} -> D;
_ -> 0
end,
case LatestVersion of
"" -> false;
_ ->
{true, #{
name => Name,
version => LatestVersion,
github_url => GithubUrl,
downloads => TotalDownloads
}}
end
end, Items),
io:format(" Page ~p: ~p packages~n", [Page, length(Packages)]),
fetch_hex_pages(Page + 1, lists:reverse(Packages) ++ Acc, Max);
Items when is_list(Items), Items =:= [] ->
io:format(" Page ~p: no more results~n", [Page]),
lists:reverse(Acc);
Error ->
io:format(" Page ~p: JSON decode error: ~p~n", [Page, Error]),
lists:reverse(Acc)
end;
{error, Reason} ->
io:format(" Page ~p: HTTP error: ~p~n", [Page, Reason]),
lists:reverse(Acc)
end.
%% Pick a GitHub URL out of a Hex "links" map (label => URL binary).
%% Returns the URL as a string, or "" when no value mentions "github.com"
%% or the argument is not a map. Map fold order is undefined, so with
%% several GitHub links any one of them may be returned.
find_github_link(Links) when is_map(Links) ->
    Pick = fun
        (_Label, UrlBin, "") ->
            Url = binary_to_list(UrlBin),
            case string:find(Url, "github.com") of
                nomatch -> "";
                _Found -> Url
            end;
        (_Label, _UrlBin, AlreadyFound) ->
            %% A match was already found on an earlier key; keep it.
            AlreadyFound
    end,
    maps:fold(Pick, "", Links);
find_github_link(_NotAMap) ->
    "".
%% ============================================================================
%% Deduplication
%% ============================================================================
%% Remove Hex packages whose GitHub link points at a repo that is already
%% in the GitHub work list, so no project is scanned twice.
%% Returns {GithubRepos unchanged, filtered HexPackages}.
deduplicate(GithubRepos, HexPackages) ->
    %% Set of normalized GitHub URLs for everything fetched from GitHub.
    KnownUrls = sets:from_list(
        [normalize_github_url(maps:get(html_url, Repo)) || Repo <- GithubRepos],
        [{version, 2}]),
    %% Keep a package when it has no GitHub link, or its link is unknown.
    KeepPackage = fun(Pkg) ->
        case maps:get(github_url, Pkg) of
            "" -> true;
            Url -> not sets:is_element(normalize_github_url(Url), KnownUrls)
        end
    end,
    {GithubRepos, lists:filter(KeepPackage, HexPackages)}.
%% Canonicalize a GitHub URL for comparison: drop the http(s) scheme,
%% a trailing ".git", trailing slashes, and lowercase the rest.
normalize_github_url(Url) ->
    WithoutScheme = re:replace(Url, "^https?://", "", [{return, list}]),
    WithoutGitSuffix = re:replace(WithoutScheme, "\\.git$", "", [{return, list}]),
    string:lowercase(string:trim(WithoutGitSuffix, trailing, "/")).
%% ============================================================================
%% Coordinator + Worker Pool
%% ============================================================================
%% Spawn one coordinator process plus NumWorkers worker processes, then
%% block until the coordinator reports the merged final stats.
%% All processes are linked to the caller, so a crash anywhere takes the
%% whole script down (acceptable for a batch tool with resumable state).
run_coordinator(Work, Scanned, Stats, TotalProcessed, Opts) ->
NumWorkers = maps:get(workers, Opts),
%% Total including already-processed items, for the progress display.
TotalWork = length(Work) + TotalProcessed,
Self = self(),
CoordPid = spawn_link(fun() ->
coordinator_loop(#{
work => Work,
scanned => Scanned,
stats => Stats,
total_processed => TotalProcessed,
total_work => TotalWork,
since_save => 0,
active_workers => NumWorkers,
parent => Self
})
end),
%% Spawn workers
lists:foreach(fun(_) ->
spawn_link(fun() -> worker_loop(CoordPid) end)
end, lists:seq(1, NumWorkers)),
%% Wait for coordinator to finish
receive
{coordinator_done, FinalStats} -> FinalStats
end.
%% Coordinator process: hands out work items on {get_work, Pid}, merges
%% {result, Key, Stats} messages into the global state (saving it every
%% ?SAVE_INTERVAL results), and counts workers out via {worker_done, Pid}.
%% When the last worker exits, it saves state and sends
%% {coordinator_done, Stats} to the parent.
coordinator_loop(State) ->
receive
{get_work, WorkerPid} ->
case maps:get(work, State) of
[] ->
WorkerPid ! no_more_work,
coordinator_loop(State);
[Item | Rest] ->
WorkerPid ! {work, Item},
coordinator_loop(State#{work => Rest})
end;
{result, Key, RepoStats} ->
#{
scanned := Scanned,
stats := Stats,
total_processed := TP,
total_work := TW,
since_save := SS
} = State,
NewScanned = sets:add_element(Key, Scanned),
NewStats = merge_repo_stats(RepoStats, Stats),
NewTP = TP + 1,
NewSS = SS + 1,
%% \r rewrites the same terminal line to animate progress in place.
io:format("\r Progress: ~p/~p (~.1f%) ",
[NewTP, TW, NewTP / max(1, TW) * 100]),
%% Save state periodically so --resume loses at most ?SAVE_INTERVAL
%% items of work.
NewSS2 = case NewSS >= ?SAVE_INTERVAL of
true ->
save_state(NewScanned, NewStats, NewTP),
0;
false ->
NewSS
end,
coordinator_loop(State#{
scanned => NewScanned,
stats => NewStats,
total_processed => NewTP,
since_save => NewSS2
});
{worker_done, _WorkerPid} ->
#{
active_workers := AW,
stats := Stats,
scanned := Scanned,
total_processed := TP,
parent := Parent
} = State,
NewAW = AW - 1,
case NewAW of
0 ->
%% All workers done — save final state and report
save_state(Scanned, Stats, TP),
io:format("~n"),
Parent ! {coordinator_done, Stats};
_ ->
coordinator_loop(State#{active_workers => NewAW})
end
end.
%% Worker process: pull one item from the coordinator, process it (GitHub
%% clone or Hex tarball), report the per-repo stats, and repeat until
%% no_more_work. Any exception during processing is logged and reported
%% as an empty stats map so one bad repo cannot kill the pool.
worker_loop(CoordPid) ->
CoordPid ! {get_work, self()},
receive
{work, {Type, Item}} ->
Key = work_key(Type, Item),
RepoStats = try
case Type of
github -> process_github_repo(Item);
hex -> process_hex_package(Item)
end
catch
%% _:Reason catches errors, throws and exits alike.
_:Reason ->
io:format("~n Error processing ~s: ~p~n", [Key, Reason]),
#{}
end,
CoordPid ! {result, Key, RepoStats},
worker_loop(CoordPid);
no_more_work ->
CoordPid ! {worker_done, self()},
ok
end.
%% Stable string identity for a work item ("github:Owner/Repo" or
%% "hex:package"), used for dedup and resume bookkeeping.
work_key(github, #{full_name := RepoFullName}) ->
    lists:append("github:", RepoFullName);
work_key(hex, #{name := PackageName}) ->
    lists:append("hex:", PackageName).
%% ============================================================================
%% Process GitHub repo
%% ============================================================================
%% Shallow-clone a GitHub repo into a temp dir, scan it for call stats,
%% and always clean the temp dir up afterwards. Returns the stats map
%% (#{} on clone failure).
process_github_repo(Repo) ->
    CloneUrl = maps:get(clone_url, Repo),
    FullName = maps:get(full_name, Repo),
    TmpDir = make_temp_dir("gh_"),
    try
        %% Shallow clone; stderr is discarded.
        %% NOTE(review): CloneUrl comes from the GitHub API and is
        %% single-quoted here — a URL containing a quote would break the
        %% shell command; confirm whether that is a concern.
        Cmd = io_lib:format(
            "git clone --depth 1 --quiet '~s' '~s' 2>/dev/null",
            [CloneUrl, TmpDir]
        ),
        %% The command output is irrelevant; success is judged by the
        %% directory check below. The original wrapped this in a
        %% one-branch `case ... of _ ->`, which is a no-op construct —
        %% `_ = os:cmd(...)` says the same thing directly.
        _ = os:cmd(lists:flatten(Cmd)),
        case filelib:is_dir(TmpDir) of
            true ->
                scan_directory(TmpDir, FullName);
            false ->
                #{}
        end
    after
        rm_rf(TmpDir)
    end.
%% ============================================================================
%% Process Hex package
%% ============================================================================
%% Download a Hex package tarball (Name-Version.tar from repo.hex.pm) and
%% scan its sources. Returns the stats map, or #{} on download failure.
process_hex_package(Package) ->
Name = maps:get(name, Package),
Version = maps:get(version, Package),
Url = lists:flatten(io_lib:format(
"https://repo.hex.pm/tarballs/~s-~s.tar",
[Name, Version]
)),
case http_get_binary(Url) of
{ok, TarBin} ->
process_hex_tarball(TarBin, Name);
{error, Reason} ->
io:format("~n Failed to download ~s-~s: ~p~n", [Name, Version, Reason]),
#{}
end.
%% Unpack a Hex package tarball in memory. Hex tarballs are an outer tar
%% whose "contents.tar.gz" entry holds the actual sources; the table of
%% contents is inspected first so packages with no .erl files (pure Elixir,
%% docs-only, etc.) are skipped without touching the filesystem.
%% Returns the stats map, or #{} when there is nothing to scan or
%% extraction fails.
process_hex_tarball(TarBin, Name) ->
%% Extract outer tar from binary
case erl_tar:extract({binary, TarBin}, [memory]) of
{ok, OuterFiles} ->
%% Find contents.tar.gz
case lists:keyfind("contents.tar.gz", 1, OuterFiles) of
{"contents.tar.gz", ContentsTarGz} ->
%% Check if there are .erl files before extracting
case erl_tar:table({binary, ContentsTarGz}, [compressed]) of
{ok, FileList} ->
HasErl = lists:any(fun(F) ->
filename:extension(F) =:= ".erl"
end, FileList),
case HasErl of
true ->
extract_and_scan_hex(ContentsTarGz, Name);
false ->
#{}
end;
_ ->
#{}
end;
false ->
#{}
end;
{error, Reason} ->
io:format("~n Failed to extract ~s tarball: ~p~n", [Name, Reason]),
#{}
end.
%% Extract a package's contents.tar.gz into a temp dir, scan it, and
%% always clean up the temp dir. Returns the stats map (#{} on failure).
extract_and_scan_hex(ContentsTarGz, Name) ->
TmpDir = make_temp_dir("hex_"),
try
case erl_tar:extract({binary, ContentsTarGz}, [{cwd, TmpDir}, compressed]) of
ok ->
scan_directory(TmpDir, "hex:" ++ Name);
{error, Reason} ->
io:format("~n Failed to extract ~s contents: ~p~n", [Name, Reason]),
#{}
end
after
rm_rf(TmpDir)
end.
%% ============================================================================
%% Directory scanning — spawns a separate VM to avoid atom table exhaustion
%% ============================================================================
%% Scan Dir for call statistics by re-invoking this very escript in a
%% child VM (main/1's "--scan" clause). The child writes its result as a
%% compressed term file inside Dir, which is read back and decoded here.
%% Running each scan in a fresh VM bounds atom-table growth from parsing
%% arbitrary third-party sources. Returns #{} on any failure.
scan_directory(Dir, _Label) ->
Script = escript:script_name(),
%% Result file lives inside Dir (it has a .bin extension, so the child's
%% .erl scan never picks it up) and is deleted after reading.
ResultFile = filename:join(Dir, ".beam_stats_result.bin"),
Cmd = lists:flatten(io_lib:format(
"escript '~s' --scan '~s' '~s'",
[Script, Dir, ResultFile]
)),
os:cmd(Cmd),
case file:read_file(ResultFile) of
{ok, Bin} ->
file:delete(ResultFile),
%% The file was written by our own child VM; decode failures just
%% yield an empty result rather than crashing the worker.
try binary_to_term(Bin)
catch _:_ -> #{}
end;
{error, _} ->
#{}
end.
%% ============================================================================
%% Scanning code (runs in child VM via --scan)
%% ============================================================================
%% Child-VM entry: fold every .erl file under Dir into a single map of
%% {Module, Function, Arity} => call count. Files that fail to parse are
%% silently skipped.
do_scan_directory(Dir) ->
    Accumulate = fun(File, Acc) ->
        case parse_file(File) of
            {ok, Calls} -> merge_file_calls(Calls, Acc);
            {error, _Reason} -> Acc
        end
    end,
    lists:foldl(Accumulate, #{}, find_erl_files(Dir)).
%% filelib:fold_files follows symlinks and can loop forever on repos
%% with circular symlinks (e.g. _checkouts/dep -> ../../..).
%% This version skips symlinks.
%% Recursively collect every regular *.erl file under Dir.
%% Uses file:read_link_info/1 (which does NOT follow links), so symlinks —
%% including circular ones like _checkouts/dep -> ../../.. — are skipped,
%% as are unreadable entries.
find_erl_files(RootDir) ->
    find_erl_files(RootDir, []).

find_erl_files(Dir, Acc0) ->
    case file:list_dir(Dir) of
        {error, _Reason} ->
            %% Unreadable directory: keep whatever was gathered so far.
            Acc0;
        {ok, Entries} ->
            lists:foldl(
                fun(Entry, Acc) ->
                    collect_entry(filename:join(Dir, Entry), Acc)
                end,
                Acc0, Entries)
    end.

%% Dispatch on the (non-followed) file type of a single directory entry.
collect_entry(Path, Acc) ->
    case file:read_link_info(Path) of
        {ok, #file_info{type = directory}} ->
            find_erl_files(Path, Acc);
        {ok, #file_info{type = regular}} ->
            case filename:extension(Path) of
                ".erl" -> [Path | Acc];
                _ -> Acc
            end;
        _SymlinkOrError ->
            %% symlink, device, or stat error — skip
            Acc
    end.
%% Parse one .erl file with epp_dodger (tolerant of unexpandable macros,
%% unlike epp) and extract its remote-call counts.
%% Returns {ok, #{MFA => Count}} or {error, Reason}. The outer try converts
%% any crash while walking the syntax tree into {error, _} so one broken
%% file cannot abort the whole directory scan.
parse_file(File) ->
try
case epp_dodger:parse_file(File) of
{ok, Forms} ->
Calls = lists:foldl(fun(Form, Acc) ->
extract_calls(Form, Acc)
end, #{}, Forms),
{ok, Calls};
{error, Reason} ->
{error, Reason}
end
catch
_:Err ->
{error, Err}
end.
%% Walk one top-level form, accumulating counts for the two node types
%% that name a remote function: applications (m:f(Args), or f(Args) for
%% BIFs) and implicit funs (fun m:f/A).
extract_calls(Form, Acc) ->
erl_syntax_lib:fold(fun(Node, A) ->
case erl_syntax:type(Node) of
application ->
extract_application_call(Node, A);
implicit_fun ->
extract_implicit_fun(Node, A);
_ ->
A
end
end, Acc, Form).
%% Count a function application node.
%% Mod:Fun(Args) with literal atoms counts as {Mod, Fun, Arity}.
%% A bare Fun(Args) counts as {erlang, Fun, Arity} only when it is an
%% auto-imported BIF (per erl_internal:bif/2); other local calls are
%% ignored, as are dynamic calls (variable module or function).
extract_application_call(Node, Acc) ->
Op = erl_syntax:application_operator(Node),
Args = erl_syntax:application_arguments(Node),
Arity = length(Args),
case erl_syntax:type(Op) of
module_qualifier ->
ModNode = erl_syntax:module_qualifier_argument(Op),
FunNode = erl_syntax:module_qualifier_body(Op),
case {erl_syntax:type(ModNode), erl_syntax:type(FunNode)} of
{atom, atom} ->
Mod = erl_syntax:atom_value(ModNode),
Fun = erl_syntax:atom_value(FunNode),
Key = {Mod, Fun, Arity},
maps:update_with(Key, fun(V) -> V + 1 end, 1, Acc);
_ ->
%% Dynamic module or function — not statically countable.
Acc
end;
atom ->
Fun = erl_syntax:atom_value(Op),
case erl_internal:bif(Fun, Arity) of
true ->
Key = {erlang, Fun, Arity},
maps:update_with(Key, fun(V) -> V + 1 end, 1, Acc);
false ->
%% NOTE: a local function that shadows a BIF name would be
%% miscounted as erlang:F/A by the clause above — acceptable
%% noise for ecosystem-wide statistics.
Acc
end;
_ ->
Acc
end.
%% Count an implicit fun node of the form `fun Mod:Fun/Arity` with literal
%% module, function and arity. Local funs (`fun f/1`) and dynamic forms
%% (`fun M:F/A` with variables) are ignored.
extract_implicit_fun(Node, Acc) ->
Name = erl_syntax:implicit_fun_name(Node),
case erl_syntax:type(Name) of
module_qualifier ->
ModNode = erl_syntax:module_qualifier_argument(Name),
Body = erl_syntax:module_qualifier_body(Name),
case erl_syntax:type(Body) of
arity_qualifier ->
FunNode = erl_syntax:arity_qualifier_body(Body),
ArityNode = erl_syntax:arity_qualifier_argument(Body),
case {erl_syntax:type(ModNode), erl_syntax:type(FunNode), erl_syntax:type(ArityNode)} of
{atom, atom, integer} ->
Mod = erl_syntax:atom_value(ModNode),
Fun = erl_syntax:atom_value(FunNode),
Arity = erl_syntax:integer_value(ArityNode),
Key = {Mod, Fun, Arity},
maps:update_with(Key, fun(V) -> V + 1 end, 1, Acc);
_ ->
Acc
end;
_ ->
Acc
end;
_ ->
Acc
end.
%% Add one file's per-MFA call counts into the repo-wide accumulator map.
merge_file_calls(FileCalls, RepoAcc) ->
    Add = fun(MFA, Count, Acc) ->
        Acc#{MFA => maps:get(MFA, Acc, 0) + Count}
    end,
    maps:fold(Add, RepoAcc, FileCalls).
%% ============================================================================
%% Stats merging
%% ============================================================================
%% Merge one repo's call counts into the global stats map.
%% Global values are {TotalCalls, RepoCount}: each MFA seen in the repo
%% adds its call count to TotalCalls and bumps RepoCount by exactly one.
merge_repo_stats(RepoStats, GlobalStats) ->
    Bump = fun(MFA, CallCount, Acc) ->
        case Acc of
            #{MFA := {TotalCalls, RepoCount}} ->
                Acc#{MFA := {TotalCalls + CallCount, RepoCount + 1}};
            _ ->
                Acc#{MFA => {CallCount, 1}}
        end
    end,
    maps:fold(Bump, GlobalStats, RepoStats).
%% ============================================================================
%% State persistence
%% ============================================================================
%% Persist resume state atomically: write to a temp file first, then
%% rename over ?STATE_FILE so a crash mid-write never leaves a truncated
%% state file. The tuple is versioned (beam_stats_v1) so load_state/0 can
%% reject incompatible files.
save_state(Scanned, Stats, TotalProcessed) ->
State = {beam_stats_v1, Scanned, Stats, TotalProcessed},
TmpFile = ?STATE_FILE ++ ".tmp",
ok = file:write_file(TmpFile, term_to_binary(State, [compressed])),
ok = file:rename(TmpFile, ?STATE_FILE).
%% Load resume state saved by save_state/3.
%% Returns {ScannedSet, StatsMap, TotalProcessed}; any missing, unreadable,
%% corrupt, or wrong-version state file falls back to a fresh empty state
%% (with a message explaining why).
%% The original used the old-style `catch binary_to_term(Bin)` expression,
%% which conflates all exception classes and loses the stacktrace; this
%% version catches only the error:badarg that a corrupt term raises.
load_state() ->
    case file:read_file(?STATE_FILE) of
        {ok, Bin} ->
            Decoded =
                try binary_to_term(Bin)
                catch error:badarg -> corrupt
                end,
            case Decoded of
                {beam_stats_v1, Scanned, Stats, TotalProcessed} ->
                    io:format("Resumed state: ~p items already scanned~n", [TotalProcessed]),
                    {Scanned, Stats, TotalProcessed};
                _ ->
                    io:format("Warning: Invalid state file, starting fresh~n"),
                    fresh_state()
            end;
        {error, enoent} ->
            io:format("No state file found, starting fresh~n"),
            fresh_state();
        {error, Reason} ->
            io:format("Warning: Could not read state file (~p), starting fresh~n", [Reason]),
            fresh_state()
    end.

%% Empty resume state: no scanned keys, no stats, zero processed.
fresh_state() ->
    {sets:new([{version, 2}]), #{}, 0}.
%% ============================================================================
%% CSV output
%% ============================================================================
%% Write the full stats map to File as CSV (sorted by repo count, see
%% sort_stats/1), one row per MFA plus a header. Module and function are
%% atoms printed with ~ts; counts with ~p. Returns file:close/1's result.
write_csv(File, Stats) ->
Sorted = sort_stats(Stats),
{ok, Fd} = file:open(File, [write, {encoding, utf8}]),
io:format(Fd, "module,function,arity,total_calls,repo_count~n", []),
lists:foreach(fun({{Mod, Fun, Arity}, {TotalCalls, RepoCount}}) ->
io:format(Fd, "~ts,~ts,~p,~p,~p~n", [Mod, Fun, Arity, TotalCalls, RepoCount])
end, Sorted),
file:close(Fd).
%% ============================================================================
%% Terminal summary
%% ============================================================================
%% Print a ranked table of the Top most-used MFAs to the terminal,
%% ordered like the CSV (by repo count, via sort_stats/1).
print_summary(Top, Stats) ->
Sorted = sort_stats(Stats),
TopN = lists:sublist(Sorted, Top),
io:format("~n"),
io:format("~s~n", [string:copies("=", 78)]),
%% min/2 handles the case of fewer results than requested.
io:format(" Top ~p Most Used Erlang/OTP Functions~n", [min(Top, length(TopN))]),
io:format("~s~n", [string:copies("=", 78)]),
io:format("~4s ~-40s ~10s ~10s~n", ["#", "Module:Function/Arity", "Repos", "Calls"]),
io:format("~s~n", [string:copies("-", 78)]),
%% foldl threads the 1-based rank index through the rows.
lists:foldl(fun({{Mod, Fun, Arity}, {TotalCalls, RepoCount}}, Idx) ->
MFA = io_lib:format("~ts:~ts/~p", [Mod, Fun, Arity]),
io:format("~4p ~-40ts ~10p ~10p~n", [Idx, lists:flatten(MFA), RepoCount, TotalCalls]),
Idx + 1
end, 1, TopN),
io:format("~s~n", [string:copies("=", 78)]),
io:format("Total unique MFAs: ~p~n", [maps:size(Stats)]).
%% Sort {MFA, {TotalCalls, RepoCount}} entries by repo count, descending,
%% with total calls (descending) as a deterministic tie-break.
%% The original comparator used strict `RC1 > RC2`, which violates the
%% lists:sort/2 contract (the ordering fun must return true when A =< B,
%% i.e. be reflexive) and left the order of equal-count entries
%% nondeterministic across runs.
sort_stats(Stats) ->
    Compare = fun({_, {TC1, RC1}}, {_, {TC2, RC2}}) ->
        {RC1, TC1} >= {RC2, TC2}
    end,
    lists:sort(Compare, maps:to_list(Stats)).
%% ============================================================================
%% HTTP helpers
%% ============================================================================
%% HTTP GET with a 30s request / 10s connect timeout.
%% Returns {ok, BodyBinary} on 200, {error, {http_status, Code}} on any
%% other status, or {error, Reason} on transport failure.
%% NOTE(review): {verify, verify_none} disables TLS certificate
%% verification — acceptable for pulling public stats, but confirm this
%% is intentional before reusing the helper elsewhere.
http_get(Url) ->
case httpc:request(get, {Url, [{"user-agent", "beam_stats/1.0"}]},
[{timeout, 30000}, {connect_timeout, 10000},
{ssl, [{verify, verify_none}]}],
[{body_format, binary}]) of
{ok, {{_, 200, _}, _, Body}} ->
{ok, Body};
{ok, {{_, Code, _}, _, _}} ->
{error, {http_status, Code}};
{error, Reason} ->
{error, Reason}
end.
%% Alias for http_get/1 (the body is already binary via body_format);
%% kept as a separate name to document intent at tarball-download sites.
http_get_binary(Url) ->
http_get(Url).
%% ============================================================================
%% Temp dir / cleanup helpers
%% ============================================================================
%% Create (and return the path of) a unique scratch directory under the
%% user cache dir, e.g. ~/.cache/beam_stats/gh_123 on Linux.
%% erlang:unique_integer/1 guarantees per-VM uniqueness of the suffix.
make_temp_dir(Prefix) ->
    Suffix = integer_to_list(erlang:unique_integer([positive])),
    Dir = filename:join(
        filename:basedir(user_cache, "beam_stats"),
        Prefix ++ Suffix
    ),
    %% ensure_dir/1 creates all parents of its argument, so joining a
    %% dummy leaf makes it create Dir itself.
    ok = filelib:ensure_dir(filename:join(Dir, "dummy")),
    Dir.
%% Recursively delete Dir; a missing directory is not an error.
%% The original shelled out to `rm -rf '...'` via os:cmd, which spawns a
%% shell per cleanup and is quote-injectable should the path ever contain
%% a single quote. file:del_dir_r/1 (OTP 23+; this file already relies on
%% OTP 27 features such as the json module) does the same job in-VM.
%% All callers invoke this in `after` clauses and ignore the result, so
%% changing the return from os:cmd's output string to ok | {error, Reason}
%% is safe.
rm_rf(Dir) ->
    case file:del_dir_r(Dir) of
        ok -> ok;
        {error, enoent} -> ok;
        {error, _} = Error -> Error
    end.
#!/usr/bin/env escript
%% -*- erlang -*-
-mode(compile).
%% Entry point of the filter script: beam_stats_filter.escript CSV MIN.
%% Validates that MIN is a positive integer, then runs the report;
%% exits 1 with usage on any argument error.
main([CsvFile, MinReposStr]) ->
case catch list_to_integer(MinReposStr) of
MinRepos when is_integer(MinRepos), MinRepos > 0 ->
run(CsvFile, MinRepos);
_ ->
io:format(standard_error, "Error: invalid min_repos value: ~s~n", [MinReposStr]),
usage(),
halt(1)
end;
main(_) ->
usage(),
halt(1).
%% Print CLI usage for the filter script to standard output.
usage() ->
io:format(
"Usage: beam_stats_filter.escript <csv_file> <min_repos>~n"
"~n"
"Extracts functions by OTP module used by at least <min_repos> repos.~n"
"Only includes modules that exist in the running OTP installation.~n"
).
%% Read the beam_stats CSV, keep rows with repo_count >= MinRepos whose
%% module ships with the local OTP installation, group them by module, and
%% print a per-module report (functions sorted by repo count, descending).
run(CsvFile, MinRepos) ->
{ok, Bin} = file:read_file(CsvFile),
%% Drop the "module,function,arity,..." header row.
[_Header | DataLines] = string:split(binary_to_list(Bin), "\n", all),
Rows = parse_rows(DataLines),
%% Filter: repo_count >= MinRepos and module is an OTP module
Filtered = lists:filter(fun({Mod, _Fun, _Arity, _Calls, RepoCount}) ->
RepoCount >= MinRepos andalso is_otp_module(Mod)
end, Rows),
%% Group by module
ByModule = lists:foldl(fun({Mod, Fun, Arity, Calls, RC}, Acc) ->
maps:update_with(Mod, fun(L) -> [{Fun, Arity, Calls, RC} | L] end,
[{Fun, Arity, Calls, RC}], Acc)
end, #{}, Filtered),
%% Sort modules alphabetically
Modules = lists:sort(maps:to_list(ByModule)),
%% Print
TotalFuns = lists:sum([length(Funs) || {_, Funs} <- Modules]),
io:format("OTP functions used by >= ~p repos: ~p functions across ~p modules~n~n",
[MinRepos, TotalFuns, length(Modules)]),
lists:foreach(fun({Mod, Funs}) ->
%% Sort functions by repo_count descending
Sorted = lists:sort(fun({_, _, _, RC1}, {_, _, _, RC2}) -> RC1 > RC2 end, Funs),
io:format("~ts (~p functions):~n", [Mod, length(Sorted)]),
lists:foreach(fun({Fun, Arity, _Calls, RC}) ->
io:format(" ~ts/~p (~p repos)~n", [Fun, Arity, RC])
end, Sorted),
io:format("~n")
end, Modules).
%% Parse CSV data lines into {Module, Function, Arity, Calls, RepoCount}
%% tuples (module/function as strings, the rest as integers).
%% Blank lines and rows without exactly five comma-separated fields are
%% skipped. Rows whose numeric fields fail to parse are now also skipped:
%% the original let list_to_integer/1 raise badarg, so one malformed line
%% aborted the whole run.
%% NOTE(review): fields are split on every comma, so a module or function
%% name containing a comma would be dropped as malformed — confirm the
%% generating script cannot produce such names.
parse_rows(Lines) ->
    lists:filtermap(fun parse_row/1, Lines).

%% Parse one line: {true, Row} | false.
parse_row(Line) ->
    case string:trim(Line) of
        "" ->
            false;
        Trimmed ->
            case string:split(Trimmed, ",", all) of
                [ModStr, FunStr, ArityStr, CallsStr, RCStr] ->
                    try
                        {true, {ModStr, FunStr,
                                list_to_integer(ArityStr),
                                list_to_integer(CallsStr),
                                list_to_integer(RCStr)}}
                    catch
                        error:badarg -> false
                    end;
                _ ->
                    false
            end
    end.
%% True when ModStr names a module available in the running OTP
%% installation: preloaded, cover-compiled, or resolved to a .beam path
%% under code:root_dir(). Uses list_to_existing_atom/1 so arbitrary CSV
%% strings never grow the atom table; badarg there means the module was
%% never referenced by this VM, hence not an OTP module.
is_otp_module(ModStr) ->
    try list_to_existing_atom(ModStr) of
        Mod ->
            case code:which(Mod) of
                non_existing -> false;
                preloaded -> true;
                cover_compiled -> true;
                BeamPath when is_list(BeamPath) ->
                    %% Must live under the OTP installation root.
                    lists:prefix(code:root_dir(), BeamPath)
            end
    catch
        error:badarg ->
            false
    end.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment