Created
April 18, 2022 19:43
-
-
Save mjbommar/b73d996fa83f5f9f781cef88b1e33fb2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright: Licensio, LLC 2022
# License: AGPL-3.0
import argparse | |
import multiprocessing | |
import os | |
import subprocess | |
import pandas | |
def get_file_list(path: str, recursive: bool = False):
    """
    Yield the paths of the regular files found in a directory.

    :param path: path to the directory to get the files from
    :param recursive: if true, descend into every subdirectory (os.walk);
        otherwise only the immediate entries of ``path`` are considered
    :return: a generator with file paths, each joined onto the directory
        it was found in
    """
    # Use the function's own arguments rather than any module-level CLI
    # state, so the helper also works when imported and called directly.
    if recursive:
        for directory_path, _, file_list in os.walk(path):
            for file_name in file_list:
                file_path = os.path.join(directory_path, file_name)
                # os.walk reports every non-directory entry as a "file"
                # (broken symlinks, FIFOs, ...); keep regular files only.
                if os.path.isfile(file_path):
                    yield file_path
    else:
        for entry_name in os.listdir(path):
            file_path = os.path.join(path, entry_name)
            if os.path.isfile(file_path):
                yield file_path
def _parse_nm_line(line: str, file_path: str):
    """Parse one line of ``nm -A`` (BSD format) output; return a dict or None."""
    # With -A every line is prefixed by the file name followed by ":".
    # Stripping the known file path first keeps paths that contain ":"
    # intact. The generic split is only a fallback in case the printed
    # name differs from the one passed on the command line.
    prefix = file_path + ":"
    if line.startswith(prefix):
        path, record = file_path, line[len(prefix):]
    else:
        path, separator, record = line.partition(":")
        if not separator:
            return None

    # For archives the member name comes next ("lib.a:member.o:<symbol>").
    # A symbol record always contains whitespace before the symbol name,
    # so a whitespace-free chunk in front of the next ":" is taken to be a
    # member name and not part of a symbol such as "ns::func".
    member = None
    head, separator, tail = record.partition(":")
    if separator and head.split() == [head]:
        member, record = head, tail

    record = record.rstrip()
    if record[:1].isspace():
        # The value column is blank (e.g. undefined symbols): "<type> <name>".
        fields = record.split(None, 1)
    else:
        # "<value> <type> <name>": the value column is not recorded.
        fields = record.split(None, 2)[1:]
    if not fields:
        return None

    return {
        "path": path,
        "object": member,
        "symbol_type": fields[0],
        # Keep the whole remainder so names containing spaces survive.
        "symbol_name": fields[1] if len(fields) > 1 else "",
    }


def run_nm(file_path: str):
    """
    Run the nm command on a file.

    nm is invoked with ``-a -A --special-syms --synthetic`` and its standard
    output is parsed line by line. Diagnostics on stderr (for example for
    files that are not object files) and the exit status are ignored, so
    such files simply produce an empty list.

    :param file_path: path to the file to run the command on.
    :return: a list of dict entries containing the symbol info, with the
        keys ``path``, ``object`` (archive member name or None),
        ``symbol_type`` and ``symbol_name``; lines that cannot be parsed
        are skipped
    :raises OSError: if the ``nm`` executable cannot be started
    """
    # run the command; stderr is discarded because it is never inspected
    completed = subprocess.run(
        ["nm", "-a", "-A", "--special-syms", "--synthetic", file_path],
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        check=False,
    )

    # Symbol names are arbitrary bytes; never let one bad byte abort the file.
    stdout_buffer = completed.stdout.decode("utf-8", errors="replace")

    symbol_data = []
    for line in stdout_buffer.splitlines():
        entry = _parse_nm_line(line, file_path)
        if entry is not None:
            symbol_data.append(entry)
    return symbol_data
if __name__ == "__main__":
    # Command-line entry point: collect files under a path, run nm on each of
    # them in parallel, write all symbols to one CSV/JSON file and print a
    # short summary.

    # parse CLI arguments
    parser = argparse.ArgumentParser(description="List local symbols in binaries")
    # nargs="?" makes the positional optional; without it argparse always
    # requires the argument and the "." default can never apply.
    parser.add_argument(
        "path", help="Path to begin search", type=str, nargs="?", default="."
    )
    parser.add_argument(
        "--recursive",
        help="Whether to search recursively",
        action="store_true",
        default=False,
    )
    parser.add_argument("--output-path", help="Path to output CSV file", type=str)
    # choices= rejects an unsupported format up front instead of after every
    # file has already been processed (or not at all).
    parser.add_argument(
        "--output-format",
        help="Output format: {csv, json}",
        type=str,
        choices=("csv", "json"),
        default="csv",
    )
    args = parser.parse_args()

    # setup and execute a pool of nm processes; the context manager releases
    # the worker processes once the (blocking) map call has returned
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        pool_results = pool.map(run_nm, get_file_list(args.path, args.recursive))

    # reduce map results into single list
    symbol_data = []
    for file_result in pool_results:
        symbol_data.extend(file_result)

    # store results; without --output-path fall back to symbols.<format>
    symbol_df = pandas.DataFrame(symbol_data)
    output_path = args.output_path or f"symbols.{args.output_format}"
    if args.output_format == "csv":
        symbol_df.to_csv(output_path, encoding="utf-8")
    else:
        symbol_df.to_json(output_path)

    # print summary (head() defaults to the top 5 rows)
    print(f"identified {symbol_df.shape[0]} symbols")
    if symbol_df.shape[0] > 0:
        print("top 5 files by symbol count:")
        print(symbol_df["path"].value_counts().head())
        print("top 5 symbols by count:")
        print(symbol_df["symbol_name"].value_counts().head())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment