#!/usr/bin/env python3
"""
whitelist-builder: Generate a file whitelist for Hound from an arbitrary codebase.

Heuristic preselection + optional LLM re-ranking. Supports mixed codebases (C/Go/Rust/Web/etc.).

Usage:
    python whitelist_builder.py \
        --input /path/to/repo \
        --limit-loc 20000 \
        --output whitelist.txt \
        --enable-llm --model gpt-4o-mini

Notes:
- Source API keys first if using the LLM: `source API_KEYS.txt`
- Output is a comma-separated (default) or newline-separated list of relative
  file paths; see --output-format.
"""
from __future__ import annotations
import argparse
import json
import os
import re
import sys
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor, as_completed
import subprocess
import time
from pathlib import Path
from typing import Iterable, List, Tuple, Optional, Set

# --------- Heuristics ---------

BIN_EXT = {
    '.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg', '.ico',
    '.pdf', '.zip', '.gz', '.tar', '.tgz', '.xz', '.7z', '.rar',
    '.woff', '.woff2', '.ttf', '.otf', '.eot', '.mp3', '.mp4', '.mov', '.webm',
}

DEFAULT_EXCLUDE_DIRS = {
    '.git', '.hg', '.svn', '.idea', '.vscode',
    'node_modules', 'vendor', 'dist', 'build', 'target', 'out', '.next', '.cache',
    '__pycache__', '.venv', 'venv', '.tox', '.mypy_cache', 'coverage', 'site-packages',
}

DEFAULT_EXCLUDE_GLOBS = [
    '**/*~', '**/*.tmp', '**/*.bak', '**/*.log', '**/*.min.*', '**/*.lock',
]

TEST_HINTS = re.compile(r"(^|/)(tests?|__tests__|testdata|spec|mocks?|fixtures?)(/|$)", re.I)
TEST_FILE_HINTS = re.compile(r"(test|spec|mock|fixture)\b", re.I)

LANG_WEIGHTS = {
    # Core languages
    '.c': 1.0, '.h': 0.9, '.cpp': 1.0, '.hpp': 0.9, '.cc': 1.0,
    '.go': 1.2, '.rs': 1.2, '.sol': 1.3,
    '.ts': 1.0, '.tsx': 1.0, '.js': 0.9, '.jsx': 0.9,
    '.py': 1.0, '.java': 0.9, '.cs': 0.8,
    # Config-ish (keep but with lower weight)
    '.toml': 0.5, '.json': 0.4, '.yml': 0.5, '.yaml': 0.5, '.ini': 0.4, '.env': 0.4,
    '.md': 0.1,
}

PATH_BOOSTS = [
    (re.compile(r"(^|/)(src|lib|pkg|internal|core|server|api|cmd)(/|$)", re.I), 0.6),
    (re.compile(r"(^|/)(contracts?)(/|$)", re.I), 0.8),
    (re.compile(r"(^|/)(app|router|handler|controller|service|models?)(/|$)", re.I), 0.4),
]

PATH_PENALTIES = [
    (TEST_HINTS, -1.5),
    (re.compile(r"(^|/)(examples?|samples?|demos?|docs?)(/|$)", re.I), -0.6),
    (re.compile(r"(^|/)(scripts?|ci|\.github|doc)(/|$)", re.I), -0.3),
]

CONFIG_TOP = {
    'Dockerfile', 'docker-compose.yml', 'Makefile',
    'Cargo.toml', 'Cargo.lock', 'go.mod', 'go.sum',
    'package.json', 'pnpm-lock.yaml', 'yarn.lock',
}

ENTRYPOINT_HINTS = [
    re.compile(r"(^|/)cmd/[^/]+/main\.go$", re.I),
    re.compile(r"(^|/)main\.go$", re.I),
    re.compile(r"(^|/)src/main\.rs$", re.I),
    re.compile(r"(^|/)src/main\.(ts|tsx|js|jsx)$", re.I),
]


@dataclass
class FileInfo:
    path: Path
    rel: str
    ext: str
    loc: int
    score: float
    reasons: List[str]


def is_binary(path: Path) -> bool:
    if path.suffix.lower() in BIN_EXT:
        return True
    try:
        with open(path, 'rb') as f:
            chunk = f.read(1024)
            if b'\0' in chunk:
                return True
            # Heuristic: treat the file as binary if too few bytes are printable
            # ASCII or common whitespace (tab/newline/carriage return).
            text_chars = sum(1 for c in chunk if 32 <= c <= 126 or c in (9, 10, 13))
            return (text_chars / max(1, len(chunk))) < 0.80
    except Exception:
        return True


def count_loc(path: Path) -> int:
    try:
        with open(path, 'r', encoding='utf-8', errors='ignore') as f:
            return sum(1 for _ in f)
    except Exception:
        return 0


def score_file(root: Path, path: Path) -> Tuple[float, List[str]]:
    """Return (score, reasons) for a file."""
    reasons: List[str] = []
    rel = str(path.relative_to(root))
    ext = path.suffix.lower()
    score = 0.0
    # Language weight
    lw = LANG_WEIGHTS.get(ext, 0.2)
    score += lw
    reasons.append(f"lang:{ext}:{lw}")
    # Path boosts / penalties
    for rx, w in PATH_BOOSTS:
        if rx.search(rel):
            score += w
            reasons.append(f"boost:{w}")
    for rx, w in PATH_PENALTIES:
        if rx.search(rel):
            score += w
            reasons.append(f"penalty:{w}")
    # Tests or mock-like files get penalized extra
    base = path.name
    if TEST_FILE_HINTS.search(base):
        score -= 0.8
        reasons.append("testname:-0.8")
    # Entrypoints boost
    for rx in ENTRYPOINT_HINTS:
        if rx.search(rel):
            score += 0.7
            reasons.append("entry:+0.7")
            break
    # Config top-level files: moderate boost
    if base in CONFIG_TOP and path.parent == root:
        score += 0.5
        reasons.append("config:+0.5")
    return score, reasons
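
# Worked example of the scoring above (hypothetical paths, weights taken from the tables):
#   "src/api/handler.go"    -> 1.2 (lang .go) + 0.6 (src/api boost)                  = 1.8
#   "pkg/utils/foo_test.go" -> 1.2 (lang .go) + 0.6 (pkg boost) - 0.8 (testname hint) = 1.0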


def iter_candidates(root: Path, only_ext: Optional[Set[str]] = None) -> Iterable[Path]:
    for dirpath, dirnames, filenames in os.walk(root):
        # Prune excluded and hidden directories in place so os.walk never descends into them
        dirnames[:] = [
            d for d in dirnames
            if d not in DEFAULT_EXCLUDE_DIRS and not d.startswith('.')
        ]
        for name in filenames:
            # Exclude hidden files (but keep .env)
            if name.startswith('.') and name != '.env':
                continue
            p = Path(dirpath) / name
            rel = str(p.relative_to(root))
            # Exclude default globs
            if any(Path(rel).match(g) for g in DEFAULT_EXCLUDE_GLOBS):
                continue
            # Only certain extensions if requested
            if only_ext is not None and p.suffix.lower() not in only_ext:
                continue
            # Ignore obvious binaries
            if is_binary(p):
                continue
            yield p


def _fmt_eta(seconds: float) -> str:
    # Non-positive or NaN estimates (seconds != seconds) are shown as unknown
    if seconds <= 0 or seconds != seconds:
        return "?s"
    m, s = divmod(int(seconds), 60)
    if m:
        return f"{m}m{s:02d}s"
    return f"{s}s"


def build_candidates(root: Path, workers: int = 8, verbose: bool = False, progress: bool = True, only_ext: Optional[Set[str]] = None) -> List[FileInfo]:
    start = time.time()
    paths = list(iter_candidates(root, only_ext=only_ext))
    if verbose:
        print(f"[scan] Found {len(paths)} text-like files; counting LOC and scoring (workers={workers})…")
    items: List[FileInfo] = []
    with ThreadPoolExecutor(max_workers=max(2, workers)) as ex:
        futs = {ex.submit(count_loc, p): p for p in paths}
        total = len(futs)
        done = 0
        last_report = 0.0
        for fut in as_completed(futs):
            p = futs[fut]
            try:
                loc = fut.result()
            except Exception:
                loc = 0
            # Count every processed file so the progress/ETA math stays accurate,
            # even for files skipped below because they have no countable LOC.
            done += 1
            if loc > 0:
                score, reasons = score_file(root, p)
                items.append(FileInfo(
                    path=p,
                    rel=str(p.relative_to(root)),
                    ext=p.suffix.lower(),
                    loc=loc,
                    score=score,
                    reasons=reasons,
                ))
            if progress:
                now = time.time()
                if now - last_report > 0.5:
                    elapsed = now - start
                    # Use processed rate to estimate remaining
                    eta = (elapsed / max(1, done)) * max(0, total - done)
                    pct = int(done * 100 / max(1, total))
                    sys.stdout.write(f"\r[scan] {done}/{total} ({pct}%) counted — elapsed {int(elapsed)}s, eta {_fmt_eta(eta)} ")
                    sys.stdout.flush()
                    last_report = now
    if progress and total:
        # Clear progress line
        sys.stdout.write("\r" + " " * 80 + "\r")
        sys.stdout.flush()
    if verbose:
        print(f"[scan] Built {len(items)} candidates in {time.time()-start:.1f}s")
    return items


def heuristic_rank(items: List[FileInfo]) -> List[FileInfo]:
    # Higher score first; within the same score, prefer larger files, with the LOC
    # tie-breaker capped at 5000 so a single huge file does not dominate the ordering.
    return sorted(items, key=lambda x: (x.score, min(x.loc, 5000)), reverse=True)


def llm_rerank(items: List[FileInfo], model: str = 'gpt-4o-mini', max_items: int = 300, verbose: bool = False) -> List[str]:
    """Use OpenAI to rerank candidate files; returns list of relpaths in prioritized order."""
    try:
        from openai import OpenAI  # type: ignore
    except Exception as e:
        print(f"[LLM] OpenAI library not available: {e}")
        return [it.rel for it in items]
    client = OpenAI()
    # Prepare compact JSON describing candidates
    sample = [
        {
            'rel': it.rel,
            'loc': it.loc,
            'score': round(it.score, 3),
            'ext': it.ext,
            'reasons': it.reasons[:4],
        }
        for it in items[:max_items]
    ]
    system = (
        "You prioritize source files for a security audit whitelist. "
        "Goal: choose the most security-relevant files across the project: core logic, state, auth, interfaces, and important configs. "
        "Avoid tests/mocks/fixtures unless they contain unique logic. Prefer spread across critical components. Respond JSON only."
    )
    user = json.dumps({
        'candidates': sample,
        'instructions': (
            'Return JSON with a single field "prioritized", an array of relpaths, in priority order. '
            'Do not include any other fields.'
        )
    }, indent=2)
    try:
        t0 = time.time()
        resp = client.chat.completions.create(
            model=model,
            temperature=0.1,
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
        )
        if verbose:
            print(f"[llm] Rerank completed in {time.time()-t0:.1f}s, model={model}")
        txt = resp.choices[0].message.content or '{}'
        data = json.loads(txt)
        arr = data.get('prioritized') or []
        out = [p for p in arr if isinstance(p, str)]
        if out:
            return out
        return [it.rel for it in items]
    except Exception as e:
        print(f"[LLM] Rerank failed: {e}")
        return [it.rel for it in items]
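
# The model is instructed to answer with a single JSON object; an illustrative response
# (hypothetical paths) would be: {"prioritized": ["src/core/vault.rs", "src/lib.rs"]}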


def run_cloc(root: Path) -> tuple[int, dict[str, int]]:
    """Run `cloc` if available; returns (total, by_language). Raises on errors."""
    try:
        cp = subprocess.run(["cloc", "--json", "--quiet", str(root)], capture_output=True, text=True, check=True)
        data = json.loads(cp.stdout or '{}')
        total = int((data.get('SUM') or {}).get('code') or 0)
        by_lang: dict[str, int] = {}
        for k, v in data.items():
            if k in ('header', 'SUM') or not isinstance(v, dict):
                continue
            by_lang[str(k)] = int(v.get('code') or 0)
        return total, by_lang
    except Exception as e:
        raise RuntimeError(f"cloc failed: {e}")


def approx_repo_loc_by_ext(root: Path, workers: int = 8, only_ext: Optional[Set[str]] = None) -> tuple[int, dict[str, int]]:
    total = 0
    by_ext: dict[str, int] = {}
    paths = list(iter_candidates(root, only_ext=only_ext))
    with ThreadPoolExecutor(max_workers=max(2, workers)) as ex:
        futs = {ex.submit(count_loc, p): p for p in paths}
        for fut in as_completed(futs):
            p = futs[fut]
            try:
                loc = fut.result()
            except Exception:
                loc = 0
            if loc <= 0:
                continue
            total += loc
            by_ext[p.suffix.lower()] = by_ext.get(p.suffix.lower(), 0) + loc
    return total, by_ext


def compile_whitelist(prioritized_paths: List[str], by_rel: dict[str, FileInfo], limit_loc: int) -> Tuple[List[str], int]:
    selected: List[str] = []
    total = 0
    for rel in prioritized_paths:
        info = by_rel.get(rel)
        if not info:
            continue
        # Skip files that would overflow the budget (unless nothing has been selected yet)
        if total + info.loc > limit_loc and total > 0:
            continue
        selected.append(rel)
        total += info.loc
        if total >= limit_loc:
            break
    return selected, total
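
# Greedy budget fill, for illustration: with limit_loc=1000 and prioritized files of
# 600, 700 and 300 LOC, the 600-LOC file is kept, the 700-LOC file is skipped because
# it would overflow the budget, and the 300-LOC file still fits -> 900 LOC selected.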


def apply_overrides(cfg_path: str):
    try:
        with open(cfg_path, 'r', encoding='utf-8') as f:
            cfg = json.load(f)
    except Exception as e:
        print(f"[config] Failed to load overrides: {e}")
        return
    # Language weights
    try:
        lw = cfg.get('lang_weights') or {}
        for k, v in lw.items():
            LANG_WEIGHTS[str(k).lower()] = float(v)
    except Exception:
        pass

    # Path boosts / penalties
    def _compile_list(key: str, target_list: list):
        arr = cfg.get(key) or []
        for item in arr:
            try:
                rx = re.compile(str(item['pattern']), re.I)
                w = float(item['weight'])
                target_list.append((rx, w))
            except Exception:
                continue

    _compile_list('path_boosts', PATH_BOOSTS)
    _compile_list('path_penalties', PATH_PENALTIES)
    # Config top and entrypoints
    try:
        top = cfg.get('config_top') or []
        for name in top:
            CONFIG_TOP.add(str(name))
    except Exception:
        pass
    try:
        eps = cfg.get('entrypoint_hints') or []
        for pat in eps:
            ENTRYPOINT_HINTS.append(re.compile(str(pat), re.I))
    except Exception:
        pass
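
# Illustrative --config override file (keys match apply_overrides above; values are examples):
# {
#   "lang_weights": {".sol": 1.5, ".md": 0.0},
#   "path_boosts": [{"pattern": "(^|/)crypto(/|$)", "weight": 0.8}],
#   "path_penalties": [{"pattern": "(^|/)benchmarks?(/|$)", "weight": -0.5}],
#   "config_top": ["foundry.toml"],
#   "entrypoint_hints": ["(^|/)src/index\\.(ts|js)$"]
# }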


def main(argv: List[str]) -> int:
    ap = argparse.ArgumentParser(description='Build a Hound whitelist for a codebase.')
    ap.add_argument('--input', required=True, help='Path to codebase root')
    ap.add_argument('--limit-loc', type=int, default=20000, help='Total LOC budget for whitelist')
    ap.add_argument('--output', default='whitelist.txt', help='Output whitelist file')
    ap.add_argument('--enable-llm', action='store_true', help='Use LLM to rerank candidates')
    ap.add_argument('--model', default='gpt-4o-mini', help='LLM model to use if enabled')
    ap.add_argument('--max-llm-items', type=int, default=300, help='Max candidates to send to LLM')
    ap.add_argument('--print-summary', action='store_true', help='Print selection summary to stdout')
    ap.add_argument('--use-cloc', action='store_true', help='Use cloc for repo LOC totals if available')
    ap.add_argument('--config', help='JSON file to override weights/patterns')
    ap.add_argument('--verbose', action='store_true', help='Verbose progress output')
    ap.add_argument('--no-progress', action='store_true', help='Disable progress updates')
    ap.add_argument('--only-ext', help='Comma-separated list of extensions to include (e.g., "go,js,ts"). Case-insensitive, dot optional.')
    ap.add_argument('--output-format', choices=['csv', 'lines'], default='csv', help='Output format for whitelist (default: csv)')
    args = ap.parse_args(argv)

    root = Path(args.input).resolve()
    if not root.exists() or not root.is_dir():
        print(f"Input path not found or not a directory: {root}")
        return 2
    if args.config:
        apply_overrides(args.config)

    only_ext_set: Optional[Set[str]] = None
    if args.only_ext:
        parts = [s.strip().lower() for s in str(args.only_ext).split(',') if s.strip()]
        only_ext_set = {(p if p.startswith('.') else f'.{p}') for p in parts}
        if args.verbose:
            print(f"[filter] Restricting to extensions: {', '.join(sorted(only_ext_set))}")

    items = build_candidates(root, workers=os.cpu_count() or 8, verbose=args.verbose, progress=not args.no_progress, only_ext=only_ext_set)
    if not items:
        print("No candidate files found.")
        return 1

    ranked = heuristic_rank(items)
    by_rel = {it.rel: it for it in ranked}
    if args.enable_llm:
        if args.verbose:
            print(f"[llm] Reranking top {min(len(ranked), args.max_llm_items)} candidates with {args.model}… (1 prompt)")
        prioritized = llm_rerank(ranked, model=args.model, max_items=args.max_llm_items, verbose=args.verbose)
        # Keep only known paths, in LLM order, then append unseen ones from the heuristic ranking
        seen = set()
        pruned = []
        for rel in prioritized:
            if rel in by_rel and rel not in seen:
                pruned.append(rel)
                seen.add(rel)
        for it in ranked:
            if it.rel not in seen:
                pruned.append(it.rel)
                seen.add(it.rel)
        prioritized = pruned
    else:
        prioritized = [it.rel for it in ranked]

    selected, total = compile_whitelist(prioritized, by_rel, args.limit_loc)

    out_path = Path(args.output).resolve()
    if args.output_format == 'csv':
        out_text = ','.join(selected) + ('\n' if selected else '')
    else:
        out_text = '\n'.join(selected) + ('\n' if selected else '')
    out_path.write_text(out_text, encoding='utf-8')

    if args.print_summary:
        # Group by extension
        by_ext: dict[str, int] = {}
        for rel in selected:
            ext = Path(rel).suffix.lower()
            by_ext[ext] = by_ext.get(ext, 0) + 1
        print(f"Wrote {len(selected)} files ({total} LOC) to {out_path}")
        print("Selected files by extension (count):")
        for ext, cnt in sorted(by_ext.items(), key=lambda x: (-x[1], x[0])):
            print(f" {ext or '(no ext)'}: {cnt}")
        # Repo totals (cloc or approximate)
        repo_total = 0
        repo_by: dict[str, int] = {}
        if args.use_cloc:
            try:
                # cloc reports totals per language, not per extension, so repo_by stays
                # empty here and the per-extension coverage below falls back to "unknown".
                repo_total, _repo_by_lang = run_cloc(root)
                print(f"Repo total CLOC (cloc): {repo_total}")
            except Exception as e:
                print(f"[cloc] {e}; falling back to approximate totals")
                repo_total, repo_by = approx_repo_loc_by_ext(root, only_ext=None)
        else:
            repo_total, repo_by = approx_repo_loc_by_ext(root, only_ext=None)
            print(f"Repo total LOC (approx): {repo_total}")
        # Included LOC per extension
        included_by_ext: dict[str, int] = {}
        for rel in selected:
            ext = Path(rel).suffix.lower()
            included_by_ext[ext] = included_by_ext.get(ext, 0) + (by_rel[rel].loc if rel in by_rel else 0)
        print("Included LOC per extension (and coverage vs repo totals when available):")
        for ext, inc in sorted(included_by_ext.items(), key=lambda x: (-x[1], x[0])):
            repo_ext_total = repo_by.get(ext, 0)
            if repo_ext_total > 0:
                pct = int(inc * 100 / max(1, repo_ext_total))
                print(f" {ext or '(no ext)'}: {inc} / {repo_ext_total} LOC ({pct}%)")
            else:
                print(f" {ext or '(no ext)'}: {inc} LOC (repo total unknown)")
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))