Last active
April 17, 2025 08:35
-
-
Save jlmalone/1858084a3a15b98e3758355e0a38bc28 to your computer and use it in GitHub Desktop.
Script to summarise codebases, making an effort to strip passwords and keys, so that an entire codebase (or a large section of one) can be concatenated into a single file, particularly for upload to LLMs, without having to deal with multiple files.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# ---------------------------------------------------------------------------------
# summarise.sh
# A bash script that recursively crawls a directory, concatenates text-based code
# files into a single output file, and redacts values assigned to names ending in
# key, password, token or secret.
#
# Usage:
#   ./summarise.sh <target_directory> <output_file> [custom_ignores_file]
#
# Example:
#   ./summarise.sh ./my_project output.txt custom_ignores.txt
#
# MIT License | |
# Copyright (c) 2025 jlmalone | |
# | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software. | |
# | |
# Permission is hereby granted to do so, subject to the following conditions: | |
# | |
# The above copyright notice and this permission notice shall be included in all | |
# copies or substantial portions of the Software. | |
# | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
# SOFTWARE. | |
# --------------------------------------------------------------------------------- | |
# --- Configuration ---
# Strict mode: abort on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -euo pipefail

# Non-zero enables the progress messages that log() writes to stderr.
VERBOSE=1

# Probe used by skip_file(); the string is split into words at call time and the
# candidate path is appended. grep -I treats binary files as non-matching, so the
# probe succeeds only for files grep considers text with at least one character.
BIN_CHECK="grep -Iq ."

# Built-in ignore patterns, one per line (same syntax as the custom ignore file).
DEFAULT_EXTRA='.git/
*.log'
#######################################
# Print a progress message to stderr when VERBOSE is non-zero.
# Globals:   VERBOSE (read)
# Arguments: $* - message words, joined with single spaces
# Outputs:   the message on stderr (nothing when VERBOSE is 0)
# Returns:   always 0, so a silenced message can never abort a caller that
#            runs under `set -e`
#######################################
log() {
  if (( VERBOSE )); then
    printf '%s\n' "$*" >&2
  fi
  return 0
}
#######################################
# Print the command synopsis and terminate the script.
# Arguments: none
# Outputs:   one usage line on stderr
# Returns:   does not return; exits with status 1
#######################################
usage() {
  printf '%s\n' "Usage: $0 <dir> <out> [custom_ignore_file]" >&2
  exit 1
}
# Both the target directory and the output file are mandatory.
if (( $# < 2 )); then
  usage
fi

# Validate the target BEFORE resolving it: a failing `cd` inside the command
# substitution would abort the script under `set -e` without any explanation.
if [[ ! -d "$1" ]]; then
  echo "ERROR: '$1' not found." >&2
  exit 1
fi
ROOT="$(cd -- "$1" && pwd -P)"

# The output file itself may not exist yet, but its parent directory must.
out_parent="$(dirname -- "$2")"
if [[ ! -d "$out_parent" ]]; then
  echo "ERROR: output directory '$out_parent' not found." >&2
  exit 1
fi
OUT="$(cd -- "$out_parent" && pwd -P)/$(basename -- "$2")"

EXTRA="${3:-}" # Optional path to a file with custom ignore patterns

# Ensure output file exists and is empty
: > "$OUT"
########################################################################
# 1. Build EXTRA_PATTERNS array from custom_ignores + defaults
########################################################################
EXTRA_PATTERNS=()

#######################################
# Append the ignore patterns found in a file to EXTRA_PATTERNS.
# Everything from the first '#' on a line is treated as a comment; leading and
# trailing whitespace (including a trailing CR) is trimmed; lines that end up
# empty are ignored.
# Globals:   EXTRA_PATTERNS (appended to)
# Arguments: $1 - path of the pattern file
#            $2 - label used in the "not found" message; when empty, a
#                 missing file is skipped silently
# Outputs:   progress messages via log()
# Returns:   0, also when the file does not exist
#######################################
read_patterns() {
  local src="$1" source_name="${2:-}"
  local line

  if [[ ! -f "$src" ]]; then
    if [[ -n "$source_name" ]]; then
      log "Ignore source '$source_name' not found at '$src'"
    fi
    return 0
  fi

  log "Reading ignore patterns from '$src'"
  # `|| [[ -n "$line" ]]` keeps a final line that lacks a trailing newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # Strip inline comments (anything after the first '#').
    line="${line%%#*}"
    # Trim leading, then trailing whitespace; [[:space:]] also covers CR.
    line="${line#"${line%%[![:space:]]*}"}"
    line="${line%"${line##*[![:space:]]}"}"
    if [[ -z "$line" ]]; then
      continue
    fi
    log "Adding pattern: [$line]" # Debug log
    EXTRA_PATTERNS+=("$line")
  done < "$src"
  return 0
}
# Read default patterns first. read_patterns only accepts regular files, so the
# built-in list is staged in a temporary file; the trap removes it even if the
# script aborts while reading.
TMP_DEFAULT="$(mktemp)" || { echo "ERROR: mktemp failed." >&2; exit 1; }
trap 'rm -f -- "$TMP_DEFAULT"' EXIT
printf '%s\n' "$DEFAULT_EXTRA" > "$TMP_DEFAULT"
read_patterns "$TMP_DEFAULT" "Defaults"
# Clean up temporary file
rm -f -- "$TMP_DEFAULT"
trap - EXIT

# Read custom patterns if specified
if [[ -n "$EXTRA" ]]; then
  read_patterns "$EXTRA" "Custom file"
fi

log "Compiled ${#EXTRA_PATTERNS[@]} extra ignore patterns."
#######################################
# Test a path against every pattern in EXTRA_PATTERNS.
# Three pattern shapes are recognised:
#   /pat   anchored  - glob-matched against "/<rel>"; also matches everything
#                      below it, and a trailing slash is accepted ("/dist/")
#   pat/   directory - matches when any component of the path (the final
#                      component included) glob-matches pat
#   pat    general   - glob-matched against the basename only
# Globals:   EXTRA_PATTERNS (read)
# Arguments: $1 - path relative to ROOT, without a leading slash
# Returns:   0 if some pattern matches, 1 otherwise
#######################################
matches_extra_ignore() {
  local rel="$1" # The relative path of the file from ROOT
  local p anchored
  # Slashes on both sides let a directory pattern match at any depth.
  local rel_padded="/$rel/"

  # Expanding an empty array trips `set -u` on bash older than 4.4.
  (( ${#EXTRA_PATTERNS[@]} )) || return 1

  for p in "${EXTRA_PATTERNS[@]}"; do
    if [[ "$p" == /* ]]; then
      # Anchored pattern. Drop one trailing slash so "/dist/" behaves like
      # "/dist"; a bare "/" becomes empty and is ignored rather than matching
      # every path.
      anchored="${p%/}"
      if [[ -z "$anchored" ]]; then
        continue
      fi
      # Right-hand sides are intentionally unquoted so they act as globs.
      if [[ "/$rel" == $anchored || "/$rel" == $anchored/* ]]; then
        return 0
      fi
    elif [[ "$p" == */ ]]; then
      # Directory pattern; $p already ends in '/', giving */<pat>/*.
      if [[ "$rel_padded" == */$p* ]]; then
        return 0
      fi
    else
      # General glob: compared with the basename, so "*.tmp" matches anywhere.
      if [[ "${rel##*/}" == $p ]]; then
        return 0
      fi
    fi
  done
  return 1
}
#######################################
# Decide whether a candidate file must be left out of the bundle.
# Checks run in this order: hard-coded .gitignore rule, EXTRA_PATTERNS,
# empty file, binary probe.
# Globals:   BIN_CHECK (read)
# Arguments: $1 - path used to inspect the file on disk
#            $2 - path relative to ROOT (used for matching and in log lines)
# Outputs:   one "skip ..." line via log() when the file is rejected
# Returns:   0 if the file should be skipped, 1 if it should be included
#######################################
skip_file() {
  local f="$1" rel="$2"
  local filename="${rel##*/}" # basename: the part after the last '/'
  local -a bin_check_cmd

  # Always skip files named .gitignore, regardless of other patterns.
  if [[ "$filename" == ".gitignore" ]]; then
    log "skip hardcoded: $rel (.gitignore)"
    return 0
  fi

  # Apply custom/default ignore patterns defined in EXTRA_PATTERNS.
  if matches_extra_ignore "$rel"; then
    log "skip extra : $rel"
    return 0
  fi

  # Empty files are tested before the binary probe: the probe also fails on a
  # zero-byte file, which would otherwise be reported as binary.
  if [[ ! -s "$f" ]]; then
    log "skip empty : $rel"
    return 0
  fi

  # BIN_CHECK holds a command line as one string; split it into words
  # explicitly instead of relying on unquoted expansion (which also globs).
  read -r -a bin_check_cmd <<< "$BIN_CHECK"
  if ! "${bin_check_cmd[@]}" "$f"; then
    log "skip binary: $rel"
    return 0
  fi

  # None of the skip conditions applied.
  return 1
}
#######################################
# Append one file to the bundle, preceded by a separator header, with NUL
# bytes removed and likely secrets replaced by REDACTED.
# Redaction is a best-effort, case-insensitive text substitution on values
# that follow a name ending in key, password, token or secret.
# Globals:   OUT (appended to)
# Arguments: $1 - path of the file to read
#            $2 - path relative to ROOT, shown in the separator header
# Returns:   status of the tr | sed pipeline
#######################################
append_file() {
  local f="$1" rel="$2"
  {
    # Clear separator between files.
    printf '\n\n=== FILE: %s ===\n\n' "$rel"
    # Substitution order matters:
    #   1. "name": "value" pairs whose quoted name ENDS in a sensitive word,
    #      so "api_key" is covered just like a bare "key".
    #   2./3. name = "value" / name: 'value' with a quoted value: the whole
    #      quoted string is collapsed first, so a secret containing spaces is
    #      not cut at the first blank by the next two rules.
    #   4./5. name=value and name: value, up to the next whitespace.
    tr -d '\000' < "$f" \
      | sed -E \
        -e 's/("[^"]*(key|password|token|secret)"[[:space:]]*:[[:space:]]*")[^"]+"/\1REDACTED"/gi' \
        -e 's/((key|password|token|secret)[[:space:]]*[=:][[:space:]]*)"[^"]*"/\1"REDACTED"/gi' \
        -e "s/((key|password|token|secret)[[:space:]]*[=:][[:space:]]*)'[^']*'/\\1'REDACTED'/gi" \
        -e 's/((key|password|token|secret)[[:space:]]*=[[:space:]]*)[^[:space:]]+/\1REDACTED/gi' \
        -e 's/((key|password|token|secret)[[:space:]]*:[[:space:]]*)[^[:space:]]+/\1REDACTED/gi'
  } >> "$OUT"
}
########################################################################
# 2. Use Git to list every candidate file OR fallback to find
########################################################################

#######################################
# Append one candidate to the bundle unless it must be left out.
# Globals:   OUT (read)
# Arguments: $1 - path of the candidate on disk
#            $2 - path relative to ROOT
#            $3 - name of the lister ("git" or "find"), used in the log line
# Returns:   0 when the file was skipped, else the status of append_file
#######################################
process_candidate() {
  local f="$1" rel="$2" origin="$3"

  # An output file located inside ROOT would otherwise be read while it is
  # being appended to, feeding the bundle back into itself.
  if [[ "$f" -ef "$OUT" ]]; then
    log "skip output file: $rel"
    return 0
  fi
  if skip_file "$f" "$rel"; then
    return 0
  fi
  log "add $origin: $rel"
  append_file "$f" "$rel"
}

if [[ -d "$ROOT/.git" ]] && command -v git >/dev/null; then
  log "Git repo detected, using 'git ls-files'..."
  # -c cached + -o others, minus everything excluded by the standard ignore
  # rules; -z keeps unusual file names intact.
  git -C "$ROOT" ls-files -co --exclude-standard -z \
    | while IFS= read -r -d '' rel; do
        f="$ROOT/$rel"
        # Listed paths may be deleted on disk or not be regular files.
        if [[ -f "$f" && -r "$f" ]]; then
          process_candidate "$f" "$rel" "git"
        else
          log "skip non-file/unreadable: $rel"
        fi
      done
else
  log "No .git directory or git command not found — using 'find'..."
  find "$ROOT" -type f -print0 \
    | while IFS= read -r -d '' f; do
        # Quote ROOT inside the expansion so glob characters in the directory
        # name are taken literally when stripping the prefix.
        rel="${f#"$ROOT"/}"
        if [[ -r "$f" ]]; then
          process_candidate "$f" "$rel" "find"
        else
          log "skip unreadable: $rel"
        fi
      done
fi
# Final summary message
bytes_written="$(wc -c < "$OUT")"
# Some wc implementations pad the count with blanks; strip them so the
# message reads cleanly.
bytes_written="${bytes_written//[[:space:]]/}"
log "DONE → ${bytes_written} bytes written to '$OUT'."
# Preview the first four lines of the bundle on stderr.
head -n 4 "$OUT" >&2
/Users/{username}/.local/bin/summarise.sh
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
https://grok.com/chat/38df1a3f-052a-4327-8ac5-ea8d6c6331bc?referrer=website