Skip to content

Instantly share code, notes, and snippets.

@jlmalone
Last active April 17, 2025 08:35
Show Gist options
  • Save jlmalone/1858084a3a15b98e3758355e0a38bc28 to your computer and use it in GitHub Desktop.
Save jlmalone/1858084a3a15b98e3758355e0a38bc28 to your computer and use it in GitHub Desktop.
Script to summarise codebases: it concatenates an entire codebase (or a large section of one) into a single file, making an effort to strip passwords and keys, so the result can be uploaded to an LLM without dealing with multiple files.
#!/usr/bin/env bash
# ---------------------------------------------------------------------------------
# summarise.sh
# A bash script that recursively crawls a directory, concatenates text-based code
# files into a single output file, and redacts sensitive info within quoted strings.
#
# Usage:
# ./summarise.sh <target_directory> <output_file> [custom_ignores_file]
#
# Example:
# ./summarise.sh ./my_project output.txt custom_ignores.txt
#
# MIT License
# Copyright (c) 2025 jlmalone
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# ---------------------------------------------------------------------------------
# --- Configuration ---
#!/usr/bin/env bash
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Non-zero enables the progress messages written by log().
VERBOSE=1

# Command line run against each candidate file by skip_file(); a non-zero
# exit status makes the file count as binary.
BIN_CHECK='grep -Iq .'

# Built-in ignore patterns, one per line.
DEFAULT_EXTRA=$'.git/\n*.log'
#######################################
# Write a progress message to stderr when verbose mode is on.
# Globals:   VERBOSE (read) - non-zero enables output
# Arguments: message words, joined with spaces
# Outputs:   the message on stderr
# Returns:   always 0, so a silenced message can never trip `set -e`
#######################################
log() {
  if (( VERBOSE )); then
    printf '%s\n' "$*" >&2
  fi
  return 0
}
#######################################
# Print the command-line synopsis to stderr and terminate the script.
# Outputs: one usage line on stderr
# Returns: never returns; exits with status 1
#######################################
usage() {
  printf '%s\n' "Usage: $0 <dir> <out> [custom_ignore_file]" >&2
  exit 1
}
# --- Argument handling ---
# Two positional arguments are mandatory; the third is optional.
(( $# >= 2 )) || usage

# Validate the target directory BEFORE resolving it: under `set -e` a failing
# `cd` inside the command substitution would abort the script before any
# friendly error message could be printed.
[[ -d "$1" ]] || { echo "ERROR: '$1' not found." >&2; exit 1; }

# Absolute, symlink-free path of the directory to crawl. CDPATH is blanked so
# `cd` cannot echo a directory name into the captured output, and `--` keeps a
# leading '-' in the argument from being parsed as an option.
ROOT="$(CDPATH='' cd -- "$1" && pwd -P)"

# The output file itself may not exist yet, but its parent directory must.
OUT_DIR="$(dirname -- "$2")"
[[ -d "$OUT_DIR" ]] || { echo "ERROR: output directory '$OUT_DIR' not found." >&2; exit 1; }
OUT="$(CDPATH='' cd -- "$OUT_DIR" && pwd -P)/$(basename -- "$2")"

EXTRA="${3:-}" # Optional path to a file with custom ignore patterns

# Ensure output file exists and is empty
: > "$OUT"
########################################################################
# 1. Build EXTRA_PATTERNS array from custom_ignores + defaults
########################################################################
# Accumulated ignore patterns (defaults first, then custom ones).
EXTRA_PATTERNS=()

#######################################
# Append the ignore patterns found in a file to EXTRA_PATTERNS.
# Everything from the first '#' on a line is treated as a comment; leading
# and trailing whitespace (including a trailing CR) is trimmed; lines that
# end up empty are ignored. A final line without a newline is still read.
# Globals:   EXTRA_PATTERNS (appended to)
# Arguments: $1 - path of the pattern file
#            $2 - label used in the "not found" message; empty = stay silent
# Outputs:   progress messages via log
# Returns:   0, also when the file does not exist
#######################################
read_patterns() {
  local src="$1" source_name="$2"
  local line

  if [[ ! -f "$src" ]]; then
    if [[ -n "$source_name" ]]; then
      log "Ignore source '$source_name' not found at '$src'"
    fi
    return 0
  fi

  log "Reading ignore patterns from '$src'"
  # `|| [[ -n "$line" ]]` keeps a last line that lacks a terminating newline.
  while IFS= read -r line || [[ -n "$line" ]]; do
    # 1. Strip inline comments (anything after '#')
    line="${line%%#*}"
    # 2. Trim trailing whitespace; [[:space:]] also covers a trailing CR
    line="${line%"${line##*[![:space:]]}"}"
    # 3. Trim leading whitespace
    line="${line#"${line%%[![:space:]]*}"}"
    # 4. Nothing left: blank line or comment-only line
    [[ -n "$line" ]] || continue
    log "Adding pattern: [$line]" # Debug log
    EXTRA_PATTERNS+=("$line")
  done < "$src"
  return 0
}
# Stage the built-in defaults in a temporary file so they go through the same
# parser as a user-supplied ignore file.
TMP_DEFAULT=$(mktemp)
printf '%s\n' "$DEFAULT_EXTRA" > "$TMP_DEFAULT"
read_patterns "$TMP_DEFAULT" "Defaults"

# A custom ignore file is optional; load it only when one was given.
if [[ -n "$EXTRA" ]]; then
  read_patterns "$EXTRA" "Custom file"
fi

# The staging file has served its purpose.
rm -f "$TMP_DEFAULT"

log "Compiled ${#EXTRA_PATTERNS[@]} extra ignore patterns."
#######################################
# Test a relative file path against the patterns in EXTRA_PATTERNS, using a
# simplified gitignore-like interpretation:
#   /pat    anchored: glob against "/<rel>", or any file beneath such a path
#   /pat/   anchored directory: any file beneath "/pat"
#   name/   directory component: literal "/name/" anywhere in the path
#   a/b     contains a slash: glob against <rel>, or any file beneath it
#   pat     anything else: glob against the basename only
# Globals:   EXTRA_PATTERNS (read)
# Arguments: $1 - file path relative to ROOT
# Returns:   0 if some pattern matches, 1 otherwise
#######################################
matches_extra_ignore() {
  local rel="$1" # The relative path of the file from ROOT
  local p anchored
  # Slashes on both sides let a directory pattern match whole components only
  local rel_padded="/$rel/"

  for p in "${EXTRA_PATTERNS[@]}"; do
    # NOTE: $p is deliberately unquoted on the right-hand side of == below so
    # that it is interpreted as a glob pattern.
    case "$p" in
      /)
        # A bare slash names no file; never matches.
        continue
        ;;
      /*/)
        # Anchored directory: a file path never ends in '/', so compare the
        # directory prefix instead of the whole string.
        anchored="${p%/}"
        if [[ "/$rel" == ${anchored}/* ]]; then
          return 0
        fi
        ;;
      /*)
        # Anchored pattern: the path itself, or anything below it.
        if [[ "/$rel" == $p || "/$rel" == ${p}/* ]]; then
          return 0
        fi
        ;;
      */)
        # Directory pattern: literal component match anywhere in the path.
        if [[ "$rel_padded" == *"/${p}"* ]]; then
          return 0
        fi
        ;;
      */*)
        # Interior slash: the basename can never contain '/', so match the
        # whole relative path (or anything beneath it) instead.
        if [[ "$rel" == $p || "$rel" == ${p}/* ]]; then
          return 0
        fi
        ;;
      *)
        # Simple pattern such as "file.log" or "*.tmp": match the basename,
        # i.e. at any depth.
        if [[ "${rel##*/}" == $p ]]; then
          return 0
        fi
        ;;
    esac
  done
  return 1
}
#######################################
# Decide whether a candidate file must be left out of the summary.
# Checks run in this order: hardcoded .gitignore rule, the output file
# itself, ignore patterns, empty file, binary file.
# Globals:   OUT (read), BIN_CHECK (read)
# Arguments: $1 - path used to open the file
#            $2 - path relative to ROOT (used for matching and logging)
# Outputs:   the reason for a skip, via log
# Returns:   0 if the file should be skipped, 1 if it should be included
#######################################
skip_file() {
  local f="$1" rel="$2"
  local filename="${rel##*/}" # Basename: the part after the last '/'

  # Always skip files named .gitignore, regardless of other patterns
  if [[ "$filename" == ".gitignore" ]]; then
    log "skip hardcoded: $rel (.gitignore)"
    return 0
  fi

  # Never feed the output file back into itself: it is being appended to
  # while the tree is scanned. -ef compares the underlying file, so a
  # differently spelled path to the same file is caught too.
  if [[ -n "${OUT:-}" && "$f" -ef "$OUT" ]]; then
    log "skip output: $rel"
    return 0
  fi

  # Apply custom/default ignore patterns defined in EXTRA_PATTERNS
  if matches_extra_ignore "$rel"; then
    log "skip extra : $rel"
    return 0
  fi

  # Empty files are tested before the binary check; with the check the other
  # way round an empty file has no line to match and is reported as "binary".
  if [[ ! -s "$f" ]]; then
    log "skip empty : $rel"
    return 0
  fi

  # BIN_CHECK holds a whole command line and must be word-split on purpose.
  # shellcheck disable=SC2086
  if ! $BIN_CHECK "$f"; then
    log "skip binary: $rel"
    return 0
  fi

  # None of the skip conditions applied
  return 1
}
#######################################
# Append one file to the output, preceded by a separator header, with NUL
# bytes removed and likely secrets replaced by REDACTED.
# Redaction is best-effort and case-insensitive, keyed on the words
# key / password / token / secret:
#   1. JSON-style  "<...keyword...>": "value"  -> value becomes "REDACTED"
#   2. keyword = value                         -> value becomes REDACTED
#   3. keyword : value                         -> value becomes REDACTED
# In rules 2 and 3 a value is a complete double- or single-quoted string
# (which may contain spaces) or, failing that, a run of non-space characters.
# Globals:   OUT (appended to)
# Arguments: $1 - path of the file to read
#            $2 - relative path shown in the separator header
#######################################
append_file() {
  local f="$1" rel="$2"
  # Keyword alternation shared by all three rules.
  local kw='(key|password|token|secret)'
  # Quoted alternatives come first so a quoted value containing spaces is
  # consumed as a whole rather than only up to the first space.
  local val="(\"[^\"]*\"|'[^']*'|[^[:space:]]+)"
  {
    # Add a clear separator
    printf '\n\n=== FILE: %s ===\n\n' "$rel"
    # The sed scripts are double-quoted so the shell can splice in $kw/$val;
    # "\1" is left untouched by the shell and reaches sed as a backreference.
    tr -d '\000' < "$f" \
      | sed -E \
        -e "s/(\"[^\"]*${kw}[^\"]*\"[[:space:]]*:[[:space:]]*\")[^\"]+\"/\1REDACTED\"/gi" \
        -e "s/(${kw}[[:space:]]*=[[:space:]]*)${val}/\1REDACTED/gi" \
        -e "s/(${kw}[[:space:]]*:[[:space:]]*)${val}/\1REDACTED/gi"
  } >> "$OUT"
}
########################################################################
# 2. Use Git to list every candidate file OR fallback to find
########################################################################
# Enumerate candidate files and append every one that skip_file lets through.
# Inside a Git checkout, `git ls-files -co --exclude-standard` lists tracked
# (-c) and untracked (-o) files while honouring the standard ignore rules;
# otherwise every regular file below ROOT is visited with `find`. Both
# listings are NUL-delimited so unusual file names survive intact.
if [[ -d "$ROOT/.git" ]] && command -v git >/dev/null; then
  log "Git repo detected, using 'git ls-files'..."
  git -C "$ROOT" ls-files -co --exclude-standard -z \
    | while IFS= read -r -d '' rel; do
        f="$ROOT/$rel"
        # The listing can name entries that are not readable regular files
        if [[ -f "$f" && -r "$f" ]]; then
          if ! skip_file "$f" "$rel"; then
            log "add git: $rel"
            append_file "$f" "$rel"
          fi
        else
          log "skip non-file/unreadable: $rel"
        fi
      done
else
  log "No .git directory or git command not found — using 'find'..."
  find "$ROOT" -type f -print0 \
    | while IFS= read -r -d '' f; do
        # $ROOT is quoted inside the expansion so that glob characters in the
        # directory name are stripped literally instead of acting as a pattern.
        rel="${f#"$ROOT"/}"
        # Check if file is readable
        if [[ -r "$f" ]]; then
          if ! skip_file "$f" "$rel"; then
            log "add find: $rel"
            append_file "$f" "$rel"
          fi
        else
          log "skip unreadable: $rel"
        fi
      done
fi

# Final summary message, followed by the first lines of the result on stderr
log "DONE → $(wc -c <"$OUT") bytes written to '$OUT'."
head -n 4 "$OUT" >&2
@jlmalone
Copy link
Author

/Users/{username}/.local/bin/summarise.sh

@jlmalone
Copy link
Author

@jlmalone
Copy link
Author

@jlmalone
Copy link
Author

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment