Skip to content

Instantly share code, notes, and snippets.

@pmarreck
Last active July 3, 2025 17:52
Show Gist options
  • Save pmarreck/19503053d5e91e8fff91c6e47204cbb2 to your computer and use it in GitHub Desktop.
Save pmarreck/19503053d5e91e8fff91c6e47204cbb2 to your computer and use it in GitHub Desktop.
A way to search all forks of a GitHub project to see whether any of them already contain a certain file, such as a flake.nix (the default).
#!/usr/bin/env bash
# find_github_forks_with_file
# Script-wide defaults. FILE_TO_SEARCH and REPO are overwritten by the
# command-line parser further down when the user supplies them.
SCRIPT_NAME="$(basename "$0")" # program name shown in the usage text
MAX_JOBS=10                    # worker count handed to xargs -P
FILE_TO_SEARCH='flake.nix'     # file looked up in every fork
REPO=''                        # owner/repo, taken from the positional argument
# Print the usage summary (options, examples, requirements) to stdout.
# Reads the global SCRIPT_NAME for the program name shown in the text.
show_help() {
  local prog="$SCRIPT_NAME"
  # One argument per output line; '%s\n' is reused for every argument.
  printf '%s\n' \
    "Usage: $prog [OPTIONS] owner/repo" \
    "Search for a specific file across all forks of a GitHub repository." \
    "OPTIONS:" \
    "--file FILE File to search for (default: flake.nix)" \
    "-h, --help Show this help message" \
    "--about Show detailed information about this tool" \
    "EXAMPLES:" \
    "$prog microsoft/vscode" \
    "$prog --file package.json facebook/react" \
    "$prog --file Dockerfile kubernetes/kubernetes" \
    "REQUIREMENTS:" \
    "- gh (GitHub CLI) must be installed and authenticated" \
    "- Authenticated users have 5000 API requests/hour limit"
}
# Print the long-form description of the tool (features, dependencies,
# technical notes) to stdout. The text is static, so the here-document
# delimiter is quoted to switch off all expansion.
show_about() {
  cat <<'ABOUT_TEXT'
find_github_forks_with_file - GitHub Fork File Finder
This tool searches for a specific file across all forks of a GitHub repository.
It was originally designed to find flake.nix files in Nix projects, but can
search for any file.
FEATURES:
- Parallel processing (10 concurrent jobs by default)
- Live progress bar using efficient filesystem metadata
- API rate limit warnings for large repositories
- Proper error handling and progress feedback
- Constructs direct URLs to found files using correct default branch
DEPENDENCIES:
- gh (GitHub CLI): For accessing GitHub API
- A compatible 'stat' command (standard on macOS and Linux)
TECHNICAL DETAILS:
- Uses GitHub API via 'gh' CLI tool
- Handles command line length limits with xargs -s
- Uses an atomic, dependency-free method for progress tracking:
Each parallel worker appends a single byte to a temporary file upon completion.
The main script polls the size of this file using 'stat' (a fast metadata
operation) to update the progress bar without race conditions.
- Only fetches default branch info when files are found (efficient)
AUTHOR: Built for efficient Nix ecosystem exploration
ABOUT_TEXT
}
# Verify that the external tooling this script relies on is available.
# Only the GitHub CLI ('gh') is looked up, via `command -v`.
# Returns: 0 when 'gh' is found; otherwise prints install guidance to
#          stderr and returns 1.
check_dependencies() {
  if command -v gh >/dev/null 2>&1; then
    return 0
  fi
  {
    echo "Error: 'gh' (GitHub CLI) is required but not installed."
    echo "Please install it from: https://cli.github.com/"
  } >&2
  return 1
}
# Choose a command that prints a file's size in bytes. The choice is stored
# as a plain string in STAT_CMD and is later expanded unquoted so that it
# word-splits into command + options.
STAT_CMD=""

# gstat (GNU stat as shipped by coreutils on macOS) wins whenever it is on
# PATH; it is only looked up, not test-run.
if command -v gstat >/dev/null 2>&1; then
  STAT_CMD="gstat -c %s"
fi

# Otherwise probe the system 'stat' for GNU-style format options...
if [[ -z "$STAT_CMD" ]] && stat -c %s /dev/null >/dev/null 2>&1; then
  STAT_CMD="stat -c %s"
fi

# ...and finally for BSD-style format options.
if [[ -z "$STAT_CMD" ]] && stat -f %z /dev/null >/dev/null 2>&1; then
  STAT_CMD="stat -f %z"
fi

if [[ -z "$STAT_CMD" ]]; then
  echo "Error: Cannot find a compatible 'stat' command to determine file size." >&2
  echo "Please install GNU coreutils ('brew install coreutils' on macOS)." >&2
  exit 4 # dependency error
fi
# Parse command line arguments.
#
# parse_args ARG...
#   Globals written: FILE_TO_SEARCH (value of --file), REPO (first positional).
#   Calls:           show_help / show_about.
#   Exits:           0 after -h/--help or --about;
#                    2 (usage error, message + usage text on stderr) for an
#                    unknown option, a second positional argument, or --file
#                    without a value.
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --file)
        # Without this guard, `shift 2` fails when --file is the last
        # argument, nothing gets shifted, and the loop spins forever.
        if [[ $# -lt 2 || -z "$2" ]]; then
          echo "Error: --file requires a file name" >&2
          show_help >&2
          exit 2
        fi
        FILE_TO_SEARCH="$2"
        shift 2
        ;;
      -h|--help)
        show_help
        exit 0
        ;;
      --about)
        show_about
        exit 0
        ;;
      -*)
        echo "Unknown option: $1" >&2
        show_help >&2
        exit 2
        ;;
      *)
        # Only one positional argument (the repository) is accepted.
        if [[ -z "$REPO" ]]; then
          REPO="$1"
        else
          echo "Too many arguments: $1" >&2
          show_help >&2
          exit 2
        fi
        shift
        ;;
    esac
  done
}

parse_args "$@"
# Pre-flight checks: stop with exit 1 when a required tool is missing, and
# with exit 2 (usage error) when no repository was given.
check_dependencies || exit 1 # dependency check failed

[[ -n "$REPO" ]] || {
  echo "Error: Repository argument required" >&2
  show_help >&2
  exit 2 # usage error
}
echo "Fetching forks of $REPO..."

# Ask the GitHub API for every fork of $REPO (all pages) and keep only each
# fork's "owner/name". The exit status of gh is tested directly on the
# assignment so a failed request stops the script with exit code 3.
if ! fork_data=$(gh api "repos/$REPO/forks" --paginate --jq '.[].full_name'); then
  echo -e "\033[31mError fetching forks\033[0m" >&2
  exit 3 # error fetching forks
fi

# Split the newline-separated list into the forks array. A here-string always
# supplies one trailing newline, so feeding it empty output would produce a
# single empty element and the "no forks" case below could never trigger;
# only split when there is something to split.
forks=()
if [[ -n "$fork_data" ]]; then
  readarray -t forks <<< "$fork_data"
fi
total_forks=${#forks[@]}

if [[ $total_forks -eq 0 ]]; then
  echo "No forks found for $REPO."
  exit 0
fi

echo "Found $total_forks forks. Checking for $FILE_TO_SEARCH in parallel (max $MAX_JOBS jobs)..."

# Warn about API limits for large fork counts: each fork costs one request,
# plus a second one (default-branch lookup) when the file is present.
if [[ $total_forks -gt 1000 ]]; then
  echo -e "\033[33mWarning: Checking $total_forks forks will make ~$((total_forks * 2)) API requests, worst-case.\033[0m"
  echo -e "\033[33mGitHub's rate limit is 5000 requests/hour for authenticated users.\033[0m"
fi
# Scratch files: one collects a byte per finished worker (progress), the
# other collects "fork|url" result lines.
#
# mktemp creates each file atomically under an unpredictable name. The
# previous fixed "/tmp/find_forks_*_$$" names could be guessed and
# pre-created or symlinked by another local user before `touch` ran.
progress_file=""
results_file=""

# Remove whichever scratch files were actually created.
cleanup_temp_files() {
  if [[ -n "$progress_file" ]]; then
    rm -f -- "$progress_file"
  fi
  if [[ -n "$results_file" ]]; then
    rm -f -- "$results_file"
  fi
  return 0
}

# Install the cleanup first so nothing is left behind if the second mktemp
# (or anything later) fails.
trap cleanup_temp_files EXIT

# Honour TMPDIR when set, falling back to /tmp; strip one trailing slash so
# the template does not contain "//".
tmp_root="${TMPDIR:-/tmp}"
tmp_root="${tmp_root%/}"

if ! progress_file=$(mktemp "${tmp_root}/find_forks_progress.XXXXXX") \
  || ! results_file=$(mktemp "${tmp_root}/find_forks_results.XXXXXX"); then
  echo "Error: could not create temporary files in ${TMPDIR:-/tmp}" >&2
  exit 1
fi
# check_fork FORK FILE RESULTS_FILE PROGRESS_FILE
#
# Check one fork for FILE and record the outcome.
#   FORK           "owner/name" of the fork to inspect
#   FILE           path of the file to look for inside the repository
#   RESULTS_FILE   a "FORK|URL" line is appended here when FILE exists
#   PROGRESS_FILE  exactly one byte is appended here on every call, hit or
#                  miss, so the caller can count finished forks by file size
#
# Only 'gh' is required: the JSON is filtered with gh's built-in --jq option
# rather than a separate jq binary, whose absence used to turn every fork
# into a silent miss.
check_fork() {
  local fork="$1"
  local file_to_search="$2"
  local results_file_path="$3"
  local progress_file_path="$4"
  local entry_type default_branch url

  # A failed request (e.g. the path does not exist) or any entry whose type
  # is not exactly "file" counts as "not found".
  if entry_type=$(gh api "repos/$fork/contents/$file_to_search" --jq '.type' 2>/dev/null) \
    && [[ "$entry_type" == "file" ]]; then
    # The default branch is only looked up for hits, to save API requests.
    default_branch=$(gh api "repos/$fork" --jq '.default_branch' 2>/dev/null) || default_branch=""
    # If the lookup failed, fall back to the symbolic ref HEAD instead of
    # emitting a URL with an empty branch segment.
    url="https://github.com/$fork/blob/${default_branch:-HEAD}/$file_to_search"
    printf '%s|%s\n' "$fork" "$url" >> "$results_file_path"
  fi

  # Signal completion by appending a single byte to the progress file.
  printf '.' >> "$progress_file_path"
}
# Make check_fork visible to the `bash -c` workers that xargs spawns.
# Everything else the workers need is passed as positional arguments, so no
# variables have to be exported.
export -f check_fork

# Fan the fork names out to at most MAX_JOBS concurrent workers, in the
# background so the progress loop can run meanwhile.
#   -I {}  runs one worker per input line and substitutes the fork name as a
#          separate argument (it is never spliced into the command string).
# The earlier `-n 1` and `-s "$(getconf ARG_MAX)"` options are omitted:
# GNU xargs rejects -n in combination with -I and clamps an -s value at the
# full ARG_MAX, printing a warning for each onto the progress display.
# Each worker's own stdout/stderr is discarded.
printf '%s\n' "${forks[@]}" \
  | xargs -P "$MAX_JOBS" -I {} \
    bash -c 'check_fork "$@" >/dev/null 2>&1' _ {} "$FILE_TO_SEARCH" "$results_file" "$progress_file" &
xargs_pid=$! # PID of xargs, the last command of the background pipeline
# Progress bar
# Width of the bar in characters. Defined before the loop because the final
# "100%" line needs it even when the loop body never runs (workers already
# finished); previously it was set inside the loop and the final bar could
# come out empty.
bar_len=40

# render_progress COMPLETED TOTAL
# Redraw the progress line in place (leading \r, no trailing newline).
#   COMPLETED  number of forks processed so far
#   TOTAL      number of forks overall; 0 is rendered as 100%
# Reads the global bar_len.
render_progress() {
  local completed="$1"
  local total="$2"
  local percent=100
  local filled_len bar empty

  # Avoid division by zero.
  if (( total > 0 )); then
    percent=$(( completed * 100 / total ))
  fi
  # Never draw past the end of the bar.
  if (( percent > 100 )); then
    percent=100
  fi

  filled_len=$(( bar_len * percent / 100 ))
  bar=$(printf "%${filled_len}s" | tr ' ' '#')
  empty=$(printf "%$(( bar_len - filled_len ))s")
  printf "\rProgress: [%s%s] %d%% (%d/%d) " "$bar" "$empty" "$percent" "$completed" "$total"
}

echo # newline before progress bar

# Poll while the background xargs process is still alive. The number of
# finished forks equals the size in bytes of the progress file.
while kill -0 "$xargs_pid" 2>/dev/null; do
  completed_count=$($STAT_CMD "$progress_file" 2>/dev/null || echo 0)
  # Ensure completed_count is a number, default to 0 if not
  [[ "$completed_count" =~ ^[0-9]+$ ]] || completed_count=0
  render_progress "$completed_count" "$total_forks"
  sleep 0.2
done

# Ensure the progress bar shows 100% at the end and move to the next line
printf "\rProgress: [%s] 100%% (%d/%d)\n" "$(printf "%${bar_len}s" | tr ' ' '#')" "$total_forks" "$total_forks"
echo
# Load what the workers recorded: one "fork|url" line per fork that
# contains the file.
readarray -t results < "$results_file"

printf '%s\n\n' "Search complete!"

# Summary: either a red "nothing found" line or one entry per hit.
if (( ${#results[@]} == 0 )); then
  echo -e "\033[31mNo forks with $FILE_TO_SEARCH found.\033[0m"
else
  printf '%s\n\n' "Found $FILE_TO_SEARCH in ${#results[@]} out of $total_forks forks:"
  for result in "${results[@]}"; do
    # Split at the first '|': fork name before it, URL after it.
    IFS='|' read -r fork_name url <<< "$result"
    printf '%s\n %s\n\n' "$fork_name" "$url"
  done
fi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment