Last active
July 3, 2025 17:52
-
-
Save pmarreck/19503053d5e91e8fff91c6e47204cbb2 to your computer and use it in GitHub Desktop.
A way to search all forks of a GitHub project to see whether any of them already contains a certain file, such as flake.nix (the default)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash
# find_github_forks_with_file
#
# Script-wide settings. REPO and FILE_TO_SEARCH start out with the values
# below and are overwritten by the command-line parser further down.
SCRIPT_NAME="$(basename "$0")" # name printed in the usage text
MAX_JOBS=10                    # upper bound on concurrent fork checks
FILE_TO_SEARCH="flake.nix"     # file looked for unless --file is given
REPO=""                        # owner/repo; must come from the command line
#######################################
# Print the usage summary.
# Globals:   SCRIPT_NAME (read) - program name shown in the text
# Arguments: none
# Outputs:   usage text on stdout, one line per printf argument
#######################################
show_help() {
  local prog="$SCRIPT_NAME"
  printf '%s\n' \
    "Usage: $prog [OPTIONS] owner/repo" \
    "Search for a specific file across all forks of a GitHub repository." \
    "OPTIONS:" \
    "--file FILE File to search for (default: flake.nix)" \
    "-h, --help Show this help message" \
    "--about Show detailed information about this tool" \
    "EXAMPLES:" \
    "$prog microsoft/vscode" \
    "$prog --file package.json facebook/react" \
    "$prog --file Dockerfile kubernetes/kubernetes" \
    "REQUIREMENTS:" \
    "- gh (GitHub CLI) must be installed and authenticated" \
    "- Authenticated users have 5000 API requests/hour limit"
}
#######################################
# Print the long-form description of the tool.
# Arguments: none
# Outputs:   static text on stdout
#######################################
show_about() {
  # The text contains nothing to expand, so the delimiter is quoted to make
  # the literal nature of the block explicit.
  cat <<'ABOUT_TEXT'
find_github_forks_with_file - GitHub Fork File Finder
This tool searches for a specific file across all forks of a GitHub repository.
It was originally designed to find flake.nix files in Nix projects, but can
search for any file.
FEATURES:
- Parallel processing (10 concurrent jobs by default)
- Live progress bar using efficient filesystem metadata
- API rate limit warnings for large repositories
- Proper error handling and progress feedback
- Constructs direct URLs to found files using correct default branch
DEPENDENCIES:
- gh (GitHub CLI): For accessing GitHub API
- A compatible 'stat' command (standard on macOS and Linux)
TECHNICAL DETAILS:
- Uses GitHub API via 'gh' CLI tool
- Handles command line length limits with xargs -s
- Uses an atomic, dependency-free method for progress tracking:
Each parallel worker appends a single byte to a temporary file upon completion.
The main script polls the size of this file using 'stat' (a fast metadata
operation) to update the progress bar without race conditions.
- Only fetches default branch info when files are found (efficient)
AUTHOR: Built for efficient Nix ecosystem exploration
ABOUT_TEXT
}
#######################################
# Verify that the external tools the script relies on are available.
# Arguments: none
# Outputs:   installation hint on stderr when 'gh' cannot be found
# Returns:   0 when 'gh' resolves as a command, 1 otherwise
#######################################
check_dependencies() {
  # Guard clause: nothing to report when the GitHub CLI is reachable
  command -v gh >/dev/null 2>&1 && return 0

  echo "Error: 'gh' (GitHub CLI) is required but not installed." >&2
  echo "Please install it from: https://cli.github.com/" >&2
  return 1
}
# Work out which command prints a file's size in bytes and store it in
# STAT_CMD (later expanded unquoted, so it holds "command flag format").
STAT_CMD=""
if command -v gstat >/dev/null 2>&1; then
  # gstat (GNU stat from coreutils on macOS) wins whenever it is installed
  STAT_CMD="gstat -c %s"
else
  # Probe the system stat: GNU syntax first, then BSD syntax
  for stat_candidate in "stat -c %s" "stat -f %z"; do
    # Unquoted on purpose: the candidate must split into command + arguments
    if $stat_candidate /dev/null >/dev/null 2>&1; then
      STAT_CMD="$stat_candidate"
      break
    fi
  done
  unset stat_candidate
fi
if [[ -z "$STAT_CMD" ]]; then
  echo "Error: Cannot find a compatible 'stat' command to determine file size." >&2
  echo "Please install GNU coreutils ('brew install coreutils' on macOS)." >&2
  exit 4 # dependency error
fi
#######################################
# Parse command line arguments.
# Globals:
#   FILE_TO_SEARCH (written) - set from --file
#   REPO (read/written)      - set from the single positional argument
# Arguments: the script's command line ("$@")
# Outputs:   help/about text on stdout; usage errors on stderr
# Exits:     0 after --help/--about, 2 on any usage error
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --file)
        # Without this guard a trailing "--file" makes 'shift 2' fail without
        # shifting anything, and the loop would never terminate.
        if [[ $# -lt 2 || -z "$2" ]]; then
          echo "Error: --file requires a file name" >&2
          show_help >&2
          exit 2
        fi
        FILE_TO_SEARCH="$2"
        shift 2
        ;;
      -h|--help)
        show_help
        exit 0
        ;;
      --about)
        show_about
        exit 0
        ;;
      -*)
        echo "Unknown option: $1" >&2
        show_help >&2
        exit 2
        ;;
      *)
        # Exactly one positional argument (the repository) is accepted
        if [[ -z "$REPO" ]]; then
          REPO="$1"
        else
          echo "Too many arguments: $1" >&2
          show_help >&2
          exit 2
        fi
        shift
        ;;
    esac
  done
}
parse_args "$@"
# Stop right away when a required external tool is missing
check_dependencies || exit 1 # dependency check failed

# A repository to inspect is mandatory; without one only the usage text helps
[[ -n "$REPO" ]] || {
  echo "Error: Repository argument required" >&2
  show_help >&2
  exit 2 # usage error
}
echo "Fetching forks of $REPO..."
# Ask the API for the full_name (owner/repo) of every fork, all pages.
# The assignment is tested directly so the check is bound to gh's own status.
if ! fork_data=$(gh api "repos/$REPO/forks" --paginate --jq '.[].full_name'); then
  echo -e "\033[31mError fetching forks\033[0m" >&2
  exit 3 # error fetching forks
fi
# A here-string always delivers at least one (possibly empty) line, so
# splitting empty output would produce a single blank "fork" and the
# zero-forks case below could never trigger. Split only real output.
forks=()
if [[ -n "$fork_data" ]]; then
  readarray -t forks <<< "$fork_data"
fi
total_forks=${#forks[@]}
if [[ $total_forks -eq 0 ]]; then
  echo "No forks found for $REPO."
  exit 0
fi
echo "Found $total_forks forks. Checking for $FILE_TO_SEARCH in parallel (max $MAX_JOBS jobs)..."
# Warn about API limits for large fork counts: each fork costs one request,
# plus a second one when the file is found (hence "worst-case").
if [[ $total_forks -gt 1000 ]]; then
  echo -e "\033[33mWarning: Checking $total_forks forks will make ~$((total_forks * 2)) API requests, worst-case.\033[0m"
  echo -e "\033[33mGitHub's rate limit is 5000 requests/hour for authenticated users.\033[0m"
fi
# Create temporary files for progress and results in /tmp (for ramdisk
# performance). mktemp picks unpredictable names and creates the files
# itself, so a pre-planted file or symlink under a guessable
# /tmp/..._<pid> name can no longer be touched or appended to.
progress_file=""
results_file=""

# Remove whichever temp files were created so far; runs on every exit path.
cleanup_temp_files() {
  local tmp_path
  for tmp_path in "$progress_file" "$results_file"; do
    if [[ -n "$tmp_path" ]]; then
      rm -f -- "$tmp_path"
    fi
  done
}
# Installed before the files exist so a failure between the two mktemp calls
# still cleans up the first file.
trap cleanup_temp_files EXIT

if ! progress_file=$(mktemp /tmp/find_forks_progress.XXXXXX); then
  echo "Error: could not create temporary progress file in /tmp" >&2
  exit 1
fi
if ! results_file=$(mktemp /tmp/find_forks_results.XXXXXX); then
  echo "Error: could not create temporary results file in /tmp" >&2
  exit 1
fi
#######################################
# Check a single fork for the wanted file and record a hit.
# Arguments:
#   $1 - fork in owner/repo form
#   $2 - path of the file to look for inside the fork
#   $3 - results file; one "fork|url" line is appended on a hit
#   $4 - progress file; exactly one byte is appended when the check is done
# Outputs:   nothing on stdout; appends to the two files named above
# Returns:   status of the final append to the progress file
#######################################
check_fork() {
  local fork="$1"
  local file_to_search="$2"
  local results_file_path="$3"
  local progress_file_path="$4"
  # Declared local so nothing leaks into (or is picked up from) the caller
  local entry_type default_branch url

  # Ask the contents endpoint what lives at the path. gh's built-in --jq is
  # used (as elsewhere in this script) so no separate jq binary is needed.
  # A directory comes back as an array, which the filter maps to no output.
  if entry_type=$(gh api "repos/$fork/contents/$file_to_search" \
      --jq 'if type == "object" then .type else empty end' 2>/dev/null) \
    && [[ "$entry_type" == "file" ]]; then
    # Only hits pay for the second request that resolves the default branch
    default_branch=$(gh api "repos/$fork" --jq '.default_branch' 2>/dev/null) \
      || default_branch=""
    # If the lookup failed, fall back to HEAD, which GitHub resolves to the
    # default branch, rather than emitting a broken "blob//" URL.
    url="https://github.com/$fork/blob/${default_branch:-HEAD}/$file_to_search"
    # Output result to the results file
    printf '%s|%s\n' "$fork" "$url" >> "$results_file_path"
  fi
  # Signal completion by appending a single byte to the progress file
  printf '.' >> "$progress_file_path"
}
# Export the worker function and settings so the 'bash -c' children spawned
# by xargs can see them.
export -f check_fork
export FILE_TO_SEARCH
export STAT_CMD
# Run the checks in the background, at most MAX_JOBS at a time. Each input
# line (one fork) replaces {} and becomes the worker's first argument; all
# worker output is discarded because results travel through the temp files.
# '-n 1' was dropped: it conflicts with '-I {}' (GNU xargs warns and ignores
# it), and -I already runs one command per input line.
# NOTE(review): GNU xargs may also complain that the ARG_MAX value passed to
# -s exceeds its internal limit; kept as-is pending confirmation.
printf '%s\n' "${forks[@]}" \
  | xargs -s "$(getconf ARG_MAX)" -P "$MAX_JOBS" -I {} \
      bash -c 'check_fork "$@" >/dev/null 2>&1' _ {} "$FILE_TO_SEARCH" "$results_file" "$progress_file" &
# $! is the last process of the background pipeline, i.e. xargs itself
xargs_pid=$!
#######################################
# Draw the progress bar in place (leading carriage return, no newline).
# Arguments:
#   $1 - number of finished checks
#   $2 - total number of checks; 0 is shown as 100%
#   $3 - bar width in characters (optional, default 40)
# Outputs:   one progress line on stdout
#######################################
render_progress_bar() {
  local completed="${1:-0}"
  local total="${2:-0}"
  local width="${3:-40}"
  local percent=100
  local filled bar empty

  # Avoid division by zero
  if (( total > 0 )); then
    percent=$(( completed * 100 / total ))
  fi
  # Never draw past the end of the bar
  if (( percent > 100 )); then
    percent=100
  fi
  filled=$(( width * percent / 100 ))
  printf -v bar '%*s' "$filled" ''
  bar=${bar// /#}
  printf -v empty '%*s' "$(( width - filled ))" ''
  # \r moves the cursor to the beginning of the line; no newline is printed
  printf '\rProgress: [%s%s] %d%% (%d/%d) ' "$bar" "$empty" "$percent" "$completed" "$total"
}

# Progress bar
echo # newline before progress bar
while kill -0 "$xargs_pid" 2>/dev/null; do
  # The progress file grows by one byte per finished check, so its size is
  # the number of completed forks.
  completed_count=$($STAT_CMD "$progress_file" 2>/dev/null || echo 0)
  # Ensure completed_count is a number, default to 0 if not
  [[ "$completed_count" =~ ^[0-9]+$ ]] || completed_count=0
  render_progress_bar "$completed_count" "$total_forks"
  sleep 0.2
done
# Reap the background job and remember how it ended
xargs_status=0
wait "$xargs_pid" 2>/dev/null || xargs_status=$?
# Ensure the progress bar shows 100% at the end and move to the next line.
# The width lives in the helper, so this works even if the loop never ran.
render_progress_bar "$total_forks" "$total_forks"
printf '\n'
echo
if [[ $xargs_status -ne 0 ]]; then
  echo "Warning: the parallel check ended with status $xargs_status; results may be incomplete." >&2
fi
# Load what the workers recorded: one "fork|url" entry per line
mapfile -t results < "$results_file"
echo "Search complete!"
echo
# Summary: a red notice when nothing matched, otherwise every hit with its URL
if [[ ${#results[@]} -eq 0 ]]; then
  echo -e "\033[31mNo forks with $FILE_TO_SEARCH found.\033[0m"
else
  echo "Found $FILE_TO_SEARCH in ${#results[@]} out of $total_forks forks:"
  echo
  for result in "${results[@]}"; do
    # Text before the first '|' is the fork name, the remainder is the URL
    IFS='|' read -r fork_name url <<< "$result"
    printf '%s\n' "✓ $fork_name" " $url" ""
  done
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment