pmarreck · July 3, 2025 17:52
diff --git a/find_github_forks_with_file.bash b/find_github_forks_with_file.bash
 #!/usr/bin/env bash
 # find_github_forks_with_file

 # Script-wide settings. REPO and FILE_TO_SEARCH are the only two that the
 # command-line parser further down overwrites.
 SCRIPT_NAME="$(basename "$0")" # name printed in the usage text
 MAX_JOBS=10                    # parallel workers handed to xargs -P
 FILE_TO_SEARCH='flake.nix'     # path looked up in every fork
 REPO=''                        # owner/repo, taken from the positional argument

 # Help function
 #
 # Writes the usage text to stdout: synopsis, options, examples and
 # requirements. $SCRIPT_NAME is expanded inside the here-document because the
 # EOF delimiter is unquoted. Callers that report a usage error redirect this
 # output to stderr themselves.
 # NOTE(review): "cat << EOF" is only closed by a line consisting of exactly
 # "EOF"; confirm the terminator is not indented in the file as it is run.
 show_help() {
 	cat << EOF
 Usage: $SCRIPT_NAME [OPTIONS] owner/repo

 Search for a specific file across all forks of a GitHub repository.

 OPTIONS:
  --file FILE     File to search for (default: flake.nix)
  -h, --help      Show this help message
  --about         Show detailed information about this tool

 EXAMPLES:
  $SCRIPT_NAME microsoft/vscode
  $SCRIPT_NAME --file package.json facebook/react
  $SCRIPT_NAME --file Dockerfile kubernetes/kubernetes

 REQUIREMENTS:
  - gh (GitHub CLI) must be installed and authenticated
  - Authenticated users have 5000 API requests/hour limit
 EOF
 }

 # About function
 #
 # Writes the long description shown for the --about option to stdout:
 # features, dependencies and implementation notes. The text contains no
 # variable or command expansions, so it is printed verbatim.
 # NOTE(review): "cat << EOF" is only closed by a line consisting of exactly
 # "EOF"; confirm the terminator is not indented in the file as it is run.
 show_about() {
 	cat << EOF
 find_github_forks_with_file - GitHub Fork File Finder

 This tool searches for a specific file across all forks of a GitHub repository.
 It was originally designed to find flake.nix files in Nix projects, but can
 search for any file.

 FEATURES:
 - Parallel processing (10 concurrent jobs by default)
 - Live progress bar using efficient filesystem metadata
 - API rate limit warnings for large repositories
 - Proper error handling and progress feedback
 - Constructs direct URLs to found files using correct default branch

 DEPENDENCIES:
 - gh (GitHub CLI): For accessing GitHub API
 - A compatible 'stat' command (standard on macOS and Linux)

 TECHNICAL DETAILS:
 - Uses GitHub API via 'gh' CLI tool
 - Handles command line length limits with xargs -s
 - Uses an atomic, dependency-free method for progress tracking:
  Each parallel worker appends a single byte to a temporary file upon completion.
  The main script polls the size of this file using 'stat' (a fast metadata
  operation) to update the progress bar without race conditions.
 - Only fetches default branch info when files are found (efficient)

 AUTHOR: Built for efficient Nix ecosystem exploration
 EOF
 }

 # Dependency check function
 #
 # Verifies that the external programs the script shells out to are on PATH:
 #   gh - every GitHub API call
 #   jq - used by check_fork to confirm an API response describes a file;
 #        without it every fork would silently be reported as "not found"
 # Prints a diagnostic for each missing tool to stderr.
 # Returns 0 when everything is available, 1 otherwise.
 check_dependencies() {
 	local missing=0

 	# Check for gh CLI
 	if ! command -v gh >/dev/null 2>&1; then
 		echo "Error: 'gh' (GitHub CLI) is required but not installed." >&2
 		echo "Please install it from: https://cli.github.com/" >&2
 		missing=1
 	fi

 	# Check for jq
 	if ! command -v jq >/dev/null 2>&1; then
 		echo "Error: 'jq' is required but not installed." >&2
 		echo "Please install it from: https://jqlang.github.io/jq/" >&2
 		missing=1
 	fi

 	return "$missing"
 }

 # Work out which command prints a file's size in bytes and store it, with its
 # format flag, in STAT_CMD. Candidates are tried in a fixed order and the
 # first hit wins.
 STAT_CMD=""

 # 1. gstat: GNU stat as installed by coreutils on macOS (preferred for consistency)
 if command -v gstat &>/dev/null; then
 	STAT_CMD="gstat -c %s"
 fi

 # 2. the system stat, if it understands the GNU format flag
 if [[ -z "$STAT_CMD" ]] && stat -c %s /dev/null &>/dev/null; then
 	STAT_CMD="stat -c %s"
 fi

 # 3. the system stat, if it understands the BSD format flag
 if [[ -z "$STAT_CMD" ]] && stat -f %z /dev/null &>/dev/null; then
 	STAT_CMD="stat -f %z"
 fi

 # Nothing usable: the progress bar cannot work, so give up now
 if [[ -z "$STAT_CMD" ]]; then
 	echo "Error: Cannot find a compatible 'stat' command to determine file size." >&2
 	echo "Please install GNU coreutils ('brew install coreutils' on macOS)." >&2
 	exit 4 # dependency error
 fi

 # Parse command line arguments
 #
 # parse_args ARG...
 #   --file FILE | --file=FILE   set FILE_TO_SEARCH
 #   -h | --help                 print the usage text and exit 0
 #   --about                     print the tool description and exit 0
 #   owner/repo                  stored in REPO (only one is accepted)
 # Every usage error prints a message followed by the usage text on stderr
 # and exits with status 2.
 parse_args() {
 	while [[ $# -gt 0 ]]; do
 		case "$1" in
 			--file)
 				# A trailing "--file" used to make "shift 2" fail without
 				# consuming anything, so this loop never terminated.
 				if [[ $# -lt 2 || -z "$2" ]]; then
 					echo "Error: --file requires a file name" >&2
 					show_help >&2
 					exit 2
 				fi
 				FILE_TO_SEARCH="$2"
 				shift 2
 				;;
 			--file=*)
 				if [[ -z "${1#--file=}" ]]; then
 					echo "Error: --file requires a file name" >&2
 					show_help >&2
 					exit 2
 				fi
 				FILE_TO_SEARCH="${1#--file=}"
 				shift
 				;;
 			-h | --help)
 				show_help
 				exit 0
 				;;
 			--about)
 				show_about
 				exit 0
 				;;
 			-*)
 				echo "Unknown option: $1" >&2
 				show_help >&2
 				exit 2
 				;;
 			*)
 				if [[ -z "$REPO" ]]; then
 					REPO="$1"
 				else
 					echo "Too many arguments: $1" >&2
 					show_help >&2
 					exit 2
 				fi
 				shift
 				;;
 		esac
 	done
 }

 parse_args "$@"

 # Check all dependencies
 if ! check_dependencies; then
 	exit 1 # dependency check failed
 fi

 if [[ -z "$REPO" ]]; then
 	echo "Error: Repository argument required" >&2
 	show_help >&2
 	exit 2 # usage error
 fi

 echo "Fetching forks of $REPO..."

 # Get all forks into array (in-memory). The exit status is tested on the
 # assignment itself so nothing can run between the command and the check.
 if ! fork_data=$(gh api "repos/$REPO/forks" --paginate --jq '.[].full_name'); then
 	echo -e "\033[31mError fetching forks\033[0m" >&2
 	exit 3 # error fetching forks
 fi

 # A here-string always supplies at least one (possibly empty) line, so
 # reading empty output would yield a one-element array holding "". Only fill
 # the array when there is output, which keeps the zero-fork check reachable.
 forks=()
 if [[ -n "$fork_data" ]]; then
 	readarray -t forks <<< "$fork_data"
 fi
 total_forks=${#forks[@]}

 if [[ $total_forks -eq 0 ]]; then
 	echo "No forks found for $REPO."
 	exit 0
 fi

 echo "Found $total_forks forks. Checking for $FILE_TO_SEARCH in parallel (max $MAX_JOBS jobs)..."

 # Warn about API limits for large fork counts: each fork costs one request,
 # plus a second one when the file is found (hence the factor of two).
 if [[ $total_forks -gt 1000 ]]; then
 	echo -e "\033[33mWarning: Checking $total_forks forks will make ~$((total_forks * 2)) API requests, worst-case.\033[0m"
 	echo -e "\033[33mGitHub's rate limit is 5000 requests/hour for authenticated users.\033[0m"
 fi

 # Create temporary files for progress and results in /tmp (for ramdisk performance).
 # mktemp creates each file with an unpredictable name, so another local user
 # cannot pre-create or symlink the path the way a fixed "..._$$" name allows.
 progress_file=""
 results_file=""

 # Cleanup temp files on exit. Installed before the files are created so that
 # a failure between the two mktemp calls still removes the first file.
 trap 'rm -f -- "$progress_file" "$results_file"' EXIT

 if ! progress_file=$(mktemp /tmp/find_forks_progress.XXXXXX) ||
 	! results_file=$(mktemp /tmp/find_forks_results.XXXXXX); then
 	echo "Error: could not create temporary files in /tmp" >&2
 	exit 5 # temp file creation failed
 fi

 # Function to check a single fork and output result
 #
 # Arguments:
 #   $1 - fork, as owner/repo
 #   $2 - path of the file to look for
 #   $3 - results file; one "fork|url" line is appended when the file exists
 #   $4 - progress file; exactly one byte is appended per call, hit or miss
 # Uses gh for the API requests and jq to inspect the response.
 check_fork() {
 	local fork="$1"
 	local file_to_search="$2"
 	local results_file_path="$3"
 	local progress_file_path="$4"
 	# Declared local so nothing leaks into (or is picked up from) the caller
 	local api_response default_branch url

 	# Ask for the path directly; gh exits non-zero when the request fails.
 	# The type check rejects responses that are not a single file object.
 	if api_response=$(gh api "repos/$fork/contents/$file_to_search" 2>/dev/null) &&
 		[[ -n "$api_response" ]] &&
 		jq -e '.type == "file"' <<< "$api_response" >/dev/null 2>&1; then
 		# Get default branch (second request, only made on a hit)
 		default_branch=$(gh api "repos/$fork" --jq '.default_branch')
 		# If that lookup fails, avoid emitting a broken ".../blob//file" URL.
 		# NOTE(review): relies on GitHub resolving /blob/HEAD/ to the default
 		# branch - confirm.
 		[[ -n "$default_branch" ]] || default_branch="HEAD"
 		url="https://github.com/$fork/blob/$default_branch/$file_to_search"
 		# Output result to the results file
 		printf '%s|%s\n' "$fork" "$url" >> "$results_file_path"
 	fi

 	# Signal completion by appending a single byte to the progress file
 	printf '.' >> "$progress_file_path"
 }

 # Export function and variables for parallel execution
 export -f check_fork
 export FILE_TO_SEARCH
 export STAT_CMD

 # Run checks in parallel in the background, redirecting stdout/stderr of the sub-process.
 # -I {} already runs one command per input line, so the former "-n 1" is
 # dropped: GNU xargs treats -n and -I as mutually exclusive and prints a
 # warning on every run when both are given.
 # The fork name reaches bash -c as a positional argument and is never spliced
 # into the script text, so unusual names cannot be interpreted as code.
 # NOTE(review): some xargs builds may also complain that -s ARG_MAX exceeds
 # their own limit - confirm whether -s is needed at all with -I.
 printf '%s\n' "${forks[@]}" |
 	xargs -s "$(getconf ARG_MAX)" -P "$MAX_JOBS" -I {} \
 		bash -c 'check_fork "$@" >/dev/null 2>&1' _ {} "$FILE_TO_SEARCH" "$results_file" "$progress_file" &
 xargs_pid=$! # PID of xargs, the last command of the backgrounded pipeline

 # Progress bar
 # Width in characters. Set before the loop so the final 100% line still has a
 # width when the workers finish before the loop body ever runs.
 bar_len=40

 # draw_progress COMPLETED TOTAL
 # Redraws the progress line in place (leading \r, no trailing newline).
 # A TOTAL of 0 is shown as 100% to avoid dividing by zero, and COMPLETED is
 # capped at TOTAL so the bar can never exceed its width.
 draw_progress() {
 	local completed=$1 total=$2
 	local percent=100 filled_len bar empty

 	if ((total > 0)); then
 		if ((completed > total)); then
 			completed=$total
 		fi
 		percent=$((completed * 100 / total))
 	fi

 	filled_len=$((bar_len * percent / 100))
 	bar=$(printf '%*s' "$filled_len" '' | tr ' ' '#')
 	empty=$(printf '%*s' "$((bar_len - filled_len))" '')

 	printf "\rProgress: [%s%s] %d%% (%d/%d) " "$bar" "$empty" "$percent" "$completed" "$total"
 }

 echo # newline before progress bar
 # kill -0 only tests that the xargs process still exists
 while kill -0 "$xargs_pid" 2>/dev/null; do
 	# One byte per finished fork, so the file size is the completed count
 	completed_count=$($STAT_CMD "$progress_file" 2>/dev/null || echo 0)
 	# Ensure completed_count is a number, default to 0 if not
 	[[ "$completed_count" =~ ^[0-9]+$ ]] || completed_count=0

 	draw_progress "$completed_count" "$total_forks"
 	sleep 0.2
 done

 # Ensure the progress bar shows 100% at the end and move to the next line
 printf "\rProgress: [%s] 100%% (%d/%d)\n" "$(printf '%*s' "$bar_len" '' | tr ' ' '#')" "$total_forks" "$total_forks"
 echo

 # Load every "fork|url" line the workers recorded
 readarray -t results < "$results_file"

 echo "Search complete!"
 echo

 # Report: a red notice when nothing matched, otherwise one entry per hit
 hit_count=${#results[@]}
 if [[ $hit_count -eq 0 ]]; then
 	echo -e "\033[31mNo forks with $FILE_TO_SEARCH found.\033[0m"
 else
 	echo "Found $FILE_TO_SEARCH in $hit_count out of $total_forks forks:"
 	echo

 	for entry in "${results[@]}"; do
 		# Split at the first "|": fork name on the left, URL on the right
 		IFS='|' read -r fork_name url <<< "$entry"
 		echo "✓ $fork_name"
 		echo "  $url"
 		echo
 	done
 fi
	#!/usr/bin/env bash
	# find_github_forks_with_file

	# Script-wide settings. REPO and FILE_TO_SEARCH are the only two that the
	# command-line parser further down overwrites.
	SCRIPT_NAME="$(basename "$0")" # name printed in the usage text
	MAX_JOBS=10                    # parallel workers handed to xargs -P
	FILE_TO_SEARCH='flake.nix'     # path looked up in every fork
	REPO=''                        # owner/repo, taken from the positional argument

	# Help function
	#
	# Writes the usage text to stdout: synopsis, options, examples and
	# requirements. $SCRIPT_NAME is expanded because the delimiter is unquoted.
	# The here-document is opened with "<<-" so that leading tabs are stripped
	# from the body and from the terminator; with plain "<<" the tab-indented
	# EOF line never closed the document. Indent these lines with tabs only.
	show_help() {
		cat <<- EOF
		Usage: $SCRIPT_NAME [OPTIONS] owner/repo

		Search for a specific file across all forks of a GitHub repository.

		OPTIONS:
		  --file FILE     File to search for (default: flake.nix)
		  -h, --help      Show this help message
		  --about         Show detailed information about this tool

		EXAMPLES:
		  $SCRIPT_NAME microsoft/vscode
		  $SCRIPT_NAME --file package.json facebook/react
		  $SCRIPT_NAME --file Dockerfile kubernetes/kubernetes

		REQUIREMENTS:
		  - gh (GitHub CLI) must be installed and authenticated
		  - Authenticated users have 5000 API requests/hour limit
		EOF
	}

	# About function
	#
	# Writes the long description shown for the --about option to stdout:
	# features, dependencies and implementation notes.
	# The here-document is opened with "<<-" so that leading tabs are stripped
	# from the body and from the terminator; with plain "<<" the tab-indented
	# EOF line never closed the document. Indent these lines with tabs only.
	show_about() {
		cat <<- EOF
		find_github_forks_with_file - GitHub Fork File Finder

		This tool searches for a specific file across all forks of a GitHub repository.
		It was originally designed to find flake.nix files in Nix projects, but can
		search for any file.

		FEATURES:
		- Parallel processing (10 concurrent jobs by default)
		- Live progress bar using efficient filesystem metadata
		- API rate limit warnings for large repositories
		- Proper error handling and progress feedback
		- Constructs direct URLs to found files using correct default branch

		DEPENDENCIES:
		- gh (GitHub CLI): For accessing GitHub API
		- A compatible 'stat' command (standard on macOS and Linux)

		TECHNICAL DETAILS:
		- Uses GitHub API via 'gh' CLI tool
		- Handles command line length limits with xargs -s
		- Uses an atomic, dependency-free method for progress tracking:
		  Each parallel worker appends a single byte to a temporary file upon completion.
		  The main script polls the size of this file using 'stat' (a fast metadata
		  operation) to update the progress bar without race conditions.
		- Only fetches default branch info when files are found (efficient)

		AUTHOR: Built for efficient Nix ecosystem exploration
		EOF
	}

	# Dependency check function
	#
	# Verifies that the external programs the script shells out to are on PATH:
	#   gh - every GitHub API call
	#   jq - used by check_fork to confirm an API response describes a file;
	#        without it every fork would silently be reported as "not found"
	# Prints a diagnostic for each missing tool to stderr.
	# Returns 0 when everything is available, 1 otherwise.
	check_dependencies() {
		local missing=0

		# Check for gh CLI
		if ! command -v gh >/dev/null 2>&1; then
			echo "Error: 'gh' (GitHub CLI) is required but not installed." >&2
			echo "Please install it from: https://cli.github.com/" >&2
			missing=1
		fi

		# Check for jq
		if ! command -v jq >/dev/null 2>&1; then
			echo "Error: 'jq' is required but not installed." >&2
			echo "Please install it from: https://jqlang.github.io/jq/" >&2
			missing=1
		fi

		return "$missing"
	}

	# Work out which command prints a file's size in bytes and store it, with its
	# format flag, in STAT_CMD. Candidates are tried in a fixed order and the
	# first hit wins.
	STAT_CMD=""

	# 1. gstat: GNU stat as installed by coreutils on macOS (preferred for consistency)
	if command -v gstat &>/dev/null; then
		STAT_CMD="gstat -c %s"
	fi

	# 2. the system stat, if it understands the GNU format flag
	if [[ -z "$STAT_CMD" ]] && stat -c %s /dev/null &>/dev/null; then
		STAT_CMD="stat -c %s"
	fi

	# 3. the system stat, if it understands the BSD format flag
	if [[ -z "$STAT_CMD" ]] && stat -f %z /dev/null &>/dev/null; then
		STAT_CMD="stat -f %z"
	fi

	# Nothing usable: the progress bar cannot work, so give up now
	if [[ -z "$STAT_CMD" ]]; then
		echo "Error: Cannot find a compatible 'stat' command to determine file size." >&2
		echo "Please install GNU coreutils ('brew install coreutils' on macOS)." >&2
		exit 4 # dependency error
	fi

	# Parse command line arguments
	#
	# parse_args ARG...
	#   --file FILE | --file=FILE   set FILE_TO_SEARCH
	#   -h | --help                 print the usage text and exit 0
	#   --about                     print the tool description and exit 0
	#   owner/repo                  stored in REPO (only one is accepted)
	# Every usage error prints a message followed by the usage text on stderr
	# and exits with status 2.
	parse_args() {
		while [[ $# -gt 0 ]]; do
			case "$1" in
				--file)
					# A trailing "--file" used to make "shift 2" fail without
					# consuming anything, so this loop never terminated.
					if [[ $# -lt 2 || -z "$2" ]]; then
						echo "Error: --file requires a file name" >&2
						show_help >&2
						exit 2
					fi
					FILE_TO_SEARCH="$2"
					shift 2
					;;
				--file=*)
					if [[ -z "${1#--file=}" ]]; then
						echo "Error: --file requires a file name" >&2
						show_help >&2
						exit 2
					fi
					FILE_TO_SEARCH="${1#--file=}"
					shift
					;;
				# The alternation bar must be unescaped: "-h\|--help" only
				# matched the literal string "-h|--help".
				-h | --help)
					show_help
					exit 0
					;;
				--about)
					show_about
					exit 0
					;;
				-*)
					echo "Unknown option: $1" >&2
					show_help >&2
					exit 2
					;;
				*)
					if [[ -z "$REPO" ]]; then
						REPO="$1"
					else
						echo "Too many arguments: $1" >&2
						show_help >&2
						exit 2
					fi
					shift
					;;
			esac
		done
	}

	parse_args "$@"

	# Check all dependencies
	if ! check_dependencies; then
		exit 1 # dependency check failed
	fi

	if [[ -z "$REPO" ]]; then
		echo "Error: Repository argument required" >&2
		show_help >&2
		exit 2 # usage error
	fi

	echo "Fetching forks of $REPO..."

	# Get all forks into array (in-memory). The exit status is tested on the
	# assignment itself so nothing can run between the command and the check.
	if ! fork_data=$(gh api "repos/$REPO/forks" --paginate --jq '.[].full_name'); then
		echo -e "\033[31mError fetching forks\033[0m" >&2
		exit 3 # error fetching forks
	fi

	# A here-string always supplies at least one (possibly empty) line, so
	# reading empty output would yield a one-element array holding "". Only fill
	# the array when there is output, which keeps the zero-fork check reachable.
	forks=()
	if [[ -n "$fork_data" ]]; then
		readarray -t forks <<< "$fork_data"
	fi
	total_forks=${#forks[@]}

	if [[ $total_forks -eq 0 ]]; then
		echo "No forks found for $REPO."
		exit 0
	fi

	echo "Found $total_forks forks. Checking for $FILE_TO_SEARCH in parallel (max $MAX_JOBS jobs)..."

	# Warn about API limits for large fork counts: each fork costs one request,
	# plus a second one when the file is found (hence the factor of two).
	if [[ $total_forks -gt 1000 ]]; then
		echo -e "\033[33mWarning: Checking $total_forks forks will make ~$((total_forks * 2)) API requests, worst-case.\033[0m"
		echo -e "\033[33mGitHub's rate limit is 5000 requests/hour for authenticated users.\033[0m"
	fi

	# Create temporary files for progress and results in /tmp (for ramdisk performance).
	# mktemp creates each file with an unpredictable name, so another local user
	# cannot pre-create or symlink the path the way a fixed "..._$$" name allows.
	progress_file=""
	results_file=""

	# Cleanup temp files on exit. Installed before the files are created so that
	# a failure between the two mktemp calls still removes the first file.
	trap 'rm -f -- "$progress_file" "$results_file"' EXIT

	if ! progress_file=$(mktemp /tmp/find_forks_progress.XXXXXX) ||
		! results_file=$(mktemp /tmp/find_forks_results.XXXXXX); then
		echo "Error: could not create temporary files in /tmp" >&2
		exit 5 # temp file creation failed
	fi

	# Function to check a single fork and output result
	#
	# Arguments:
	#   $1 - fork, as owner/repo
	#   $2 - path of the file to look for
	#   $3 - results file; one "fork|url" line is appended when the file exists
	#   $4 - progress file; exactly one byte is appended per call, hit or miss
	# Uses gh for the API requests and jq to inspect the response.
	check_fork() {
		local fork="$1"
		local file_to_search="$2"
		local results_file_path="$3"
		local progress_file_path="$4"
		# Declared local so nothing leaks into (or is picked up from) the caller
		local api_response default_branch url

		# Ask for the path directly; gh exits non-zero when the request fails.
		# The response is fed to jq on stdin (the escaped "\|" handed "| jq ..."
		# to echo as plain arguments, so the type check always succeeded).
		if api_response=$(gh api "repos/$fork/contents/$file_to_search" 2>/dev/null) &&
			[[ -n "$api_response" ]] &&
			jq -e '.type == "file"' <<< "$api_response" >/dev/null 2>&1; then
			# Get default branch (second request, only made on a hit)
			default_branch=$(gh api "repos/$fork" --jq '.default_branch')
			# If that lookup fails, avoid emitting a broken ".../blob//file" URL.
			# NOTE(review): relies on GitHub resolving /blob/HEAD/ to the default
			# branch - confirm.
			[[ -n "$default_branch" ]] || default_branch="HEAD"
			url="https://github.com/$fork/blob/$default_branch/$file_to_search"
			# Output result to the results file, separated by a bare "|"
			printf '%s|%s\n' "$fork" "$url" >> "$results_file_path"
		fi

		# Signal completion by appending a single byte to the progress file
		printf '.' >> "$progress_file_path"
	}

	# Export function and variables for parallel execution
	export -f check_fork
	export FILE_TO_SEARCH
	export STAT_CMD

	# Run checks in parallel in the background, redirecting stdout/stderr of the sub-process.
	# The pipe must be a real "|": written as "\|" it was just another argument
	# to printf and xargs never ran.
	# -I {} already runs one command per input line, so the former "-n 1" is
	# dropped: GNU xargs treats -n and -I as mutually exclusive and prints a
	# warning on every run when both are given.
	# The fork name reaches bash -c as a positional argument and is never spliced
	# into the script text, so unusual names cannot be interpreted as code.
	# NOTE(review): some xargs builds may also complain that -s ARG_MAX exceeds
	# their own limit - confirm whether -s is needed at all with -I.
	printf '%s\n' "${forks[@]}" |
		xargs -s "$(getconf ARG_MAX)" -P "$MAX_JOBS" -I {} \
			bash -c 'check_fork "$@" >/dev/null 2>&1' _ {} "$FILE_TO_SEARCH" "$results_file" "$progress_file" &
	xargs_pid=$! # PID of xargs, the last command of the backgrounded pipeline

	# Progress bar
	# Width in characters. Set before the loop so the final 100% line still has a
	# width when the workers finish before the loop body ever runs.
	bar_len=40

	# draw_progress COMPLETED TOTAL
	# Redraws the progress line in place (leading \r, no trailing newline).
	# A TOTAL of 0 is shown as 100% to avoid dividing by zero, and COMPLETED is
	# capped at TOTAL so the bar can never exceed its width.
	draw_progress() {
		local completed=$1 total=$2
		local percent=100 filled_len bar empty

		if ((total > 0)); then
			if ((completed > total)); then
				completed=$total
			fi
			percent=$((completed * 100 / total))
		fi

		filled_len=$((bar_len * percent / 100))
		bar=$(printf '%*s' "$filled_len" '' | tr ' ' '#')
		empty=$(printf '%*s' "$((bar_len - filled_len))" '')

		printf "\rProgress: [%s%s] %d%% (%d/%d) " "$bar" "$empty" "$percent" "$completed" "$total"
	}

	echo # newline before progress bar
	# kill -0 only tests that the xargs process still exists
	while kill -0 "$xargs_pid" 2>/dev/null; do
		# One byte per finished fork, so the file size is the completed count.
		# "||" has to be a real operator here, not an escaped "\|\|".
		completed_count=$($STAT_CMD "$progress_file" 2>/dev/null || echo 0)
		# Ensure completed_count is a number, default to 0 if not
		[[ "$completed_count" =~ ^[0-9]+$ ]] || completed_count=0

		draw_progress "$completed_count" "$total_forks"
		sleep 0.2
	done

	# Ensure the progress bar shows 100% at the end and move to the next line
	printf "\rProgress: [%s] 100%% (%d/%d)\n" "$(printf '%*s' "$bar_len" '' | tr ' ' '#')" "$total_forks" "$total_forks"
	echo

	# Read results from the file (one "fork|url" line per hit)
	readarray -t results < "$results_file"

	echo "Search complete!"
	echo

	# print_result_entry LINE
	# Prints one "fork|url" record as a check-marked fork name, the indented
	# URL and a blank separator line. The record is split at the first "|"
	# only; IFS must be the bare character, since '\|' also split on backslash.
	print_result_entry() {
		local fork_name url
		IFS='|' read -r fork_name url <<< "$1"
		echo "✓ $fork_name"
		echo "  $url"
		echo
	}

	# Output summary
	if [[ ${#results[@]} -gt 0 ]]; then
		echo "Found $FILE_TO_SEARCH in ${#results[@]} out of $total_forks forks:"
		echo

		for result in "${results[@]}"; do
			print_result_entry "$result"
		done
	else
		echo -e "\033[31mNo forks with $FILE_TO_SEARCH found.\033[0m"
	fi