Skip to content

Instantly share code, notes, and snippets.

@BowTiedSwan
Created August 28, 2024 20:51
Show Gist options
  • Save BowTiedSwan/8ba1c4fcbf7d221eecf72f26e4f0d98d to your computer and use it in GitHub Desktop.
Save BowTiedSwan/8ba1c4fcbf7d221eecf72f26e4f0d98d to your computer and use it in GitHub Desktop.
Repo to text file for LLM consumption
import os
from tqdm import tqdm
def get_readme_content(repo_path):
    """
    Retrieve the content of the repository's README file.

    ``README.md`` is tried first; when it is absent, a few other common
    README file names are tried in order.

    Args:
        repo_path: Path to the repository root directory.

    Returns:
        The text of the first README candidate found, or the string
        ``"README not found."`` when none of the candidates is a file.
    """
    candidate_names = (
        "README.md",
        "readme.md",
        "Readme.md",
        "README.markdown",
        "README.rst",
        "README.txt",
        "README",
    )
    for candidate in candidate_names:
        readme_path = os.path.join(repo_path, candidate)
        # isfile() also rejects a directory that happens to be named like a README.
        if not os.path.isfile(readme_path):
            continue
        # errors='replace' keeps a README with stray non-UTF-8 bytes from aborting the run.
        with open(readme_path, 'r', encoding='utf-8', errors='replace') as file:
            return file.read()
    return "README not found."
def traverse_repo_iteratively(repo_path):
    """
    Build a textual listing of the repository's directories and files.

    The walk uses an explicit stack instead of recursion, so deep trees do
    not hit Python's recursion limit.

    Args:
        repo_path: Path to the repository root directory.

    Returns:
        A string with one entry per line. Each entry is the path relative
        to ``repo_path`` prefixed with "/", and directory entries end
        with "/". Ignored names and ``.env`` files are omitted.
    """
    # os.listdir() yields bare names, so entries containing "/" could never
    # match and are not listed here.
    ignored_names = {
        'venv', '.git', '.idea', 'repototext.py', 'node_modules', '.next',
        '.vscode', 'yarn.lock', '.yarn.lock',
    }
    listing = []  # joined once at the end instead of repeated string +=
    dirs_to_visit = [(repo_path, "")]
    # Track symlink-resolved paths: a symlink pointing back at an ancestor
    # produces a new, longer path string on every hop, so comparing the
    # unresolved strings would never detect the cycle.
    dirs_visited = {os.path.realpath(repo_path)}
    while dirs_to_visit:
        current_path, relative_path = dirs_to_visit.pop()
        for entry in tqdm(os.listdir(current_path), desc=f"Processing {relative_path}", leave=False):
            if entry in ignored_names:
                continue
            full_path = os.path.join(current_path, entry)
            if os.path.isdir(full_path):
                resolved_path = os.path.realpath(full_path)
                if resolved_path in dirs_visited:
                    continue  # already listed via another path (or a symlink loop)
                dirs_visited.add(resolved_path)
                listing.append(f"{relative_path}/{entry}/\n")
                dirs_to_visit.append((full_path, f"{relative_path}/{entry}"))
            elif entry != '.env':  # never expose the '.env' file
                listing.append(f"{relative_path}/{entry}\n")
    return "".join(listing)
def get_file_contents_iteratively(repo_path):
    """
    Collect the text content of every file in the repository into one string.

    The walk uses an explicit stack instead of recursion. Files whose name
    ends with one of the skip suffixes are listed but not read, and files
    that cannot be decoded as UTF-8 or cannot be opened are listed with a
    short note instead of their content.

    Args:
        repo_path: Path to the repository root directory.

    Returns:
        A string made of "File: <relative path>" records, each followed by
        the file's content or by a note explaining why it was skipped.
    """
    # Same names the structure listing ignores, so both views of the
    # repository agree. os.listdir() yields bare names, so entries
    # containing "/" could never match and are not listed here.
    ignored_names = {
        'venv', '.git', '.idea', 'repototext.py', 'node_modules', '.next',
        '.vscode', 'yarn.lock', '.yarn.lock',
    }
    # Name suffixes whose content is not dumped. Despite the name, this also
    # covers some text formats (logs, ini/conf files, SVG, lock files).
    # Kept as a tuple so it can be passed straight to str.endswith().
    binary_extensions = (
        # Compiled executables and libraries
        '.exe', '.dll', '.so', '.a', '.lib', '.dylib', '.o', '.obj',
        # Compressed archives
        '.zip', '.tar', '.tar.gz', '.tgz', '.rar', '.7z', '.bz2', '.gz', '.xz', '.z', '.lz', '.lzma', '.lzo', '.rz', '.sz', '.dz',
        # Application-specific files
        '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt', '.ods', '.odp',
        # Media files (less common)
        '.png', '.jpg', '.jpeg', '.gif', '.mp3', '.mp4', '.wav', '.flac', '.ogg', '.avi', '.mkv', '.mov', '.webm', '.wmv', '.m4a', '.aac',
        # Virtual machine and container images
        '.iso', '.vmdk', '.qcow2', '.vdi', '.vhd', '.vhdx', '.ova', '.ovf',
        # Database files
        '.db', '.sqlite', '.mdb', '.accdb', '.frm', '.ibd', '.dbf',
        # Java-related files
        '.jar', '.class', '.war', '.ear', '.jpi',
        # Python bytecode and packages
        '.pyc', '.pyo', '.pyd', '.egg', '.whl',
        # Other potentially important extensions
        '.deb', '.rpm', '.apk', '.msi', '.dmg', '.pkg', '.bin', '.dat', '.data',
        '.dump', '.img', '.toast', '.vcd', '.crx', '.xpi', '.lockb', 'package-lock.json', '.svg',
        '.eot', '.otf', '.ttf', '.woff', '.woff2',
        '.ico', '.icns', '.cur',
        '.cab', '.dmp', '.msp', '.msm',
        '.keystore', '.jks', '.truststore', '.cer', '.crt', '.der', '.p7b', '.p7c', '.p12', '.pfx', '.pem', '.csr',
        '.key', '.pub', '.sig', '.pgp', '.gpg',
        '.nupkg', '.snupkg', '.appx', '.msix', '.msu',
        '.snap', '.flatpak', '.appimage',
        '.ko', '.sys', '.elf',
        '.swf', '.fla', '.swc',
        '.rlib', '.pdb', '.idb', '.dbg',
        '.sdf', '.bak', '.tmp', '.temp', '.log', '.tlog', '.ilk',
        '.bpl', '.dcu', '.dcp', '.dcpil', '.drc',
        '.aps', '.res', '.rsrc', '.rc', '.resx',
        '.prefs', '.properties', '.ini', '.cfg', '.config', '.conf',
        '.DS_Store', '.localized', '.svn', '.git', '.gitignore', '.gitkeep',
    )
    chunks = []  # joined once at the end; repeated += on large file bodies is quadratic
    dirs_to_visit = [(repo_path, "")]
    # Track symlink-resolved paths: a symlink pointing back at an ancestor
    # produces a new, longer path string on every hop, so comparing the
    # unresolved strings would never detect the cycle.
    dirs_visited = {os.path.realpath(repo_path)}
    while dirs_to_visit:
        current_path, relative_path = dirs_to_visit.pop()
        for entry in tqdm(os.listdir(current_path), desc=f"Downloading {relative_path}", leave=False):
            if entry in ignored_names:
                continue
            full_path = os.path.join(current_path, entry)
            if os.path.isdir(full_path):
                resolved_path = os.path.realpath(full_path)
                if resolved_path not in dirs_visited:
                    dirs_visited.add(resolved_path)
                    dirs_to_visit.append((full_path, f"{relative_path}/{entry}"))
                continue
            if entry == '.env':
                continue  # never dump the '.env' file
            if entry.endswith(binary_extensions):
                chunks.append(f"File: {relative_path}/{entry}\nContent: Skipped binary file\n\n")
                continue
            chunks.append(f"File: {relative_path}/{entry}\n")
            try:
                with open(full_path, 'r', encoding='utf-8') as file:
                    chunks.append("Content:\n" + file.read() + "\n\n")
            except UnicodeDecodeError:
                chunks.append("Content: Skipped due to unsupported encoding or binary content\n\n")
            except OSError as exc:
                # Broken symlinks, permission problems, etc. should not abort the whole dump.
                chunks.append(f"Content: Skipped because the file could not be read ({exc})\n\n")
    return "".join(chunks)
def get_local_repo_contents(repo_path):
    """
    Main function to get local repository contents.

    Gathers the README, the directory listing and the file contents of the
    repository, together with a fixed analysis prompt.

    Args:
        repo_path: Path to the local repository root directory.

    Returns:
        A 5-tuple ``(repo_name, instructions, readme_content,
        repo_structure, file_contents)``, all strings.
    """
    # abspath() normalizes trailing separators and resolves "." / "..", so a
    # relative path such as "." still yields the real directory name.
    repo_name = os.path.basename(os.path.abspath(repo_path))

    print(f"Fetching README for: {repo_name}")
    readme_content = get_readme_content(repo_path)

    print(f"\nFetching repository structure for: {repo_name}")
    repo_structure = f"Repository Structure: {repo_name}\n" + traverse_repo_iteratively(repo_path)

    print(f"\nFetching file contents for: {repo_name}")
    file_contents = get_file_contents_iteratively(repo_path)

    # The prompt placed at the top of the generated text file.
    instructions = "".join([
        f"Prompt: Analyze the {repo_name} repository to understand its structure, purpose, and functionality. Follow these steps to study the codebase:\n\n",
        "1. Read the README file to gain an overview of the project, its goals, and any setup instructions.\n\n",
        "2. Examine the repository structure to understand how the files and directories are organized.\n\n",
        "3. Identify the main entry point of the application (e.g., main.py, app.py, index.js) and start analyzing the code flow from there.\n\n",
        "4. Study the dependencies and libraries used in the project to understand the external tools and frameworks being utilized.\n\n",
        "5. Analyze the core functionality of the project by examining the key modules, classes, and functions.\n\n",
        "6. Look for any configuration files (e.g., config.py, .env) to understand how the project is configured and what settings are available.\n\n",
        "7. Investigate any tests or test directories to see how the project ensures code quality and handles different scenarios.\n\n",
        "8. Review any documentation or inline comments to gather insights into the codebase and its intended behavior.\n\n",
        "9. Identify any potential areas for improvement, optimization, or further exploration based on your analysis.\n\n",
        "10. Provide a summary of your findings, including the project's purpose, key features, and any notable observations or recommendations.\n\n",
        "Use the files and contents provided below to complete this analysis:\n\n",
    ])
    return repo_name, instructions, readme_content, repo_structure, file_contents
if __name__ == '__main__':
    # Script entry point: ask for a repository path and write the combined
    # prompt, README, structure listing and file contents to a text file.
    raw_path = input("Please enter the path to the local repository: ")
    # Pasted paths often carry stray whitespace, and "~" is not expanded by
    # the OS-level file functions, so normalize both before use.
    repo_path = os.path.expanduser(raw_path.strip())
    try:
        repo_name, instructions, readme_content, repo_structure, file_contents = get_local_repo_contents(repo_path)
        # NOTE: the relative filename means the output lands in the current
        # working directory, not necessarily inside the repository.
        output_filename = f'{repo_name}_contents.txt'
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(instructions)
            f.write(f"README:\n{readme_content}\n\n")
            f.write(repo_structure)
            f.write('\n\n')
            f.write(file_contents)
        print(f"Local repository contents saved to '{output_filename}'.")
    except Exception as e:
        # Top-level boundary: report any failure to the user instead of a traceback.
        print(f"An error occurred: {e}")
        print("Please check the path to the local repository and try again.")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment