Created
August 28, 2024 20:51
-
-
Save BowTiedSwan/8ba1c4fcbf7d221eecf72f26e4f0d98d to your computer and use it in GitHub Desktop.
Repo to text file for LLM consumption
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from tqdm import tqdm | |
def get_readme_content(repo_path):
    """
    Retrieve the content of the repository's README.md file.

    Args:
        repo_path: Path of the repository's root directory.

    Returns:
        The text of ``<repo_path>/README.md``, or the string
        "README not found." when that file does not exist.
    """
    readme_path = os.path.join(repo_path, "README.md")
    try:
        # errors='replace' keeps a README containing a few non-UTF-8 bytes
        # from raising UnicodeDecodeError and aborting the whole export;
        # undecodable bytes become U+FFFD instead.
        with open(readme_path, 'r', encoding='utf-8', errors='replace') as file:
            return file.read()
    except FileNotFoundError:
        return "README not found."
def traverse_repo_iteratively(repo_path):
    """
    Build a textual listing of the repository's directories and files.

    The walk uses an explicit stack instead of recursion to avoid recursion
    limits for large repositories.

    Args:
        repo_path: Path of the repository's root directory.

    Returns:
        A string with one entry per line. Paths are relative to the
        repository root and start with "/"; directory entries end with "/".
        Ignored names and '.env' files are left out.
    """
    # Names skipped wherever they appear. os.listdir() yields bare names, so
    # variants containing a path separator could never match and are omitted.
    ignored_entries = frozenset({
        'venv', '.git', '.idea', 'repototext.py', 'node_modules',
        '.next', '.vscode', 'yarn.lock', '.yarn.lock',
    })
    lines = []
    dirs_to_visit = [(repo_path, "")]
    # Keyed on resolved paths so that a symlink pointing back into the tree
    # cannot make the walk loop forever.
    dirs_visited = {os.path.realpath(repo_path)}
    while dirs_to_visit:
        current_path, relative_path = dirs_to_visit.pop()
        # os.listdir() returns names in arbitrary order; sorting makes the
        # listing reproducible between runs and machines.
        entries = sorted(os.listdir(current_path))
        for entry in tqdm(entries, desc=f"Processing {relative_path}", leave=False):
            if entry in ignored_entries:
                continue
            full_path = os.path.join(current_path, entry)
            if os.path.isdir(full_path):
                resolved_path = os.path.realpath(full_path)
                if resolved_path in dirs_visited:
                    # Already listed through another path (symlink); skip it.
                    continue
                dirs_visited.add(resolved_path)
                lines.append(f"{relative_path}/{entry}/\n")
                dirs_to_visit.append((full_path, f"{relative_path}/{entry}"))
            elif entry != '.env':  # never expose the '.env' file
                lines.append(f"{relative_path}/{entry}\n")
    # Joining once avoids the repeated copying of string '+=' in a loop.
    return "".join(lines)
def get_file_contents_iteratively(repo_path):
    """
    Collect the text content of every file in the repository.

    The walk uses an explicit stack instead of recursion. For each file a
    "File: <relative path>" header is emitted, followed either by its
    content or by a note explaining why the content was skipped.

    Args:
        repo_path: Path of the repository's root directory.

    Returns:
        A single string containing one section per visited file. Ignored
        names and '.env' files produce no section at all.
    """
    # Same names the structure listing skips, so both outputs describe the
    # same set of files.
    ignored_entries = frozenset({
        'venv', '.git', '.idea', 'repototext.py', 'node_modules',
        '.next', '.vscode', 'yarn.lock', '.yarn.lock',
    })
    # File-name suffixes whose content is never read. Besides true binary
    # formats the list contains some text formats (e.g. '.svg', '.ini',
    # 'package-lock.json') that are deliberately left out of the dump.
    # Kept as a tuple so it can be passed straight to str.endswith().
    binary_extensions = (
        # Compiled executables and libraries
        '.exe', '.dll', '.so', '.a', '.lib', '.dylib', '.o', '.obj',
        # Compressed archives
        '.zip', '.tar', '.tar.gz', '.tgz', '.rar', '.7z', '.bz2', '.gz',
        '.xz', '.z', '.lz', '.lzma', '.lzo', '.rz', '.sz', '.dz',
        # Application-specific files
        '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
        '.odt', '.ods', '.odp',
        # Media files (less common)
        '.png', '.jpg', '.jpeg', '.gif', '.mp3', '.mp4', '.wav', '.flac',
        '.ogg', '.avi', '.mkv', '.mov', '.webm', '.wmv', '.m4a', '.aac',
        # Virtual machine and container images
        '.iso', '.vmdk', '.qcow2', '.vdi', '.vhd', '.vhdx', '.ova', '.ovf',
        # Database files
        '.db', '.sqlite', '.mdb', '.accdb', '.frm', '.ibd', '.dbf',
        # Java-related files
        '.jar', '.class', '.war', '.ear', '.jpi',
        # Python bytecode and packages
        '.pyc', '.pyo', '.pyd', '.egg', '.whl',
        # Other potentially important extensions
        '.deb', '.rpm', '.apk', '.msi', '.dmg', '.pkg', '.bin', '.dat', '.data',
        '.dump', '.img', '.toast', '.vcd', '.crx', '.xpi', '.lockb',
        'package-lock.json', '.svg',
        '.eot', '.otf', '.ttf', '.woff', '.woff2',
        '.ico', '.icns', '.cur',
        '.cab', '.dmp', '.msp', '.msm',
        '.keystore', '.jks', '.truststore', '.cer', '.crt', '.der',
        '.p7b', '.p7c', '.p12', '.pfx', '.pem', '.csr',
        '.key', '.pub', '.sig', '.pgp', '.gpg',
        '.nupkg', '.snupkg', '.appx', '.msix', '.msu',
        '.snap', '.flatpak', '.appimage',
        '.ko', '.sys', '.elf',
        '.swf', '.fla', '.swc',
        '.rlib', '.pdb', '.idb', '.dbg',
        '.sdf', '.bak', '.tmp', '.temp', '.log', '.tlog', '.ilk',
        '.bpl', '.dcu', '.dcp', '.dcpil', '.drc',
        '.aps', '.res', '.rsrc', '.rc', '.resx',
        '.prefs', '.properties', '.ini', '.cfg', '.config', '.conf',
        '.DS_Store', '.localized', '.svn', '.git', '.gitignore', '.gitkeep',
    )
    chunks = []
    dirs_to_visit = [(repo_path, "")]
    # Keyed on resolved paths so that a symlink pointing back into the tree
    # cannot make the walk loop forever.
    dirs_visited = {os.path.realpath(repo_path)}
    while dirs_to_visit:
        current_path, relative_path = dirs_to_visit.pop()
        # os.listdir() returns names in arbitrary order; sorting makes the
        # output reproducible between runs and machines.
        entries = sorted(os.listdir(current_path))
        for entry in tqdm(entries, desc=f"Downloading {relative_path}", leave=False):
            if entry in ignored_entries:
                continue
            full_path = os.path.join(current_path, entry)
            if os.path.isdir(full_path):
                resolved_path = os.path.realpath(full_path)
                if resolved_path not in dirs_visited:
                    dirs_visited.add(resolved_path)
                    dirs_to_visit.append((full_path, f"{relative_path}/{entry}"))
                continue
            if entry == '.env':
                continue  # never expose the '.env' file
            if entry.endswith(binary_extensions):
                chunks.append(f"File: {relative_path}/{entry}\nContent: Skipped binary file\n\n")
                continue
            chunks.append(f"File: {relative_path}/{entry}\n")
            try:
                with open(full_path, 'r', encoding='utf-8') as file:
                    text = file.read()
            except UnicodeDecodeError:
                chunks.append("Content: Skipped due to unsupported encoding or binary content\n\n")
            except OSError:
                # Dangling symlinks, permission problems and similar should
                # cost one file, not the whole export.
                chunks.append("Content: Skipped because the file could not be read\n\n")
            else:
                chunks.append("Content:\n" + text + "\n\n")
    # Joining once avoids the repeated copying of string '+=' in a loop.
    return "".join(chunks)
def get_local_repo_contents(repo_path):
    """
    Main function to get local repository contents.

    Prints progress messages while it gathers the README, the structure
    listing and the file contents, and builds the analysis prompt that
    precedes them in the output.

    Args:
        repo_path: Path of the repository's root directory.

    Returns:
        A tuple ``(repo_name, instructions, readme_content, repo_structure,
        file_contents)`` where every element is a string.
    """
    # abspath() normalises the path first, so inputs such as ".", "repo/" or
    # "repo/sub/.." yield the real directory name rather than "." or "".
    repo_name = os.path.basename(os.path.abspath(repo_path))

    print(f"Fetching README for: {repo_name}")
    readme_content = get_readme_content(repo_path)

    print(f"\nFetching repository structure for: {repo_name}")
    repo_structure = f"Repository Structure: {repo_name}\n"
    repo_structure += traverse_repo_iteratively(repo_path)

    print(f"\nFetching file contents for: {repo_name}")
    file_contents = get_file_contents_iteratively(repo_path)

    # The prompt is assembled from its parts in one join; the text itself is
    # emitted verbatim at the top of the generated file.
    instructions = "".join([
        f"Prompt: Analyze the {repo_name} repository to understand its structure, purpose, and functionality. Follow these steps to study the codebase:\n\n",
        "1. Read the README file to gain an overview of the project, its goals, and any setup instructions.\n\n",
        "2. Examine the repository structure to understand how the files and directories are organized.\n\n",
        "3. Identify the main entry point of the application (e.g., main.py, app.py, index.js) and start analyzing the code flow from there.\n\n",
        "4. Study the dependencies and libraries used in the project to understand the external tools and frameworks being utilized.\n\n",
        "5. Analyze the core functionality of the project by examining the key modules, classes, and functions.\n\n",
        "6. Look for any configuration files (e.g., config.py, .env) to understand how the project is configured and what settings are available.\n\n",
        "7. Investigate any tests or test directories to see how the project ensures code quality and handles different scenarios.\n\n",
        "8. Review any documentation or inline comments to gather insights into the codebase and its intended behavior.\n\n",
        "9. Identify any potential areas for improvement, optimization, or further exploration based on your analysis.\n\n",
        "10. Provide a summary of your findings, including the project's purpose, key features, and any notable observations or recommendations.\n\n",
        "Use the files and contents provided below to complete this analysis:\n\n",
    ])
    return repo_name, instructions, readme_content, repo_structure, file_contents
if __name__ == '__main__':
    # Script entry point: ask for a repository path and write the prompt,
    # README, structure listing and file contents to '<repo name>_contents.txt'
    # in the current working directory.
    raw_path = input("Please enter the path to the local repository: ")
    # Paths are often pasted with surrounding whitespace or quotes, or typed
    # with a leading '~'; none of those would otherwise resolve.
    repo_path = os.path.expanduser(raw_path.strip().strip('"\''))
    try:
        repo_name, instructions, readme_content, repo_structure, file_contents = get_local_repo_contents(repo_path)
        output_filename = f'{repo_name}_contents.txt'
        with open(output_filename, 'w', encoding='utf-8') as f:
            f.write(instructions)
            f.write(f"README:\n{readme_content}\n\n")
            f.write(repo_structure)
            f.write('\n\n')
            f.write(file_contents)
        print(f"Local repository contents saved to '{output_filename}'.")
    except Exception as e:
        # Top-level boundary: report the problem to the user, then exit with
        # a non-zero status so calling scripts can detect the failure.
        print(f"An error occurred: {e}")
        print("Please check the path to the local repository and try again.")
        raise SystemExit(1)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment