Last active
January 19, 2025 01:20
-
-
Save Xopoko/43233818f296850a70a7ae9576604a20 to your computer and use it in GitHub Desktop.
This script recursively collects source files from a specified directory, skips excluded folders, and combines their contents into a single block of text. The result is copied to the clipboard (and optionally written to an output file) so that project code can be shared with LLMs efficiently.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import os | |
import sys | |
import fnmatch | |
import argparse | |
import pyperclip | |
import re | |
# Make sure you have tiktoken installed if you intend to use --calculate-tokens | |
try: | |
import tiktoken | |
TIKTOKEN_AVAILABLE = True | |
except ImportError: | |
TIKTOKEN_AVAILABLE = False | |
# System prompt that collect_files() places in front of the collected output
# when its prepend_prompt_flag is set (CLI flag: --prepend-prompt).
PROMPT_TEXT = """
<system prompt>
YOU ARE AN EXPERT SOFTWARE DEVELOPER AND ARCHITECT WITH MASTER-LEVEL COMMAND OF INDUSTRY BEST PRACTICES. YOUR TASK IS TO GENERATE OPTIMIZED, PRODUCTION-READY CODE IN RESPONSE TO THE GIVEN TASK, PRODUCING FINAL CODE LISTINGS WITH NO EXPLANATIONS. ALL RESPONSES MUST ADHERE TO THE FOLLOWING:
<instructions>
- ONLY PROVIDE CODE LISTINGS FOR CHANGED FILES, OMITTING UNCHANGED FILES ENTIRELY.
- ENSURE THE CODE IS ERROR-FREE, HIGHLY OPTIMIZED, AND COMPLIANT WITH THE TASK SPECIFICATIONS.
- FOLLOW ESTABLISHED STANDARDS AND CONVENTIONS FOR THE SPECIFIC LANGUAGE OR FRAMEWORK.
- PROVIDE FULLY FUNCTIONAL CODE SEGMENTS THAT ARE IMMEDIATELY USABLE IN THE CONTEXT OF THE TASK.
<what not to do>
- DO NOT INCLUDE EXPLANATIONS, COMMENTS, OR NON-CODE TEXT IN YOUR RESPONSES.
- DO NOT LIST FILES THAT ARE UNCHANGED OR OUTSIDE THE SCOPE OF THE TASK.
- DO NOT PRODUCE INCOMPLETE OR NON-FUNCTIONAL CODE.
- NEVER DEVIATE FROM THE LANGUAGE, LIBRARY, OR FRAMEWORK SPECIFIED IN THE TASK.
<TASK>
"""
def remove_comments(content: str) -> str:
    """
    Strip comments from source text in a language-agnostic, heuristic way.

    Removes Python-style (# ...), C/Java/C++/Swift single-line (// ...) and
    block (/* ... */) comments, then drops trailing whitespace and blank lines.

    Comment markers that appear inside a single-line quoted string literal
    (e.g. "http://example.com" or '#fff') are left untouched, so URLs and
    colour codes are no longer truncated. This is still a heuristic: it does
    not understand multi-line string literals or language-specific uses of
    '#' such as C preprocessor directives.

    Args:
        content: Full text of a source file.

    Returns:
        The text with comments, trailing whitespace and blank lines removed.
    """
    # One left-to-right pass: whichever token starts first wins, so a quote
    # inside a comment is consumed by the comment and a comment marker inside
    # a string is consumed by the string.
    token_pattern = re.compile(
        r'"(?:\\.|[^"\\\n])*"'      # double-quoted string (single line)
        r"|'(?:\\.|[^'\\\n])*'"     # single-quoted string (single line)
        r'|/\*[\s\S]*?\*/'          # block comment, may span lines
        r'|//.*'                    # C-style line comment
        r'|#.*'                     # hash line comment
    )

    def _drop_comment(match):
        token = match.group(0)
        # Strings are kept verbatim; every other alternative is a comment.
        return token if token[0] in '"\'' else ''

    content = token_pattern.sub(_drop_comment, content)
    content = re.sub(r'[ \t]+$', '', content, flags=re.MULTILINE)
    content = re.sub(r'^\s*\n', '', content, flags=re.MULTILINE)
    return content
def remove_header_comments(content: str) -> str:
    """
    Drop the leading comment header of a file.

    Blank lines, '//' lines, '#' lines and '/* ... */' blocks are skipped
    until the first line that is none of those; that line and everything
    after it are returned unchanged. If no such line exists the result is
    an empty string.
    """
    all_lines = content.split('\n')
    inside_block = False
    for position, raw in enumerate(all_lines):
        text = raw.strip()
        if inside_block:
            # Stay inside the block comment until a line carries its terminator.
            inside_block = '*/' not in text
        elif text.startswith('/*'):
            # A block comment may open and close on the same line.
            inside_block = '*/' not in text
        elif text and not text.startswith(('//', '#')):
            # First real line: keep it and the remainder verbatim.
            return '\n'.join(all_lines[position:])
    return ''
def should_exclude(full_relative_path: str, exclude_patterns: list) -> bool:
    """
    Return True if the given path matches any exclusion pattern.

    A pattern matches when any of the following holds:
      * it fnmatch-es the whole relative path (the original behaviour);
      * with leading/trailing slashes removed it fnmatch-es the whole path,
        so gitignore-style entries such as "build/" or "/dist" work;
      * it contains no slash, is not anchored with a leading "/", and
        fnmatch-es any single path component, so "node_modules" or ".git"
        also exclude nested directories of that name.

    Args:
        full_relative_path: Path relative to the scan root, using "/" separators.
        exclude_patterns: Glob patterns (fnmatch syntax).

    Returns:
        True if the path should be skipped, False otherwise.
    """
    path_parts = full_relative_path.split("/")
    for pattern in exclude_patterns:
        if fnmatch.fnmatch(full_relative_path, pattern):
            return True
        bare = pattern.strip("/")
        if not bare:
            continue
        if fnmatch.fnmatch(full_relative_path, bare):
            return True
        # A leading "/" anchors the pattern to the root, so it must not be
        # tried against deeper components.
        anchored = pattern.startswith("/")
        if not anchored and "/" not in bare:
            if any(fnmatch.fnmatch(part, bare) for part in path_parts):
                return True
    return False
def should_include(full_relative_path: str, include_patterns: list) -> bool:
    """
    Decide whether a path passes the inclusion filter.

    With no inclusion patterns every path is accepted; otherwise the path
    must fnmatch at least one of them.
    """
    return not include_patterns or any(
        fnmatch.fnmatch(full_relative_path, glob) for glob in include_patterns
    )
def get_directory_tree(
    root_directory: str,
    current_path: str,
    prefix: str = "",
    exclude_list: list = None,
    include_list: list = None,
    extensions: list = None
) -> str:
    """
    Build an ASCII-style directory tree string for the provided path,
    applying exclude/include filters and extension checks.

    Directories that end up with no visible files are omitted. Symlinked
    directories are not followed and unreadable directories are skipped
    with a warning.

    Args:
        root_directory: Scan root; filter patterns are matched against paths
            relative to it.
        current_path: Directory whose contents are rendered.
        prefix: Text placed in front of every line of the result.
        exclude_list: Glob patterns for paths to leave out.
        include_list: Glob patterns a file must match (empty => all files).
        extensions: File-name suffixes a file must end with (empty => all).

    Returns:
        The rendered tree, one entry per line, or "" if nothing is visible.
    """
    if exclude_list is None:
        exclude_list = []
    if include_list is None:
        include_list = []
    if extensions is None:
        extensions = []

    if not os.path.isdir(current_path):
        return ""

    rel_dir = os.path.relpath(current_path, root_directory).replace("\\", "/")
    if should_exclude(rel_dir, exclude_list):
        return ""

    try:
        entries = sorted(os.listdir(current_path))
    except OSError as e:
        # An unreadable directory should not abort the whole run.
        print(f"[WARNING] Could not list directory {current_path}: {e}")
        return ""

    valid_children = []
    for entry in entries:
        full_path = os.path.join(current_path, entry)
        rel_path = os.path.relpath(full_path, root_directory).replace("\\", "/")
        if should_exclude(rel_path, exclude_list):
            continue
        if os.path.isdir(full_path):
            # Do not follow symlinked directories: a link cycle would recurse
            # forever, and os.walk (used to collect the files) does not
            # descend into them by default either.
            if os.path.islink(full_path):
                continue
            subtree = get_directory_tree(
                root_directory,
                full_path,
                prefix="",
                exclude_list=exclude_list,
                include_list=include_list,
                extensions=extensions
            )
            if subtree.strip():
                valid_children.append((entry, True, subtree))
        else:
            if not should_include(rel_path, include_list):
                continue
            if extensions and not any(entry.endswith(ext) for ext in extensions):
                continue
            valid_children.append((entry, False, ""))

    rendered = []
    last_index = len(valid_children) - 1
    for index, (name, is_dir, subtree_str) in enumerate(valid_children):
        is_last = index == last_index
        rendered.append(prefix + ("└── " if is_last else "├── ") + name + "\n")
        if is_dir:
            # Continuation prefixes are four columns wide so nested entries
            # line up under the four-column connectors above.
            child_prefix = prefix + ("    " if is_last else "│   ")
            rendered.extend(
                child_prefix + subline for subline in subtree_str.splitlines(True)
            )
    return "".join(rendered)
def format_file_content(relative_path: str, content: str, format_style: str) -> str:
    """
    Format one file's content according to the requested style.

    Styles:
        "none":   the raw content followed by a blank line.
        "md":     the path, then the content in a fenced code block.
        "pretty": "/<path>:" header, 80-dash rules and numbered lines.
        "xml":    the content wrapped in an element named after the path,
                  with a matching closing tag.
    Any other value falls back to the "md" layout.

    Args:
        relative_path: Path of the file relative to the scan root.
        content: Text of the file.
        format_style: One of the style names above.

    Returns:
        The formatted block, terminated by a blank line.
    """
    if format_style == "none":
        return content + "\n\n"
    if format_style == "pretty":
        rule = "-" * 80
        numbered_lines = [
            f"{i} | {line}" for i, line in enumerate(content.splitlines(), start=1)
        ]
        header = f"/{relative_path}:\n" + rule + "\n"
        body = "\n".join(numbered_lines) + "\n"
        footer = "\n" + rule + "\n\n"
        return header + body + footer
    if format_style == "xml":
        # Close the element and end with a blank line, like the other
        # styles, so consecutive files are clearly delimited.
        return f"<{relative_path}>\n{content}\n</{relative_path}>\n\n"
    # "md" and any unrecognised style share the fenced-block layout.
    return f"{relative_path}:\n```\n{content}\n```\n\n"
def parse_gitignore(gitignore_path: str) -> list:
    """
    Read a .gitignore file and return its pattern lines.

    Blank lines and lines starting with '#' are skipped; every other line is
    returned stripped of surrounding whitespace. A missing file yields an
    empty list. If reading fails part-way, a warning is printed and the
    patterns gathered so far are returned.
    """
    collected = []
    if os.path.isfile(gitignore_path):
        try:
            with open(gitignore_path, "r", encoding="utf-8") as handle:
                for raw_line in handle:
                    entry = raw_line.strip()
                    if entry and not entry.startswith("#"):
                        collected.append(entry)
        except Exception as e:
            print(f"[WARNING] Could not parse .gitignore: {e}")
    return collected
def collect_files(
    directory: str,
    output_file: str = None,
    extensions: list = None,
    exclude_list: list = None,
    include_list: list = None,
    remove_comments_flag: bool = False,
    remove_header_comments_flag: bool = False,
    add_file_structure_flag: bool = False,
    format_style: str = "md",
    calculate_tokens_flag: bool = False,
    prepend_prompt_flag: bool = False
) -> None:
    """
    Collect code from the specified directory and copy it to the clipboard.

    Files are visited in sorted order, filtered by the exclude/include
    patterns and extensions, optionally stripped of comments, formatted,
    and concatenated. The result is optionally written to a file and then
    copied to the clipboard.

    Args:
        directory: Directory to scan recursively.
        output_file: Optional path the result is also written to.
        extensions: File-name suffixes to keep (empty => all files).
        exclude_list: Glob patterns for paths to skip.
        include_list: Glob patterns a file must match (empty => all files).
        remove_comments_flag: Strip all comments from each file.
        remove_header_comments_flag: Strip only the leading comment header.
        add_file_structure_flag: Put an ASCII directory tree before the files.
        format_style: Per-file layout passed to format_file_content.
        calculate_tokens_flag: Print a tiktoken token count for the result.
        prepend_prompt_flag: Put PROMPT_TEXT in front of the result.
    """
    if exclude_list is None:
        exclude_list = []
    if include_list is None:
        include_list = []
    if extensions is None:
        extensions = []

    directory = os.path.abspath(directory)
    print(f"[INFO] Directory to scan: {directory}")
    print(f"[INFO] Excluded patterns: {exclude_list}")
    print(f"[INFO] Included patterns: {include_list}")
    print(f"[INFO] File extensions to include (empty => all files): {extensions}")
    print(f"[INFO] Remove all comments: {remove_comments_flag}")
    print(f"[INFO] Remove header comments only: {remove_header_comments_flag}")
    print(f"[INFO] Add file structure at start: {add_file_structure_flag}")
    print(f"[INFO] Format style: {format_style}")
    print(f"[INFO] Calculate tokens: {calculate_tokens_flag}")
    print(f"[INFO] Prepend prompt: {prepend_prompt_flag}")

    file_structure_content = ""
    if add_file_structure_flag:
        tree = get_directory_tree(
            root_directory=directory,
            current_path=directory,
            exclude_list=exclude_list,
            include_list=include_list,
            extensions=extensions
        )
        if tree.strip():
            file_structure_content = tree + "\n"

    if output_file:
        output_file = os.path.abspath(output_file)
        # Truncate up front so an unwritable path fails before any scanning.
        with open(output_file, "w", encoding="utf-8") as f:
            f.write("")
        print(f"[INFO] Output file {output_file} cleared.")
    else:
        print("[INFO] No output file specified, results will only be copied to clipboard.")

    collected_file_blocks = []
    for root, dirs, files in os.walk(directory):
        # Prune excluded directories in place so os.walk does not descend
        # into them; sort for an order that is stable across runs.
        dirs[:] = sorted(
            d for d in dirs
            if not should_exclude(
                os.path.relpath(os.path.join(root, d), directory).replace("\\", "/"),
                exclude_list
            )
        )
        for file in sorted(files):
            full_path_f = os.path.join(root, file)
            # The output file was just truncated; do not collect it as input
            # when it lives inside the scanned directory.
            if output_file and os.path.abspath(full_path_f) == output_file:
                continue
            rel_path_f = os.path.relpath(full_path_f, directory).replace("\\", "/")
            if should_exclude(rel_path_f, exclude_list):
                continue
            if not should_include(rel_path_f, include_list):
                continue
            if extensions and not any(file.endswith(ext) for ext in extensions):
                continue
            print(f"[INFO] Found file: {rel_path_f}")
            try:
                with open(full_path_f, "r", encoding="utf-8") as f:
                    content = f.read()
                if remove_header_comments_flag:
                    content = remove_header_comments(content)
                if remove_comments_flag:
                    content = remove_comments(content)
                formatted_block = format_file_content(rel_path_f, content, format_style)
                collected_file_blocks.append(formatted_block)
            except Exception as e:
                # Best effort: unreadable or non-UTF-8 files are reported and skipped.
                print(f"[ERROR] Failed to read file {full_path_f}: {e}")

    final_content = file_structure_content + ''.join(collected_file_blocks)
    if prepend_prompt_flag:
        final_content = PROMPT_TEXT + "\n\n" + final_content

    if output_file:
        try:
            with open(output_file, "a", encoding="utf-8") as out:
                out.write(final_content)
            print(f"[INFO] Collected file content written to {output_file}.")
        except Exception as e:
            print(f"[ERROR] Error writing to output file {output_file}: {e}")

    try:
        pyperclip.copy(final_content)
        print("[INFO] Collected file content copied to the clipboard.")
    except Exception as e:
        # A missing clipboard backend (e.g. a headless machine) must not
        # abort the run or skip the token count below.
        print(f"[ERROR] Failed to copy to clipboard: {e}")

    if calculate_tokens_flag:
        if not TIKTOKEN_AVAILABLE:
            print("[WARNING] tiktoken not installed, cannot calculate tokens.")
        else:
            try:
                enc = tiktoken.encoding_for_model("gpt-4o")
                tokens = enc.encode(final_content)
                print(f"[INFO] The final content uses {len(tokens)} tokens.")
            except Exception as e:
                print(f"[WARNING] Failed to calculate token count: {e}")
def _split_csv(raw_value: str) -> list:
    """Split a comma-separated CLI value into stripped, non-empty items."""
    if not raw_value:
        return []
    return [item.strip() for item in raw_value.split(",") if item.strip()]


def main() -> None:
    """
    Entry point of the script, setting up argument parsing and orchestrating
    the file collection process.

    Comma-separated options (--filetypes, --exclude, --include) are split
    with empty items dropped, so a stray or trailing comma cannot produce an
    empty extension (which every file name ends with) or an empty pattern.
    """
    parser = argparse.ArgumentParser(
        description="Collect code from files with given extensions, optionally remove comments,"
        " exclude or include certain paths, show file structure, and put the result"
        " into clipboard (and optionally a file)."
    )
    parser.add_argument("directory", help="Directory to scan")
    parser.add_argument("--output", default=None, help="Output file to save the result.")
    parser.add_argument(
        "--filetypes",
        help="Comma-separated list of file extensions to include (e.g. .swift,.py). If not provided, all filetypes are included.",
        default=""
    )
    parser.add_argument(
        "--exclude",
        help="Comma-separated list of glob patterns to exclude (files/folders). e.g. **/dist/**,.git,node_modules,*.md",
        default=".git,node_modules"
    )
    parser.add_argument(
        "--include",
        help="Comma-separated list of glob patterns to specifically include (files). e.g. Package.swift,Project.swift",
        default=""
    )
    parser.add_argument("--remove-comments", action="store_true", help="Remove all comments.")
    parser.add_argument("--remove-header-comments", action="store_true", help="Remove only header comments.")
    parser.add_argument("--add-file-structure", action="store_true", help="Add an ASCII-style file tree at the start.")
    parser.add_argument(
        "--format",
        choices=["none", "md", "pretty", "xml"],
        default="md",
        help="Output format: none, md, pretty, or xml. Defaults to md."
    )
    parser.add_argument(
        "--calculate-tokens",
        action="store_true",
        help="Calculate and display the number of tokens (requires tiktoken)."
    )
    parser.add_argument(
        "--use-gitignore",
        action="store_true",
        help="If set, parse .gitignore (if present) and apply those patterns to exclude."
    )
    parser.add_argument(
        "--prepend-prompt",
        action="store_true",
        help="Add the user prompt to the start of the result text."
    )
    args = parser.parse_args()

    directory = args.directory
    output_file = args.output
    extensions = _split_csv(args.filetypes)
    exclude_list = _split_csv(args.exclude)
    include_list = _split_csv(args.include)

    if args.use_gitignore:
        # Only the .gitignore at the top of the scanned directory is read.
        gitignore_path = os.path.join(directory, ".gitignore")
        gitignore_patterns = parse_gitignore(gitignore_path)
        if gitignore_patterns:
            print("[INFO] .gitignore found and parsed. Adding patterns to exclude_list.")
            exclude_list.extend(gitignore_patterns)
        else:
            print("[INFO] .gitignore not found or empty, proceeding without .gitignore patterns.")

    collect_files(
        directory=directory,
        output_file=output_file,
        extensions=extensions,
        exclude_list=exclude_list,
        include_list=include_list,
        remove_comments_flag=args.remove_comments,
        remove_header_comments_flag=args.remove_header_comments,
        add_file_structure_flag=args.add_file_structure,
        format_style=args.format,
        calculate_tokens_flag=args.calculate_tokens,
        prepend_prompt_flag=args.prepend_prompt
    )


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment