Skip to content

Instantly share code, notes, and snippets.

@Xopoko
Last active January 19, 2025 01:20
Show Gist options
  • Save Xopoko/43233818f296850a70a7ae9576604a20 to your computer and use it in GitHub Desktop.
Save Xopoko/43233818f296850a70a7ae9576604a20 to your computer and use it in GitHub Desktop.
This script collects source files from a specified directory (recursively), excludes certain folders, and combines their content into a single output file. It's designed for sharing project code with LLMs efficiently by copying the output to the clipboard.
#!/usr/bin/env python3
import os
import sys
import fnmatch
import argparse
import pyperclip
import re
# Make sure you have tiktoken installed if you intend to use --calculate-tokens
try:
import tiktoken
TIKTOKEN_AVAILABLE = True
except ImportError:
TIKTOKEN_AVAILABLE = False
# Instruction preamble for the receiving LLM. collect_files() places this text
# (followed by a blank line) before the collected content when its
# prepend_prompt_flag argument is true.
PROMPT_TEXT = """
<system prompt>
YOU ARE AN EXPERT SOFTWARE DEVELOPER AND ARCHITECT WITH MASTER-LEVEL COMMAND OF INDUSTRY BEST PRACTICES. YOUR TASK IS TO GENERATE OPTIMIZED, PRODUCTION-READY CODE IN RESPONSE TO THE GIVEN TASK, PRODUCING FINAL CODE LISTINGS WITH NO EXPLANATIONS. ALL RESPONSES MUST ADHERE TO THE FOLLOWING:
<instructions>
- ONLY PROVIDE CODE LISTINGS FOR CHANGED FILES, OMITTING UNCHANGED FILES ENTIRELY.
- ENSURE THE CODE IS ERROR-FREE, HIGHLY OPTIMIZED, AND COMPLIANT WITH THE TASK SPECIFICATIONS.
- FOLLOW ESTABLISHED STANDARDS AND CONVENTIONS FOR THE SPECIFIC LANGUAGE OR FRAMEWORK.
- PROVIDE FULLY FUNCTIONAL CODE SEGMENTS THAT ARE IMMEDIATELY USABLE IN THE CONTEXT OF THE TASK.
<what not to do>
- DO NOT INCLUDE EXPLANATIONS, COMMENTS, OR NON-CODE TEXT IN YOUR RESPONSES.
- DO NOT LIST FILES THAT ARE UNCHANGED OR OUTSIDE THE SCOPE OF THE TASK.
- DO NOT PRODUCE INCOMPLETE OR NON-FUNCTIONAL CODE.
- NEVER DEVIATE FROM THE LANGUAGE, LIBRARY, OR FRAMEWORK SPECIFIED IN THE TASK.
<TASK>
"""
def remove_comments(content: str) -> str:
    """
    Strip comments from source text while leaving string literals intact.

    Removes Python-style (# ...) and C-family single-line (// ...) comments
    as well as block comments (/* ... */). Afterwards trailing spaces/tabs
    are trimmed and lines that became blank are dropped.

    The text is scanned in a single left-to-right pass: quoted literals
    (single-, double- and triple-quoted) are matched first and copied through
    unchanged, so comment markers inside them (e.g. "http://host" or '#fff')
    are not mistaken for comments.

    Note: the scan is language-agnostic, so any unquoted '#' starts a comment
    (this includes shebang lines and C preprocessor directives).

    Args:
        content: Full text of a source file.

    Returns:
        The text with comments, trailing whitespace and blank lines removed.
    """
    token_re = re.compile(
        # Literals come first so they win over the comment alternatives.
        r'(?P<literal>'
        r'"""[\s\S]*?"""'
        r"|'''[\s\S]*?'''"
        r'|"(?:\\.|[^"\\\n])*"'
        r"|'(?:\\.|[^'\\\n])*'"
        r')'
        r'|/\*[\s\S]*?\*/'
        r'|//[^\n]*'
        r'|#[^\n]*'
    )
    # Keep literals verbatim; replace every comment match with nothing.
    content = token_re.sub(lambda m: m.group('literal') or '', content)
    content = re.sub(r'[ \t]+$', '', content, flags=re.MULTILINE)
    content = re.sub(r'^\s*\n', '', content, flags=re.MULTILINE)
    return content
def remove_header_comments(content: str) -> str:
    """
    Drop the leading comment header of a file.

    Lines are skipped from the top while they are blank, start with '//' or
    '#', or belong to a block comment opened by a line starting with '/*'.
    Everything from the first other line onward is returned untouched.

    Returns an empty string when the file holds nothing but header lines.
    """
    all_lines = content.split('\n')
    inside_block = False
    for position, raw in enumerate(all_lines):
        text = raw.strip()
        if inside_block:
            # Still within /* ... */; the closing line itself is dropped too.
            inside_block = '*/' not in text
            continue
        if text.startswith('/*'):
            # A block comment may open and close on the same line.
            inside_block = '*/' not in text
            continue
        if text == '' or text.startswith('//') or text.startswith('#'):
            continue
        # First real line found: keep it and the rest of the file verbatim.
        return '\n'.join(all_lines[position:])
    return ''
def should_exclude(full_relative_path: str, exclude_patterns: list) -> bool:
    """
    Return True if the given relative path matches any exclusion pattern.

    Every pattern is first tried as a plain fnmatch glob against the whole
    path. In addition, a subset of .gitignore conventions is honoured so that
    patterns such as "node_modules", "build/" or "/dist" behave as expected:

    - A pattern without an inner slash (after trimming a trailing "/") is
      matched against every component of the path, so "node_modules" also
      excludes "pkg/node_modules".
    - A pattern with a leading or inner slash is anchored at the scan root
      and matches that path itself or anything below it.

    Negation ("!pattern") and gitignore's special "**" rules are not
    interpreted. A trailing "/" does not restrict the match to directories.

    Args:
        full_relative_path: Path relative to the scan root ("/" or "\\"
            separated).
        exclude_patterns: Glob patterns to test.

    Returns:
        True when at least one pattern matches, otherwise False.
    """
    path = full_relative_path.replace("\\", "/")
    components = path.split("/")
    for pattern in exclude_patterns:
        if fnmatch.fnmatch(full_relative_path, pattern):
            return True
        trimmed = pattern.strip("/")
        if not trimmed:
            continue
        if pattern.startswith("/") or "/" in trimmed:
            # Anchored pattern: the entry itself or anything underneath it.
            if fnmatch.fnmatch(path, trimmed) or fnmatch.fnmatch(path, trimmed + "/*"):
                return True
        elif any(fnmatch.fnmatch(part, trimmed) for part in components):
            return True
    return False
def should_include(full_relative_path: str, include_patterns: list) -> bool:
    """
    Decide whether a path passes the inclusion filter.

    An empty pattern list means "no filter", so every path is accepted.
    Otherwise the path must match at least one fnmatch glob in the list.
    """
    if not include_patterns:
        return True
    return any(
        fnmatch.fnmatch(full_relative_path, glob) for glob in include_patterns
    )
def get_directory_tree(
    root_directory: str,
    current_path: str,
    prefix: str = "",
    exclude_list: list = None,
    include_list: list = None,
    extensions: list = None
) -> str:
    """
    Build an ASCII-style directory tree string for the provided path.

    Entries are listed in sorted order. Excluded paths are skipped, files
    must pass the include patterns and (if given) end with one of the
    extensions, and directories that end up with no visible content are
    omitted entirely.

    Args:
        root_directory: Scan root; relative paths used for pattern matching
            are computed against it.
        current_path: Directory whose content is rendered.
        prefix: Text put in front of every produced line.
        exclude_list: Glob patterns passed to should_exclude().
        include_list: Glob patterns passed to should_include().
        extensions: Accepted file name suffixes; empty means all files.

    Returns:
        The rendered tree (one entry per line, each ending in a newline), or
        an empty string if the path is not a directory, is excluded, cannot
        be listed, or has no visible entries.
    """
    exclude_list = exclude_list or []
    include_list = include_list or []
    extensions = extensions or []
    if not os.path.isdir(current_path):
        return ""
    rel_dir = os.path.relpath(current_path, root_directory).replace("\\", "/")
    # The scan root itself is "."; never test it against the patterns,
    # otherwise a pattern such as ".*" would blank out the whole tree.
    if rel_dir != "." and should_exclude(rel_dir, exclude_list):
        return ""
    try:
        entries = sorted(os.listdir(current_path))
    except OSError:
        # Unreadable directory: leave it out instead of aborting the run.
        return ""
    suffixes = tuple(extensions)
    valid_children = []
    for entry in entries:
        full_path = os.path.join(current_path, entry)
        rel_path = os.path.relpath(full_path, root_directory).replace("\\", "/")
        if should_exclude(rel_path, exclude_list):
            continue
        if os.path.isdir(full_path):
            # Render children without a prefix; it is applied line by line
            # below once this entry's position among its siblings is known.
            subtree = get_directory_tree(
                root_directory,
                full_path,
                prefix="",
                exclude_list=exclude_list,
                include_list=include_list,
                extensions=extensions
            )
            if subtree.strip():
                valid_children.append((entry, True, subtree))
        else:
            if not should_include(rel_path, include_list):
                continue
            if suffixes and not entry.endswith(suffixes):
                continue
            valid_children.append((entry, False, ""))
    if not valid_children:
        return ""
    tree_lines = []
    last_index = len(valid_children) - 1
    for index, (name, is_dir, subtree_str) in enumerate(valid_children):
        is_last = index == last_index
        tree_lines.append(prefix + ("└── " if is_last else "├── ") + name + "\n")
        if is_dir:
            # Continuation indent is 4 columns wide to line up with the
            # 4-column connectors above.
            extension_prefix = prefix + ("    " if is_last else "│   ")
            tree_lines.extend(
                extension_prefix + subline
                for subline in subtree_str.splitlines(True)
            )
    return "".join(tree_lines)
def format_file_content(relative_path: str, content: str, format_style: str) -> str:
    """
    Format one file's content according to the requested style.

    Styles:
        none   - the raw content followed by a blank line.
        md     - "<path>:" followed by a fenced code block. The fence is made
                 longer than any backtick run inside the content so that
                 files containing their own fences do not end the block early.
        pretty - "/<path>:" header, an 80-dash rule, 1-based numbered lines
                 and a closing rule.
        xml    - content wrapped in <path> ... </path> tags.

    Any other style value falls back to the md layout.

    Args:
        relative_path: Path shown as the block title / tag name.
        content: File text to embed.
        format_style: One of "none", "md", "pretty", "xml".

    Returns:
        The formatted block, always terminated by a blank line so blocks can
        be concatenated directly.
    """
    if format_style == "none":
        return content + "\n\n"
    if format_style == "pretty":
        rule = "-" * 80
        numbered = "\n".join(
            f"{number} | {line}"
            for number, line in enumerate(content.splitlines(), start=1)
        )
        return f"/{relative_path}:\n{rule}\n{numbered}\n\n{rule}\n\n"
    if format_style == "xml":
        # Close the tag and add a separator so consecutive files are delimited.
        return f"<{relative_path}>\n{content}\n</{relative_path}>\n\n"
    # "md" and unknown styles: use a fence one backtick longer than the
    # longest run of three or more backticks in the content (minimum three).
    longest_run = max((len(run) for run in re.findall(r"`{3,}", content)), default=2)
    fence = "`" * (longest_run + 1)
    return f"{relative_path}:\n{fence}\n{content}\n{fence}\n\n"
def parse_gitignore(gitignore_path: str) -> list:
    """
    Read a .gitignore file and return its patterns.

    Blank lines and lines starting with '#' are ignored; every other line is
    returned with surrounding whitespace stripped. A missing file yields an
    empty list. If reading fails, a warning is printed and whatever was
    gathered up to that point is returned.
    """
    if not os.path.isfile(gitignore_path):
        return []
    collected = []
    try:
        with open(gitignore_path, "r", encoding="utf-8") as handle:
            for raw_line in handle:
                entry = raw_line.strip()
                if entry and not entry.startswith("#"):
                    collected.append(entry)
    except Exception as e:
        print(f"[WARNING] Could not parse .gitignore: {e}")
    return collected
def collect_files(
    directory: str,
    output_file: str = None,
    extensions: list = None,
    exclude_list: list = None,
    include_list: list = None,
    remove_comments_flag: bool = False,
    remove_header_comments_flag: bool = False,
    add_file_structure_flag: bool = False,
    format_style: str = "md",
    calculate_tokens_flag: bool = False,
    prepend_prompt_flag: bool = False
) -> None:
    """
    Collect file contents below a directory into one text and publish it.

    The directory is walked recursively (in sorted order, so the output is
    reproducible). Excluded directories are pruned; files must pass the
    exclude/include patterns and the extension filter. Each accepted file is
    read as UTF-8, optionally stripped of comments, and formatted with
    format_file_content(). The combined text is written to output_file (if
    given) and copied to the clipboard.

    Args:
        directory: Directory to scan.
        output_file: Optional path the result is written to. The file is
            truncated up front and is never collected as an input itself.
        extensions: Accepted file name suffixes; empty/None means all files.
        exclude_list: Glob patterns for paths to skip.
        include_list: Glob patterns files must match; empty/None means all.
        remove_comments_flag: Strip all comments from each file.
        remove_header_comments_flag: Strip only leading header comments.
        add_file_structure_flag: Put an ASCII directory tree at the start.
        format_style: Block style: "none", "md", "pretty" or "xml".
        calculate_tokens_flag: Print a token count (needs tiktoken).
        prepend_prompt_flag: Put PROMPT_TEXT in front of the result.

    Raises:
        OSError: If output_file cannot be opened for the initial truncation.
    """
    if exclude_list is None:
        exclude_list = []
    if include_list is None:
        include_list = []
    if extensions is None:
        extensions = []
    directory = os.path.abspath(directory)
    print(f"[INFO] Directory to scan: {directory}")
    print(f"[INFO] Excluded patterns: {exclude_list}")
    print(f"[INFO] Included patterns: {include_list}")
    print(f"[INFO] File extensions to include (empty => all files): {extensions}")
    print(f"[INFO] Remove all comments: {remove_comments_flag}")
    print(f"[INFO] Remove header comments only: {remove_header_comments_flag}")
    print(f"[INFO] Add file structure at start: {add_file_structure_flag}")
    print(f"[INFO] Format style: {format_style}")
    print(f"[INFO] Calculate tokens: {calculate_tokens_flag}")
    print(f"[INFO] Prepend prompt: {prepend_prompt_flag}")
    file_structure_content = ""
    if add_file_structure_flag:
        tree = get_directory_tree(
            root_directory=directory,
            current_path=directory,
            exclude_list=exclude_list,
            include_list=include_list,
            extensions=extensions
        )
        if tree.strip():
            file_structure_content = tree + "\n"
    if output_file:
        output_file = os.path.abspath(output_file)
        # Truncate early so an unwritable target fails before any scanning.
        with open(output_file, "w", encoding="utf-8") as f:
            f.write("")
        print(f"[INFO] Output file {output_file} cleared.")
    else:
        print("[INFO] No output file specified, results will only be copied to clipboard.")
    suffixes = tuple(extensions)
    collected_file_blocks = []
    for root, dirs, files in os.walk(directory):
        # Prune excluded directories in place so os.walk does not descend
        # into them; sorting keeps the traversal order deterministic.
        dirs[:] = sorted(
            d for d in dirs
            if not should_exclude(
                os.path.relpath(os.path.join(root, d), directory).replace("\\", "/"),
                exclude_list
            )
        )
        for file in sorted(files):
            full_path_f = os.path.join(root, file)
            # The (just truncated) output file may live inside the scanned
            # directory; it must not be collected as an input.
            if output_file and os.path.normcase(full_path_f) == os.path.normcase(output_file):
                continue
            rel_path_f = os.path.relpath(full_path_f, directory).replace("\\", "/")
            if should_exclude(rel_path_f, exclude_list):
                continue
            if not should_include(rel_path_f, include_list):
                continue
            if suffixes and not file.endswith(suffixes):
                continue
            print(f"[INFO] Found file: {rel_path_f}")
            try:
                with open(full_path_f, "r", encoding="utf-8") as f:
                    content = f.read()
            except (OSError, UnicodeDecodeError) as e:
                # Unreadable or non-UTF-8 (e.g. binary) file: report and skip.
                print(f"[ERROR] Failed to read file {full_path_f}: {e}")
                continue
            if remove_header_comments_flag:
                content = remove_header_comments(content)
            if remove_comments_flag:
                content = remove_comments(content)
            collected_file_blocks.append(
                format_file_content(rel_path_f, content, format_style)
            )
    final_content = file_structure_content + ''.join(collected_file_blocks)
    if prepend_prompt_flag:
        final_content = PROMPT_TEXT + "\n\n" + final_content
    if output_file:
        try:
            with open(output_file, "a", encoding="utf-8") as out:
                out.write(final_content)
            print(f"[INFO] Collected file content written to {output_file}.")
        except OSError as e:
            print(f"[ERROR] Error writing to output file {output_file}: {e}")
    try:
        pyperclip.copy(final_content)
    except pyperclip.PyperclipException as e:
        # No clipboard mechanism available (e.g. headless machine); the
        # collected content may still have been written to the output file.
        print(f"[WARNING] Could not copy to the clipboard: {e}")
    else:
        print("[INFO] Collected file content copied to the clipboard.")
    if calculate_tokens_flag:
        if not TIKTOKEN_AVAILABLE:
            print("[WARNING] tiktoken not installed, cannot calculate tokens.")
        else:
            try:
                enc = tiktoken.encoding_for_model("gpt-4o")
                tokens = enc.encode(final_content)
                print(f"[INFO] The final content uses {len(tokens)} tokens.")
            except Exception as e:
                # Best-effort statistic: never fail the run over token counting.
                print(f"[WARNING] Failed to calculate token count: {e}")
def _split_csv(raw: str) -> list:
    """Split a comma-separated option value into stripped, non-empty items."""
    if not raw:
        return []
    return [item.strip() for item in raw.split(",") if item.strip()]


def main() -> None:
    """
    Entry point of the script: parse the command line and run the collection.

    Comma-separated options (--filetypes, --exclude, --include) are split
    into lists with blank items dropped; a stray comma would otherwise yield
    an empty extension, which every file name ends with and which would
    silently disable the file type filter. When --use-gitignore is set, the
    patterns from <directory>/.gitignore are appended to the exclusions.

    Exits with an argparse usage error if the directory does not exist.
    """
    parser = argparse.ArgumentParser(
        description="Collect code from files with given extensions, optionally remove comments,"
        " exclude or include certain paths, show file structure, and put the result"
        " into clipboard (and optionally a file)."
    )
    parser.add_argument("directory", help="Directory to scan")
    parser.add_argument("--output", default=None, help="Output file to save the result.")
    parser.add_argument(
        "--filetypes",
        help="Comma-separated list of file extensions to include (e.g. .swift,.py). If not provided, all filetypes are included.",
        default=""
    )
    parser.add_argument(
        "--exclude",
        help="Comma-separated list of glob patterns to exclude (files/folders). e.g. **/dist/**,.git,node_modules,*.md",
        default=".git,node_modules"
    )
    parser.add_argument(
        "--include",
        help="Comma-separated list of glob patterns to specifically include (files). e.g. Package.swift,Project.swift",
        default=""
    )
    parser.add_argument("--remove-comments", action="store_true", help="Remove all comments.")
    parser.add_argument("--remove-header-comments", action="store_true", help="Remove only header comments.")
    parser.add_argument("--add-file-structure", action="store_true", help="Add an ASCII-style file tree at the start.")
    parser.add_argument(
        "--format",
        choices=["none", "md", "pretty", "xml"],
        default="md",
        help="Output format: none, md, pretty, or xml. Defaults to md."
    )
    parser.add_argument(
        "--calculate-tokens",
        action="store_true",
        help="Calculate and display the number of tokens (requires tiktoken)."
    )
    parser.add_argument(
        "--use-gitignore",
        action="store_true",
        help="If set, parse .gitignore (if present) and apply those patterns to exclude."
    )
    parser.add_argument(
        "--prepend-prompt",
        action="store_true",
        help="Add the user prompt to the start of the result text."
    )
    args = parser.parse_args()
    directory = args.directory
    if not os.path.isdir(directory):
        # Fail loudly instead of silently producing an empty result.
        parser.error(f"Directory not found: {directory}")
    extensions = _split_csv(args.filetypes)
    exclude_list = _split_csv(args.exclude)
    include_list = _split_csv(args.include)
    if args.use_gitignore:
        gitignore_path = os.path.join(directory, ".gitignore")
        gitignore_patterns = parse_gitignore(gitignore_path)
        if gitignore_patterns:
            print("[INFO] .gitignore found and parsed. Adding patterns to exclude_list.")
            exclude_list.extend(gitignore_patterns)
        else:
            print("[INFO] .gitignore not found or empty, proceeding without .gitignore patterns.")
    collect_files(
        directory=directory,
        output_file=args.output,
        extensions=extensions,
        exclude_list=exclude_list,
        include_list=include_list,
        remove_comments_flag=args.remove_comments,
        remove_header_comments_flag=args.remove_header_comments,
        add_file_structure_flag=args.add_file_structure,
        format_style=args.format,
        calculate_tokens_flag=args.calculate_tokens,
        prepend_prompt_flag=args.prepend_prompt
    )
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment