#!/usr/bin/env python3
"""
gather_code.py - Export a repo's text source to one file for ChatGPT.

Highlights
----------
* .gitignore-aware walk, skips binaries and huge files
* Parallel reading with automatic encoding detection
* Chunk sizes keyed to the chosen model's context length
* Summary of everything included / skipped
"""
from __future__ import annotations

import concurrent.futures
import fnmatch
import os
# ---------- MODEL / CONTEXT SETTINGS ----------
MODEL = "o4-mini"  # default
MODEL_CTX_LIMITS = {
    "o4-mini": 200_000,
    "o3": 200_000,
}
SAFETY_MARGIN_TOKENS = 500  # leave space for your prompt
CTX_TOKENS = MODEL_CTX_LIMITS.get(MODEL, 200_000) - SAFETY_MARGIN_TOKENS
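# With the defaults above: 200_000 - 500 = 199_500 tokens per output chunk;
# any file longer than that is split across multiple chunks in main().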
# ------------------------------------------------

GITIGNORE_FILE = ".gitignore"
DEFAULT_IGNORED_DIRS = {'.git', '__pycache__', '.venv', 'venv'}
MAX_FILE_SIZE_BYTES = 1_000_000  # skip files larger than 1 MB
OUTPUT_FILE = "gather_code_output.txt"
EXCLUDED_EXTENSIONS = [".png", ".jpg", ".jpeg", ".gif", ".mp4", ".mov",
                       ".pdf", ".exe", ".dll", ".zip", ".tar", ".gz"]
EXCLUDED_FILENAMES = {"package-lock.json",
                      "LICENSE", "LICENSE.txt", "license", "license.txt"}
# ---------------- optional libs ----------------
try:
    import tiktoken
    try:
        enc = tiktoken.encoding_for_model(MODEL)
    except KeyError:
        # tiktoken may not know newer model names; o200k_base is a reasonable
        # stand-in so an installed tiktoken is still used for exact counts
        enc = tiktoken.get_encoding("o200k_base")

    def n_tokens(txt: str) -> int:
        return len(enc.encode(txt))

    def split_by_tokens(txt: str, limit: int):
        ids = enc.encode(txt)
        for i in range(0, len(ids), limit):
            yield enc.decode(ids[i:i + limit])
except Exception:  # fallback if tiktoken is unavailable
    enc = None

    def n_tokens(txt: str) -> int:
        return len(txt.split())

    def split_by_tokens(txt: str, limit: int):
        chars = limit * 4  # ~4 chars ≈ 1 token heuristic
        for i in range(0, len(txt), chars):
            yield txt[i:i + chars]
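# Quick sanity check of the fallback heuristic (assumes tiktoken is absent):
#   n_tokens("one two three")          -> 3 (whitespace word count)
#   list(split_by_tokens("a" * 10, 2)) -> ["aaaaaaaa", "aa"] (2 tokens ~ 8 chars)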
try:
    from charset_normalizer import from_bytes as cn_from_bytes

    def detect_enc(data: bytes) -> str:
        best = cn_from_bytes(data).best()  # best() may return None
        return (best.encoding if best else None) or 'utf-8'
except Exception:
    try:
        import chardet

        def detect_enc(data: bytes) -> str:
            return chardet.detect(data)['encoding'] or 'utf-8'
    except Exception:
        detect_enc = lambda d: "utf-8"  # last resort: assume UTF-8
# ------------------------------------------------
def load_gitignore() -> list[str]:
    pats = []
    if os.path.exists(GITIGNORE_FILE):
        with open(GITIGNORE_FILE, encoding="utf-8") as f:
            pats = [l.strip() for l in f if l.strip() and not l.startswith('#')]
    return pats


def gitignore_match(path: str, pats: list[str]) -> bool:
    rel = os.path.relpath(path, '.')
    for p in pats:
        if p.startswith('/'):
            p = p[1:]
        if p.endswith('/'):  # directory pattern
            d = p.rstrip('/')
            if rel == d or rel.startswith(d + os.sep):
                return True
        if fnmatch.fnmatch(rel, p):
            return True
    return False
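# Rough illustration (POSIX-style paths), assuming pats = ["build/", "*.log"]:
#   gitignore_match("./build/out.txt", pats) -> True   (directory rule)
#   gitignore_match("./app.log", pats)       -> True   (glob rule)
#   gitignore_match("./src/main.py", pats)   -> False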
def scan_repo(root='.') -> tuple[list[str], set[str], set[str], set[str]]:
    pats = load_gitignore()
    files, skip_dirs, skip_root, skip_large = [], set(), set(), set()
    for dirpath, dirnames, filenames in os.walk(root, topdown=True):
        # prune ignored directories in place so os.walk never descends into them
        for d in dirnames[:]:
            full = os.path.join(dirpath, d)
            if d in DEFAULT_IGNORED_DIRS or gitignore_match(full, pats):
                skip_dirs.add(os.path.relpath(full, root))
                dirnames.remove(d)
        for fn in filenames:
            fp = os.path.join(dirpath, fn)
            # name / extension rules (record relative paths so the summary
            # is unambiguous for files inside subdirectories)
            if fn in EXCLUDED_FILENAMES:
                (skip_root if dirpath == root else skip_dirs).add(
                    os.path.relpath(fp, root))
                continue
            if gitignore_match(fp, pats):
                continue
            if os.path.splitext(fn)[1].lower() in EXCLUDED_EXTENSIONS:
                (skip_root if dirpath == root else skip_dirs).add(
                    os.path.relpath(fp, root))
                continue
            # size rule
            try:
                if os.path.getsize(fp) > MAX_FILE_SIZE_BYTES:
                    skip_large.add(os.path.relpath(fp, root))
                    continue
            except OSError:
                continue
            files.append(fp)
    return files, skip_dirs, skip_root, skip_large
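# Illustrative return shape (hypothetical paths, not from a real run):
#   files      -> ['./gather_code.py', './src/app.py', ...]
#   skip_dirs  -> {'.git', 'node_modules', ...} plus excluded files in subdirs
#   skip_root  -> {'LICENSE', 'logo.png'}
#   skip_large -> {'data/big.csv'}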
def read_file(path: str):
    """Return (path, text, byte_count, token_count), or None on read failure."""
    try:
        with open(path, 'rb') as fh:
            raw = fh.read()
        text = raw.decode(detect_enc(raw), errors='replace')
        return path, text, len(raw), n_tokens(text)
    except Exception:
        return None
def main():
    files, sk_dirs, sk_root, sk_large = scan_repo('.')
    stats = {"files": 0, "bytes": 0, "tokens": 0}
    with concurrent.futures.ThreadPoolExecutor() as ex:
        results = ex.map(read_file, files)
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as out:
            for res in results:
                if res is None:
                    continue
                path, text, b, t = res
                stats["files"] += 1
                stats["bytes"] += b
                stats["tokens"] += t
                for chunk in split_by_tokens(text, CTX_TOKENS):
                    out.write(f"\n--- FILE: {path} ---\n```\n{chunk}\n```\n")
            # summary
            out.write("\n===== SUMMARY =====\n")
            out.write(f"Model : {MODEL} (limit {MODEL_CTX_LIMITS.get(MODEL, '?')} tokens)\n")
            out.write(f"Chunk token limit : {CTX_TOKENS}\n")
            out.write(f"Included files : {stats['files']}\n")
            out.write(f"Total size : {stats['bytes']:,} bytes\n")
            out.write(f"Approx. tokens : {stats['tokens']:,}\n")
            out.write(f"Size cut-off : {MAX_FILE_SIZE_BYTES:,} bytes\n")
            if sk_dirs:
                out.write("\nExcluded folders:\n")
                for d in sorted(sk_dirs):
                    out.write(f" - {d}\n")
            if sk_root:
                out.write("\nExcluded root-level files:\n")
                for f in sorted(sk_root):
                    out.write(f" - {f}\n")
            if sk_large:
                out.write("\nExcluded large files:\n")
                for f in sorted(sk_large):
                    out.write(f" - {f}\n")
    # mirror to console
    print("===== SUMMARY =====")
    print(f"Model / chunk size : {MODEL} / {CTX_TOKENS} tokens")
    print(f"Included files : {stats['files']}")
    print(f"Total size : {stats['bytes']:,} bytes")
    print(f"Approx. tokens : {stats['tokens']:,}")
    print(f"Size cut-off : {MAX_FILE_SIZE_BYTES:,} bytes")
    if sk_dirs:
        print("\nExcluded folders:")
        for d in sorted(sk_dirs):
            print(f" - {d}")
    if sk_root:
        print("\nExcluded root-level files:")
        for f in sorted(sk_root):
            print(f" - {f}")
    if sk_large:
        print("\nExcluded large files:")
        for f in sorted(sk_large):
            print(f" - {f}")
    print(f"\nWrote output to '{OUTPUT_FILE}'")


if __name__ == "__main__":
    main()