@AustinDeric
Created April 22, 2025 17:51
#!/usr/bin/env python3
"""
gather_code.py – Export a repo’s text source to one file for ChatGPT.
Highlights
----------
* .gitignore‑aware walk, skips binaries and huge files
* Parallel reading with automatic encoding detection
* Chunk sizes keyed to the chosen model’s context length
* Summary of everything included / skipped
"""
from __future__ import annotations
import os, fnmatch, concurrent.futures
# ---------- MODEL / CONTEXT SETTINGS ----------
MODEL = "o4-mini" # default
MODEL_CTX_LIMITS = {
    "o4-mini": 200_000,
    "o3": 200_000,
}
SAFETY_MARGIN_TOKENS = 500 # leave space for your prompt
CTX_TOKENS = MODEL_CTX_LIMITS.get(MODEL, 200_000) - SAFETY_MARGIN_TOKENS
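# With the defaults above this works out to 200_000 - 500 = 199_500 tokens per
# chunk; changing MODEL or SAFETY_MARGIN_TOKENS shifts the chunk size accordingly.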
# ------------------------------------------------
GITIGNORE_FILE = ".gitignore"
DEFAULT_IGNORED_DIRS = {'.git','__pycache__','.venv','venv'}
MAX_FILE_SIZE_BYTES = 1_000_000 # skip >1 MB
OUTPUT_FILE = "gather_code_output.txt"
EXCLUDED_EXTENSIONS = [".png", ".jpg", ".jpeg", ".gif", ".mp4", ".mov",
                       ".pdf", ".exe", ".dll", ".zip", ".tar", ".gz"]
EXCLUDED_FILENAMES = {"package-lock.json",
                      "LICENSE", "LICENSE.txt", "license", "license.txt"}
# ---------------- optional libs ----------------
try:
    import tiktoken
    enc = tiktoken.encoding_for_model(MODEL)

    def n_tokens(txt: str) -> int:
        return len(enc.encode(txt))

    def split_by_tokens(txt: str, limit: int):
        ids = enc.encode(txt)
        for i in range(0, len(ids), limit):
            yield enc.decode(ids[i:i + limit])
except Exception:  # fallback when tiktoken is missing or doesn't know the model
    enc = None

    def n_tokens(txt: str) -> int:
        return len(txt.split())

    def split_by_tokens(txt: str, limit: int):
        chars = limit * 4  # ~4 chars ≈ 1 token heuristic
        for i in range(0, len(txt), chars):
            yield txt[i:i + chars]
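
# Sizing example for the fallback splitter: with CTX_TOKENS = 199_500, chunks
# are capped at roughly 199_500 * 4 = 798_000 characters, which only loosely
# approximates the real token count.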
try:
    from charset_normalizer import from_bytes as cn_from_bytes

    def detect_enc(data: bytes) -> str:
        best = cn_from_bytes(data).best()
        return (best.encoding if best else None) or 'utf-8'
except Exception:
    try:
        import chardet

        def detect_enc(data: bytes) -> str:
            return chardet.detect(data)['encoding'] or 'utf-8'
    except Exception:
        def detect_enc(data: bytes) -> str:  # last resort: assume UTF-8
            return "utf-8"
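
# Detection order: charset_normalizer if installed, otherwise chardet, otherwise
# a flat 'utf-8' assumption; read_file() below decodes with errors='replace', so
# a wrong guess produces replacement characters instead of an exception.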
# ------------------------------------------------
def load_gitignore() -> list[str]:
    """Return the non-comment, non-blank patterns from the repo-root .gitignore."""
    pats = []
    if os.path.exists(GITIGNORE_FILE):
        with open(GITIGNORE_FILE, encoding="utf-8") as f:
            pats = [l.strip() for l in f if l.strip() and not l.startswith('#')]
    return pats


def gitignore_match(path: str, pats: list[str]) -> bool:
    """Best-effort match of *path* against the collected .gitignore patterns."""
    rel = os.path.relpath(path, '.')
    for p in pats:
        if p.startswith('/'):
            p = p[1:]
        if p.endswith('/'):
            d = p.rstrip('/')
            if rel == d or rel.startswith(d + os.sep):
                return True
        if fnmatch.fnmatch(rel, p):
            return True
    return False
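
# Matching here is intentionally simple: "build/" excludes build/ and everything
# under it, "*.log" is matched via fnmatch, and a leading "/" is stripped.
# Negation patterns ("!keep.me") and nested .gitignore files are not handled.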
def scan_repo(root='.') -> tuple[list[str], set[str], set[str], set[str]]:
    pats = load_gitignore()
    files, skip_dirs, skip_root, skip_large = [], set(), set(), set()
    for dirpath, dirnames, filenames in os.walk(root, topdown=True):
        # prune ignored directories in place so os.walk never descends into them
        for d in dirnames[:]:
            full = os.path.join(dirpath, d)
            if d in DEFAULT_IGNORED_DIRS or gitignore_match(full, pats):
                skip_dirs.add(os.path.relpath(full, root))
                dirnames.remove(d)
        for fn in filenames:
            fp = os.path.join(dirpath, fn)
            # name / extension rules
            if fn in EXCLUDED_FILENAMES:
                (skip_root if dirpath == root else skip_dirs).add(fn)
                continue
            if gitignore_match(fp, pats):
                continue
            if os.path.splitext(fn)[1].lower() in EXCLUDED_EXTENSIONS:
                (skip_root if dirpath == root else skip_dirs).add(fn)
                continue
            # size rule
            try:
                if os.path.getsize(fp) > MAX_FILE_SIZE_BYTES:
                    skip_large.add(os.path.relpath(fp, root))
                    continue
            except OSError:
                continue
            files.append(fp)
    return files, skip_dirs, skip_root, skip_large
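
# scan_repo() returns four collections: the files to export, plus the skipped
# directories, skipped root-level files, and files over the size cut-off, which
# main() uses for the summary.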
def read_file(path: str):
    """Read *path* as bytes, decode with the detected encoding, and count tokens."""
    try:
        with open(path, 'rb') as fh:
            raw = fh.read()
        text = raw.decode(detect_enc(raw), errors='replace')
        return path, text, len(raw), n_tokens(text)
    except Exception:
        return None
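
# Each successful read yields (path, decoded_text, byte_count, token_count);
# failures return None and are silently skipped by main().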
def main():
    files, sk_dirs, sk_root, sk_large = scan_repo('.')
    stats = {"files": 0, "bytes": 0, "tokens": 0}
    with concurrent.futures.ThreadPoolExecutor() as ex:
        results = ex.map(read_file, files)
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as out:
            for res in results:
                if res is None:
                    continue
                path, text, b, t = res
                stats["files"] += 1
                stats["bytes"] += b
                stats["tokens"] += t
                for chunk in split_by_tokens(text, CTX_TOKENS):
                    out.write(f"\n--- FILE: {path} ---\n```\n{chunk}\n```\n")
            # summary
            out.write("\n===== SUMMARY =====\n")
            out.write(f"Model             : {MODEL} (limit {MODEL_CTX_LIMITS.get(MODEL, '?')} tokens)\n")
            out.write(f"Chunk token limit : {CTX_TOKENS}\n")
            out.write(f"Included files    : {stats['files']}\n")
            out.write(f"Total size        : {stats['bytes']:,} bytes\n")
            out.write(f"Approx. tokens    : {stats['tokens']:,}\n")
            out.write(f"Size cut-off      : {MAX_FILE_SIZE_BYTES:,} bytes\n")
            if sk_dirs:
                out.write("\nExcluded folders:\n")
                for d in sorted(sk_dirs):
                    out.write(f"  - {d}\n")
            if sk_root:
                out.write("\nExcluded root-level files:\n")
                for f in sorted(sk_root):
                    out.write(f"  - {f}\n")
            if sk_large:
                out.write("\nExcluded large files:\n")
                for f in sorted(sk_large):
                    out.write(f"  - {f}\n")
    # mirror the summary to the console
    print("===== SUMMARY =====")
    print(f"Model / chunk size : {MODEL} / {CTX_TOKENS} tokens")
    print(f"Included files     : {stats['files']}")
    print(f"Total size         : {stats['bytes']:,} bytes")
    print(f"Approx. tokens     : {stats['tokens']:,}")
    print(f"Size cut-off       : {MAX_FILE_SIZE_BYTES:,} bytes")
    if sk_dirs:
        print("\nExcluded folders:")
        for d in sorted(sk_dirs):
            print(f"  - {d}")
    if sk_root:
        print("\nExcluded root-level files:")
        for f in sorted(sk_root):
            print(f"  - {f}")
    if sk_large:
        print("\nExcluded large files:")
        for f in sorted(sk_large):
            print(f"  - {f}")
    print(f"\nWrote output to '{OUTPUT_FILE}'")

if __name__ == "__main__":
    main()