@AustinDeric
Created April 22, 2025 17:51
#!/usr/bin/env python3
"""
gather_code.py – Export a repo’s text source to one file for ChatGPT.
Highlights
----------
* .gitignore‑aware walk, skips binaries and huge files
* Parallel reading with automatic encoding detection
* Chunk sizes keyed to the chosen model’s context length
* Summary of everything included / skipped
"""
from __future__ import annotations
import os, fnmatch, concurrent.futures
# ---------- MODEL / CONTEXT SETTINGS ----------
MODEL = "o4-mini" # default
MODEL_CTX_LIMITS = {
    "o4-mini": 200_000,
    "o3": 200_000,
}
SAFETY_MARGIN_TOKENS = 500 # leave space for your prompt
CTX_TOKENS = MODEL_CTX_LIMITS.get(MODEL, 200_000) - SAFETY_MARGIN_TOKENS
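# With the defaults above this works out to 200_000 - 500 = 199_500 tokens per
# chunk; changing MODEL or SAFETY_MARGIN_TOKENS shifts the chunk size accordingly.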
# ------------------------------------------------
GITIGNORE_FILE = ".gitignore"
DEFAULT_IGNORED_DIRS = {'.git','__pycache__','.venv','venv'}
MAX_FILE_SIZE_BYTES = 1_000_000 # skip >1 MB
OUTPUT_FILE = "gather_code_output.txt"
EXCLUDED_EXTENSIONS = [".png", ".jpg", ".jpeg", ".gif", ".mp4", ".mov",
                       ".pdf", ".exe", ".dll", ".zip", ".tar", ".gz"]
EXCLUDED_FILENAMES = {"package-lock.json",
                      "LICENSE", "LICENSE.txt", "license", "license.txt"}
# ---------------- optional libs ----------------
try:
    import tiktoken
    enc = tiktoken.encoding_for_model(MODEL)

    def n_tokens(txt: str) -> int:
        return len(enc.encode(txt))

    def split_by_tokens(txt: str, limit: int):
        ids = enc.encode(txt)
        for i in range(0, len(ids), limit):
            yield enc.decode(ids[i:i + limit])
except Exception:  # fallback when tiktoken is missing or doesn't know the model
    enc = None

    def n_tokens(txt: str) -> int:
        return len(txt.split())

    def split_by_tokens(txt: str, limit: int):
        chars = limit * 4  # ~4 chars ≈ 1 token heuristic
        for i in range(0, len(txt), chars):
            yield txt[i:i + chars]
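
# Sizing example for the fallback splitter: with CTX_TOKENS = 199_500, chunks
# are capped at roughly 199_500 * 4 = 798_000 characters, which only loosely
# approximates the real token count.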
try:
    from charset_normalizer import from_bytes as cn_from_bytes

    def detect_enc(data: bytes) -> str:
        best = cn_from_bytes(data).best()
        return (best.encoding if best else None) or 'utf-8'
except Exception:
    try:
        import chardet

        def detect_enc(data: bytes) -> str:
            return chardet.detect(data)['encoding'] or 'utf-8'
    except Exception:
        def detect_enc(data: bytes) -> str:  # last resort: assume UTF-8
            return "utf-8"
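
# Detection order: charset_normalizer if installed, otherwise chardet, otherwise
# a flat 'utf-8' assumption; read_file() below decodes with errors='replace', so
# a wrong guess produces replacement characters instead of an exception.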
# ------------------------------------------------
def load_gitignore() -> list[str]:
    """Return the non-comment, non-blank patterns from the repo-root .gitignore."""
    pats = []
    if os.path.exists(GITIGNORE_FILE):
        with open(GITIGNORE_FILE, encoding="utf-8") as f:
            pats = [l.strip() for l in f if l.strip() and not l.startswith('#')]
    return pats


def gitignore_match(path: str, pats: list[str]) -> bool:
    """Best-effort match of *path* against the collected .gitignore patterns."""
    rel = os.path.relpath(path, '.')
    for p in pats:
        if p.startswith('/'):
            p = p[1:]
        if p.endswith('/'):
            d = p.rstrip('/')
            if rel == d or rel.startswith(d + os.sep):
                return True
        if fnmatch.fnmatch(rel, p):
            return True
    return False
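
# Matching here is intentionally simple: "build/" excludes build/ and everything
# under it, "*.log" is matched via fnmatch, and a leading "/" is stripped.
# Negation patterns ("!keep.me") and nested .gitignore files are not handled.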
def scan_repo(root='.') -> tuple[list[str], set[str], set[str], set[str]]:
    pats = load_gitignore()
    files, skip_dirs, skip_root, skip_large = [], set(), set(), set()
    for dirpath, dirnames, filenames in os.walk(root, topdown=True):
        # prune ignored directories in place so os.walk never descends into them
        for d in dirnames[:]:
            full = os.path.join(dirpath, d)
            if d in DEFAULT_IGNORED_DIRS or gitignore_match(full, pats):
                skip_dirs.add(os.path.relpath(full, root))
                dirnames.remove(d)
        for fn in filenames:
            fp = os.path.join(dirpath, fn)
            # name / extension rules
            if fn in EXCLUDED_FILENAMES:
                (skip_root if dirpath == root else skip_dirs).add(fn)
                continue
            if gitignore_match(fp, pats):
                continue
            if os.path.splitext(fn)[1].lower() in EXCLUDED_EXTENSIONS:
                (skip_root if dirpath == root else skip_dirs).add(fn)
                continue
            # size rule
            try:
                if os.path.getsize(fp) > MAX_FILE_SIZE_BYTES:
                    skip_large.add(os.path.relpath(fp, root))
                    continue
            except OSError:
                continue
            files.append(fp)
    return files, skip_dirs, skip_root, skip_large
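
# scan_repo() returns four collections: the files to export, plus the skipped
# directories, skipped root-level files, and files over the size cut-off, which
# main() uses for the summary.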
def read_file(path: str):
    """Read *path* as bytes, decode with the detected encoding, and count tokens."""
    try:
        with open(path, 'rb') as fh:
            raw = fh.read()
        text = raw.decode(detect_enc(raw), errors='replace')
        return path, text, len(raw), n_tokens(text)
    except Exception:
        return None
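
# Each successful read yields (path, decoded_text, byte_count, token_count);
# failures return None and are silently skipped by main().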
def main():
    files, sk_dirs, sk_root, sk_large = scan_repo('.')
    stats = {"files": 0, "bytes": 0, "tokens": 0}
    with concurrent.futures.ThreadPoolExecutor() as ex:
        results = ex.map(read_file, files)
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as out:
            for res in results:
                if res is None:
                    continue
                path, text, b, t = res
                stats["files"] += 1
                stats["bytes"] += b
                stats["tokens"] += t
                for chunk in split_by_tokens(text, CTX_TOKENS):
                    out.write(f"\n--- FILE: {path} ---\n```\n{chunk}\n```\n")
            # summary
            out.write("\n===== SUMMARY =====\n")
            out.write(f"Model             : {MODEL} (limit {MODEL_CTX_LIMITS.get(MODEL, '?')} tokens)\n")
            out.write(f"Chunk token limit : {CTX_TOKENS}\n")
            out.write(f"Included files    : {stats['files']}\n")
            out.write(f"Total size        : {stats['bytes']:,} bytes\n")
            out.write(f"Approx. tokens    : {stats['tokens']:,}\n")
            out.write(f"Size cut-off      : {MAX_FILE_SIZE_BYTES:,} bytes\n")
            if sk_dirs:
                out.write("\nExcluded folders:\n")
                for d in sorted(sk_dirs):
                    out.write(f"  - {d}\n")
            if sk_root:
                out.write("\nExcluded root-level files:\n")
                for f in sorted(sk_root):
                    out.write(f"  - {f}\n")
            if sk_large:
                out.write("\nExcluded large files:\n")
                for f in sorted(sk_large):
                    out.write(f"  - {f}\n")
    # mirror the summary to the console
    print("===== SUMMARY =====")
    print(f"Model / chunk size : {MODEL} / {CTX_TOKENS} tokens")
    print(f"Included files     : {stats['files']}")
    print(f"Total size         : {stats['bytes']:,} bytes")
    print(f"Approx. tokens     : {stats['tokens']:,}")
    print(f"Size cut-off       : {MAX_FILE_SIZE_BYTES:,} bytes")
    if sk_dirs:
        print("\nExcluded folders:")
        for d in sorted(sk_dirs):
            print(f"  - {d}")
    if sk_root:
        print("\nExcluded root-level files:")
        for f in sorted(sk_root):
            print(f"  - {f}")
    if sk_large:
        print("\nExcluded large files:")
        for f in sorted(sk_large):
            print(f"  - {f}")
    print(f"\nWrote output to '{OUTPUT_FILE}'")

if __name__ == "__main__":
    main()