This script scans directories, visualizes their structure, reads file contents, and splits them based on GPT model token limits. It saves the processed output in a text file, making it ideal for preparing data for language models.
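For a sense of the result, running the script with option 2 against a small project produces an output.txt shaped like the sketch below. The project path and file contents here are made-up placeholders; the header lines and separators match what save_output and main actually write:

Global Project Path: /home/user/myproject
Total Files: 2

Path: src/app.py
Content:
print("hello")

-=-=-=-=-=-=-=-=-=-=-=

Path: README.md
Content:
Example project.

-=-=-=-=-=-=-=-=-=-=-=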
import os
import tiktoken
GPT_MODELS = {
# "gpt-3.5-turbo": 4096,
# "gpt-3.5-turbo-16k": 16384,
# "gpt-4o": 4096,
"gpt-4o": 4000,
# "gpt-4": 8192,
# "gpt-4-32k": 32768,
# "GPT-4-turbo": 128000
}
def scan_files_recursive(directory):
"""Recursively scan files in the directory."""
file_paths = []
for root, _, files in os.walk(directory):
for file in files:
file_paths.append(os.path.join(root, file))
return file_paths
def print_directory_tree(base_path, included_paths):
"""Print a tree-like structure for files in the directory."""
tree = {}
for path in included_paths:
relative_path = os.path.relpath(path, base_path)
parts = relative_path.split(os.sep)
node = tree
for part in parts:
if part not in node:
node[part] = {}
node = node[part]
def print_tree(node, indent=""):
for key, value in node.items():
if value:
print(f"{indent}{key}/")
print_tree(value, indent + " ")
else:
print(f"{indent}{key}")
print_tree(tree)
def read_file_content(file_path):
"""Read and return the content of a file."""
try:
with open(file_path, 'r', encoding='utf-8') as file:
return file.read()
except Exception as e:
print(f"Error reading {file_path}: {e}")
return None
def split_content(content, max_tokens, model):
"""Split content into parts according to token limits."""
encoding = tiktoken.encoding_for_model(model)
tokens = encoding.encode(content)
parts = []
start = 0
while start < len(tokens):
end = start + max_tokens
if end >= len(tokens):
part = encoding.decode(tokens[start:])
parts.append(part)
break
split_point = end
for i in range(end, start, -1):
if encoding.decode(tokens[i - 1:i]) == "\n":
split_point = i
break
if split_point == start:
split_point = end
part = encoding.decode(tokens[start:split_point])
parts.append(part)
start = split_point
return parts
def save_output(output_path, global_path, output, total_files):
"""Save output to a text file, including the global path and total files at the top."""
try:
with open(output_path, 'w', encoding='utf-8') as file:
file.write(f"Global Project Path: {global_path}\n")
file.write(f"Total Files: {total_files}\n\n")
file.write(output)
print(f"Output saved to {output_path}")
except Exception as e:
print(f"Error saving output: {e}")
def normalize_path(path):
"""Normalize the case of the path for consistent comparison."""
return os.path.normcase(os.path.normpath(path))
def main():
print("\nChoose an option:")
print("1. Scan and print file paths.")
print("2. Scan and print file contents (no splitting).")
print("3. Scan and print file contents (split by GPT token limits).")
option = input("\nEnter 1, 2, or 3: ").strip()
if option not in {"1", "2", "3"}:
print("\nInvalid option.\n")
return
if option == "3":
print("\nAvailable GPT Models:")
for idx, (model_name, token_limit) in enumerate(GPT_MODELS.items(), 1):
print(f"{idx}. {model_name} (Token limit: {token_limit})")
model_choice = input("\nSelect a model by number: ").strip()
try:
model_choice = int(model_choice)
if 1 <= model_choice <= len(GPT_MODELS):
selected_model = list(GPT_MODELS.keys())[model_choice - 1]
max_tokens = GPT_MODELS[selected_model]
print(f"Selected model: {selected_model} (Token limit: {max_tokens})")
else:
print("\nInvalid selection.\n")
return
except ValueError:
print("\nInvalid input.\n")
return
global_path = input("\nEnter the global project path: ").strip()
if not os.path.exists(global_path):
print("\nThe provided path does not exist.\n")
return
included_paths = []
while True:
print("\nIncluded paths:")
print_directory_tree(global_path, included_paths)
user_input = input(
"\nEnter more files/folders to include/remove, type '.' for all, or type 'STOP' to finish: ").strip()
if user_input.lower() == "stop":
break
user_input_path = normalize_path(os.path.join(global_path, user_input))
if os.path.exists(user_input_path):
if os.path.isdir(user_input_path):
file_paths = [normalize_path(
path) for path in scan_files_recursive(user_input_path)]
else:
file_paths = [user_input_path]
output_file_path = normalize_path(
os.path.join(global_path, "output.txt"))
file_paths = [path for path in file_paths if path != output_file_path]
if all(path in included_paths for path in file_paths):
for path in file_paths:
included_paths.remove(path)
print(f"Removed '{user_input}' from the list.")
else:
for path in file_paths:
if path not in included_paths:
included_paths.append(path)
print(f"Added '{user_input}' to the list.")
else:
print(f"Path '{user_input_path}' does not exist.")
if not included_paths:
print("No files selected.")
return
output_content = ""
if option == "1":
for file_path in included_paths:
relative_path = os.path.relpath(file_path, global_path)
output_content += relative_path + "\n"
else:
for file_path in included_paths:
content = read_file_content(file_path)
if content is None:
continue
relative_path = os.path.relpath(file_path, global_path)
if option == "2":
output_content += f"Path: {relative_path}\nContent:\n{
content}\n\n-=-=-=-=-=-=-=-=-=-=-=\n\n"
elif option == "3":
parts = split_content(content, max_tokens, selected_model)
if len(parts) == 1:
output_content += f"Path: {relative_path}\nContent:\n{
parts[0]}\n\n-=-=-=-=-=-=-=-=-=-=-=\n\n"
else:
for idx, part in enumerate(parts):
output_content += f"Path: {relative_path} (Part {idx + 1})\nContent:\n{
part}\n\n-=-=-=-=-=-=-=-=-=-=-=\n\n"
total_files = len(included_paths)
output_path = os.path.join(global_path, "output.txt")
save_output(output_path, global_path, output_content, total_files)
if __name__ == "__main__":
    main()
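A minimal usage sketch, assuming the gist is saved as scan_split.py (that file name is an assumption, not part of the gist) and tiktoken has been installed with pip install tiktoken. The sample text is made up; split_content is called directly just to show the splitting step in isolation:

# Hypothetical standalone use of split_content; importing the module only
# defines the helpers, because main() is guarded by __name__ == "__main__".
from scan_split import split_content

sample = "line of text\n" * 5000
parts = split_content(sample, max_tokens=4000, model="gpt-4o")
print(f"{len(parts)} parts; each part ends near a newline boundary")

For the interactive workflow, run python scan_split.py, pick option 1, 2, or 3, enter the project path, then add files or folders until typing STOP.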