Last active
August 23, 2024 19:02
-
-
Save Mapagmataas1331/86f4cf9f2ad1f9dfd60e85f00ba4bb88 to your computer and use it in GitHub Desktop.
This script scans directories, visualizes their structure, reads file contents, and splits them based on GPT model token limits. It saves the processed output in a text file, making it ideal for preparing data for language models.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import tiktoken | |
# Map of model name -> token budget used when splitting file contents
# (option 3 in main()). Only one entry is active; the rest are kept as
# commented-out alternatives. NOTE(review): "gpt-4o" is capped at 4000
# rather than the commented 4096 — presumably headroom below the hard
# context limit; TODO confirm the intent.
GPT_MODELS = {
    # "gpt-3.5-turbo": 4096,
    # "gpt-3.5-turbo-16k": 16384,
    # "gpt-4o": 4096,
    "gpt-4o": 4000,
    # "gpt-4": 8192,
    # "gpt-4-32k": 32768,
    # "GPT-4-turbo": 128000
}
def scan_files_recursive(directory):
    """Recursively collect every file path under *directory*.

    Args:
        directory: Root directory to walk.

    Returns:
        list[str]: Paths (joined from the walk root) of all regular files
        found, in ``os.walk`` traversal order.
    """
    # A flat comprehension replaces the manual append loop (same order).
    return [
        os.path.join(root, file_name)
        for root, _, files in os.walk(directory)
        for file_name in files
    ]
def print_directory_tree(base_path, included_paths):
    """Print the included files as an indented tree rooted at *base_path*.

    Each path is split into components relative to *base_path*; directories
    are printed with a trailing slash and children indented two spaces.
    """
    root = {}
    for file_path in included_paths:
        components = os.path.relpath(file_path, base_path).split(os.sep)
        cursor = root
        for component in components:
            # setdefault creates the child dict on first sight, then descends.
            cursor = cursor.setdefault(component, {})

    def _render(subtree, prefix=""):
        for name, children in subtree.items():
            if children:
                print(f"{prefix}{name}/")
                _render(children, prefix + "  ")
            else:
                # A leaf (empty dict) represents a file.
                print(f"{prefix}{name}")

    _render(root)
def read_file_content(file_path):
    """Return the UTF-8 text of *file_path*, or None if it cannot be read.

    Any failure (missing file, permission, decode error) is reported to
    stdout instead of being raised.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as handle:
            text = handle.read()
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None
    return text
def split_content(content, max_tokens, model):
    """Split *content* into chunks of at most *max_tokens* tokens each.

    Encodes the text with tiktoken's encoder for *model*, then walks the
    token stream in windows of *max_tokens*, preferring to cut at the last
    single token that decodes to a newline inside the window so chunks end
    at line boundaries when possible.

    Args:
        content: Text to split.
        max_tokens: Maximum number of tokens per chunk (assumed > 0;
            a value of 0 would loop forever — TODO confirm callers).
        model: Model name accepted by tiktoken.encoding_for_model.

    Returns:
        list[str]: Decoded chunks whose concatenation equals *content*.
    """
    encoding = tiktoken.encoding_for_model(model)
    tokens = encoding.encode(content)
    parts = []
    start = 0
    while start < len(tokens):
        end = start + max_tokens
        if end >= len(tokens):
            # The remainder fits within the limit; emit it and stop.
            part = encoding.decode(tokens[start:])
            parts.append(part)
            break
        # Scan backwards through the window for a token that decodes to
        # exactly "\n"; cut just after it. (Multi-character tokens that
        # merely contain a newline are not matched.)
        split_point = end
        for i in range(end, start, -1):
            if encoding.decode(tokens[i - 1:i]) == "\n":
                split_point = i
                break
        if split_point == start:
            # Defensive fallback to a hard cut at the window edge.
            split_point = end
        part = encoding.decode(tokens[start:split_point])
        parts.append(part)
        start = split_point
    return parts
def save_output(output_path, global_path, output, total_files):
    """Write *output* to *output_path*, prefixed with a two-line header.

    The header records the project root and the number of selected files.
    Failures are reported to stdout rather than raised.
    """
    try:
        with open(output_path, 'w', encoding='utf-8') as out_file:
            # One writelines call replaces the three sequential writes.
            out_file.writelines([
                f"Global Project Path: {global_path}\n",
                f"Total Files: {total_files}\n\n",
                output,
            ])
        print(f"Output saved to {output_path}")
    except Exception as e:
        print(f"Error saving output: {e}")
def normalize_path(path):
    """Return *path* in a canonical form for equality comparison.

    normpath collapses redundant separators and '.' segments; normcase
    lower-cases on Windows (a no-op on POSIX) so lookups match regardless
    of how the user typed the path.
    """
    collapsed = os.path.normpath(path)
    return os.path.normcase(collapsed)
def main():
    """Interactive CLI: select files under a project root, write output.txt.

    Flow:
      1. Ask for an output mode (1 = paths only, 2 = full contents,
         3 = contents split by a GPT model's token limit).
      2. For mode 3, ask which model from GPT_MODELS to use.
      3. Ask for the project root, then loop letting the user add or
         remove files/folders (re-entering an already-included selection
         removes it).
      4. Build the requested output and save it as <root>/output.txt.
    """
    print("\nChoose an option:")
    print("1. Scan and print file paths.")
    print("2. Scan and print file contents (no splitting).")
    print("3. Scan and print file contents (split by GPT token limits).")
    option = input("\nEnter 1, 2, or 3: ").strip()
    if option not in {"1", "2", "3"}:
        print("\nInvalid option.\n")
        return
    if option == "3":
        # Mode 3 needs a model so split_content knows the token budget.
        print("\nAvailable GPT Models:")
        for idx, (model_name, token_limit) in enumerate(GPT_MODELS.items(), 1):
            print(f"{idx}. {model_name} (Token limit: {token_limit})")
        model_choice = input("\nSelect a model by number: ").strip()
        try:
            model_choice = int(model_choice)
            if 1 <= model_choice <= len(GPT_MODELS):
                # Menu numbers are 1-based indices into insertion order.
                selected_model = list(GPT_MODELS.keys())[model_choice - 1]
                max_tokens = GPT_MODELS[selected_model]
                print(f"Selected model: {selected_model} (Token limit: {max_tokens})")
            else:
                print("\nInvalid selection.\n")
                return
        except ValueError:
            print("\nInvalid input.\n")
            return
    global_path = input("\nEnter the global project path: ").strip()
    if not os.path.exists(global_path):
        print("\nThe provided path does not exist.\n")
        return
    included_paths = []
    while True:
        print("\nIncluded paths:")
        print_directory_tree(global_path, included_paths)
        user_input = input(
            "\nEnter more files/folders to include/remove, type '.' for all, or type 'STOP' to finish: ").strip()
        if user_input.lower() == "stop":
            break
        # Entries are interpreted relative to the project root; '.' expands
        # to the whole tree via the isdir branch below.
        user_input_path = normalize_path(os.path.join(global_path, user_input))
        if os.path.exists(user_input_path):
            if os.path.isdir(user_input_path):
                file_paths = [normalize_path(
                    path) for path in scan_files_recursive(user_input_path)]
            else:
                file_paths = [user_input_path]
            # Never include the script's own output file in the selection.
            output_file_path = normalize_path(
                os.path.join(global_path, "output.txt"))
            file_paths = [path for path in file_paths if path != output_file_path]
            # Toggle semantics: if every file of this selection is already
            # included, remove them all; otherwise add the missing ones.
            if all(path in included_paths for path in file_paths):
                for path in file_paths:
                    included_paths.remove(path)
                print(f"Removed '{user_input}' from the list.")
            else:
                for path in file_paths:
                    if path not in included_paths:
                        included_paths.append(path)
                print(f"Added '{user_input}' to the list.")
        else:
            print(f"Path '{user_input_path}' does not exist.")
    if not included_paths:
        print("No files selected.")
        return
    output_content = ""
    if option == "1":
        # Mode 1: one relative path per line, no contents.
        for file_path in included_paths:
            relative_path = os.path.relpath(file_path, global_path)
            output_content += relative_path + "\n"
    else:
        for file_path in included_paths:
            content = read_file_content(file_path)
            if content is None:
                # Unreadable files are skipped (but still counted in
                # total_files below).
                continue
            relative_path = os.path.relpath(file_path, global_path)
            if option == "2":
                output_content += f"Path: {relative_path}\nContent:\n{content}\n\n-=-=-=-=-=-=-=-=-=-=-=\n\n"
            elif option == "3":
                parts = split_content(content, max_tokens, selected_model)
                if len(parts) == 1:
                    output_content += f"Path: {relative_path}\nContent:\n{parts[0]}\n\n-=-=-=-=-=-=-=-=-=-=-=\n\n"
                else:
                    # Multi-part files get a "(Part N)" suffix per chunk.
                    for idx, part in enumerate(parts):
                        output_content += f"Path: {relative_path} (Part {idx + 1})\nContent:\n{part}\n\n-=-=-=-=-=-=-=-=-=-=-=\n\n"
    total_files = len(included_paths)
    output_path = os.path.join(global_path, "output.txt")
    save_output(output_path, global_path, output_content, total_files)
# Run the interactive CLI only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment