-
-
Save odurgun/37aa4f2612736cf95f421f6b62f6c72b to your computer and use it in GitHub Desktop.
Ebook Chunk Splitter - python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Script: Ebook Chunk Splitter | |
Description: This script takes an input text file of an ebook and splits it into smaller chunks based on a specified character limit. It ensures that paragraphs are not split and creates an output directory with the split chunks of the ebook. | |
Input: Replace `<input_file>` with the path to the input text file of the ebook. | |
Output: Replace `<output_folder>` with the path to the output directory where the split ebook chunks will be stored. | |
""" | |
import datetime
import os

# Define input and output paths
input_file = "/path/to/input/ebook.txt"
output_folder = "/path/to/output/"

# Set maximum character limit for each output file
max_characters = 90000


def _group_paragraphs(lines):
    """Group lines into paragraphs, each returned as a list of its lines.

    A paragraph is a run of non-blank lines together with the blank lines
    that follow it, so separators travel with the preceding paragraph.
    Blank lines at the very start of the input form a group of their own.
    """
    paragraphs = []
    current = []
    after_blank = False  # True while the last line seen was blank
    for line in lines:
        is_blank = not line.strip()
        # Text following a blank line starts a new paragraph.
        if current and after_blank and not is_blank:
            paragraphs.append(current)
            current = []
        current.append(line)
        after_blank = is_blank
    if current:
        paragraphs.append(current)
    return paragraphs


def split_into_chunks(lines, limit):
    """Pack lines into chunks of at most *limit* characters.

    Paragraphs (blank-line separated) are kept whole whenever they fit
    within *limit*. A paragraph that is itself longer than *limit* is
    packed line by line instead. A single line longer than *limit* is
    never cut; it becomes an oversized chunk of its own.

    Args:
        lines: Lines of text, each including its trailing newline.
        limit: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of non-empty chunk strings whose concatenation equals the
        concatenation of *lines*. Empty input yields an empty list.

    Raises:
        ValueError: If *limit* is not positive.
    """
    if limit <= 0:
        raise ValueError("limit must be a positive number of characters")

    # Build the indivisible units: whole paragraphs where they fit,
    # individual lines where a paragraph alone would exceed the limit.
    units = []
    for paragraph in _group_paragraphs(lines):
        text = "".join(paragraph)
        if len(text) <= limit:
            units.append(text)
        else:
            units.extend(paragraph)

    chunks = []
    parts = []  # units of the chunk being built; joined once at flush
    size = 0
    for unit in units:
        # Only flush a non-empty chunk, so an oversized first unit does
        # not produce an empty leading chunk.
        if parts and size + len(unit) > limit:
            chunks.append("".join(parts))
            parts = []
            size = 0
        parts.append(unit)
        size += len(unit)
    if parts:
        chunks.append("".join(parts))
    return chunks


def split_ebook(input_path, output_root, limit):
    """Split the text file at *input_path* into numbered chunk files.

    The chunks are written to a new directory named
    ``<base filename>_<YYYYmmddHHMMSS>`` inside *output_root*.

    Args:
        input_path: Path of the ebook text file to split.
        output_root: Directory under which the run directory is created.
        limit: Maximum number of characters per chunk file.

    Returns:
        The list of chunk file paths written, in order.

    Raises:
        FileExistsError: If the timestamped run directory already exists.
        OSError: If the input cannot be read or the output cannot be written.
    """
    # Get the base filename without extension
    base_filename = os.path.splitext(os.path.basename(input_path))[0]
    # Generate unique datetime stamp
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_directory = os.path.join(output_root, f"{base_filename}_{timestamp}")

    # Read first so that a missing input does not leave an empty directory
    # behind. An explicit encoding avoids depending on the platform locale.
    with open(input_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    # makedirs also creates output_root if needed; it deliberately fails if
    # the run directory exists so two runs never mix their chunk files.
    os.makedirs(output_directory)

    written = []
    for i, chunk in enumerate(split_into_chunks(lines, limit), start=1):
        output_file = os.path.join(output_directory, f"{base_filename}_{i}.txt")
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(chunk)
        written.append(output_file)
    return written


def main():
    """Split the configured ebook using the module-level settings."""
    split_ebook(input_file, output_folder, max_characters)


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment