Skip to content

Instantly share code, notes, and snippets.

@odurgun
Forked from ParkWardRR/EbookChunkSplitter.py
Created February 15, 2025 05:54
Show Gist options
  • Save odurgun/37aa4f2612736cf95f421f6b62f6c72b to your computer and use it in GitHub Desktop.
Save odurgun/37aa4f2612736cf95f421f6b62f6c72b to your computer and use it in GitHub Desktop.
Ebook Chunk Splitter - python
"""
Script: Ebook Chunk Splitter
Description: This script takes an input text file of an ebook and splits it into smaller chunks based on a specified character limit. Lines are never split across chunks, so paragraphs stored one per line stay intact. The chunks are written to a new timestamped subdirectory of the output directory.
Input: Set the `input_file` variable to the path of the input text file of the ebook.
Output: Set the `output_folder` variable to the path of the output directory where the split ebook chunks will be stored.
"""
import os
import datetime
# Define input and output paths
input_file = "/path/to/input/ebook.txt"
output_folder = "/path/to/output/"
# Set maximum character limit for each output file
max_characters = 90000


def chunk_lines(lines, limit):
    """Group consecutive lines into chunks of at most ``limit`` characters.

    Lines are never split: a single line longer than ``limit`` becomes a
    chunk of its own (and is therefore the only way a chunk can exceed the
    limit). Concatenating the returned chunks reproduces the input exactly.

    Args:
        lines: Iterable of strings, typically the lines of a text file
            (line terminators included).
        limit: Maximum number of characters per chunk; must be positive.

    Returns:
        A list of chunk strings. Empty input yields an empty list; no empty
        chunk is ever emitted.

    Raises:
        ValueError: If ``limit`` is not positive.
    """
    if limit <= 0:
        raise ValueError("limit must be a positive number of characters")

    chunks = []
    pending = []  # lines collected for the chunk currently being built
    pending_size = 0
    for line in lines:
        # Flush only when something is pending; otherwise an oversized first
        # line would produce a spurious empty chunk.
        if pending and pending_size + len(line) > limit:
            chunks.append("".join(pending))
            pending = []
            pending_size = 0
        pending.append(line)
        pending_size += len(line)
    if pending:
        chunks.append("".join(pending))
    return chunks


def split_ebook(input_path, output_root, limit, encoding="utf-8"):
    """Split the text file at ``input_path`` into numbered chunk files.

    The chunks are written to a new directory named
    ``<base_filename>_<YYYYmmddHHMMSS>`` inside ``output_root`` as
    ``<base_filename>_1.txt``, ``<base_filename>_2.txt``, and so on.

    Args:
        input_path: Path to the ebook text file.
        output_root: Directory under which the timestamped output directory
            is created (created as well if it does not exist).
        limit: Maximum number of characters per chunk file.
        encoding: Text encoding used for both reading and writing. Defaults
            to UTF-8 rather than the platform locale encoding.

    Returns:
        The path of the directory containing the chunk files.

    Raises:
        ValueError: If ``limit`` is not positive.
        FileExistsError: If the timestamped output directory already exists.
        OSError: If the input cannot be read or the output cannot be written.
    """
    # Get the base filename without extension
    base_filename = os.path.splitext(os.path.basename(input_path))[0]
    # Generate datetime stamp (one-second resolution)
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    output_directory = os.path.join(output_root, f"{base_filename}_{timestamp}")

    # Read the input first so that a missing or unreadable file does not
    # leave an empty output directory behind.
    with open(input_path, "r", encoding=encoding) as f:
        chunks = chunk_lines(f, limit)

    # makedirs also creates output_root when needed; it deliberately fails if
    # the timestamped directory already exists so nothing is overwritten.
    os.makedirs(output_directory)

    # Write each chunk to a separate output file
    for i, chunk in enumerate(chunks, start=1):
        output_file = os.path.join(output_directory, f"{base_filename}_{i}.txt")
        with open(output_file, "w", encoding=encoding) as f:
            f.write(chunk)
    return output_directory


if __name__ == "__main__":
    split_ebook(input_file, output_folder, max_characters)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment