Forked from szarroug3/split_audiobook_chapters.py
Last active
November 24, 2022 07:17
-
-
Save rbreaves/9fcd0b7f6c9c9f116ad5cb3a2f94b650 to your computer and use it in GitHub Desktop.
Split audiobook into chapters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
"""
Script to split audiobook chapters into separate files using metadata
"""
from __future__ import print_function

import codecs
import os
import re
import string
import subprocess
import sys
from argparse import ArgumentParser
from xml.etree import ElementTree
# File extensions (lower-case, including the leading dot) accepted by
# get_files; anything else is skipped.
ALLOWED_FILETYPES = ['.mp3']

# Matches, from the start of a string, a run of non-digit characters followed
# by a run of digits (e.g. "Chapter 12"). Not referenced in the visible part
# of this file; kept for compatibility.
CHAPTER = re.compile(r'^([\D ]*\d*)')
def get_file_data(filename, verbose):
    """
    Get chapter info for file

    Runs ``ffprobe -show_format`` on the file and returns the first output
    line that contains both a ``<Name>`` and a ``<Time>`` tag, split into the
    metadata key and the parsed XML value.

    Args:
        :str filename: the filename of the file to split
        :boolean verbose: true if you want to print errors, false otherwise

    Returns:
        :str: the key which contains the chapter data
        :Element: Element of the data for the chapter

        ``(None, None)`` is returned when ffprobe cannot be run, reports an
        error, emits no chapter metadata, or emits metadata that is not
        well-formed XML.
    """
    cmd = ['ffprobe', '-show_format', '-pretty', '-loglevel', 'quiet', filename]

    try:
        process = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        out, err = process.communicate()
    except (OSError, ValueError, subprocess.CalledProcessError) as error:
        print('\tSomething went wrong getting file data...')
        if verbose:
            print(error)
        return None, None

    if err:
        print('\tSomething went wrong getting file data...')
        if verbose:
            print(err)
        return None, None

    # Decode the raw bytes directly. Wrapping them in str() first is a no-op
    # on Python 2 but yields the "b'...'" repr on Python 3.
    text = codecs.decode(out, 'unicode_escape', 'replace')

    for line in text.splitlines():
        if not isinstance(line, str):
            # Python 2 only: turn the unicode line back into a UTF-8 byte
            # string so ElementTree and str.format behave as they used to.
            line = line.encode('utf-8')
        line = line.strip()

        if '<Name>' in line and '<Time>' in line:
            # Split on the first '=' only; the XML value may contain '='.
            key, _, xml_text = line.partition('=')
            try:
                # key[4:] drops a 4-character prefix -- presumably the
                # "TAG:" that ffprobe prints in front of format tags.
                return key[4:], ElementTree.fromstring(xml_text)
            except ElementTree.ParseError as error:
                print('\tSkipping. Could not parse chapter metadata...')
                if verbose:
                    print(error)
                return None, None

    print('\tSkipping. No chapter metadata found...')
    return None, None
def check_time(time):
    """
    Convert time to be in xx:xx:xx.xxx format -- 1:00:00.000 instead of 60:00.000

    Args:
        :str time: time to be checked, as "h:mm:ss.xxx", "mm:ss.xxx" or
                   "ss.xxx"

    Returns:
        str: fixed time. A value that already has an hours component is
             returned unchanged.

    Raises:
        ValueError: if the minutes component is not an integer
    """
    parts = time.split(':')
    if len(parts) > 2:
        return time

    # A bare "ss.xxx" value has no minutes component at all.
    total_minutes = int(parts[-2]) if len(parts) == 2 else 0
    hours, minutes = divmod(total_minutes, 60)

    # Seconds are passed through verbatim so the fractional part is untouched.
    return '{0:02d}:{1:02d}:{2}'.format(hours, minutes, parts[-1])
def process_chapter_data(xml):
    """
    Gets chapter data from xml

    Args:
        :ElementTree xml: xml data containing Markers with Name and Time tags
                          i.e <Marker><Name>{some_name}</Name><Time>{some_time}</Time></Marker>

    Returns:
        :list: list of dicts with name, title, start_time, and end_time
               information. ``name`` is the cleaned (filename-safe) chapter
               name and ``title`` the original text of the Name tag.
               The last object has end_time = None
    """
    data = []
    seen_names = set()

    for marker in xml.findall('.//Marker'):
        title = marker.findtext('Name')
        raw_time = marker.findtext('Time')

        # A marker without a name or a time cannot be turned into a chapter.
        if not title or not raw_time:
            continue

        # for some reason, there are some chapters with repeating names and incorrect time data
        # looks like the first one is usually the right one so we'll skip any subsequent ones
        name = clean_chapter_name(title)
        if not name or name in seen_names:
            continue

        time = check_time(raw_time)

        # add start time as end time for previous chapter
        if data:
            data[-1]['end_time'] = time

        # The newest chapter has no end yet; it stays None for the last one.
        data.append({'name': name, 'start_time': time, 'title': title, 'end_time': None})
        seen_names.add(name)

    return data
def clean_chapter_name(name):
    """
    Make a chapter name safe to use as part of a file name

    Characters outside ``string.printable`` are dropped, then every run of
    non-word characters is collapsed into a single underscore.

    Args:
        :str name: name to clean

    Returns:
        :str: cleaned name
    """
    printable_only = ''.join(char for char in name if char in string.printable)
    return re.sub(r'\W+', '_', printable_only)
def split_into_chapters(filename, key, chapter_data, verbose):
    """
    Split file into files by chapter name

    Each chapter is written next to the source file as
    ``<two-digit index><chapter name><original extension>``. The chapter title
    (ASCII only, double quotes removed) is stored as the ``title`` metadata
    and the metadata entry that held the chapter markers is overwritten with
    ``-1``.

    Args:
        :str filename: the filename of the file to split
        :str key: key where the metadata was found
        :list chapter_data: list of dicts with name, title, start_time, and end_time information
        :boolean verbose: true if you want to print the ffmpeg command and errors, false otherwise

    Returns:
        :boolean: True if successful, False otherwise
    """
    # Keep the output beside the source file; a bare file name would be
    # written to the current working directory instead.
    directory = os.path.dirname(filename)
    extension = os.path.splitext(filename)[1]
    success = True

    for i, chapter in enumerate(chapter_data, start=1):
        new_file = os.path.join(
            directory, '{0:02d}{1}{2}'.format(i, chapter['name'], extension))

        # Work on text throughout so this behaves the same on Python 2 and 3.
        title = re.sub(r'[^\x00-\x7f]', r'', chapter['title']).strip().replace('"', '')

        cmd = ['ffmpeg', '-loglevel', 'error', '-i', filename, '-ss', chapter['start_time']]
        if chapter['end_time']:
            cmd += ['-to', chapter['end_time']]
        cmd += ['-metadata', 'title={0}'.format(title),
                '-metadata', '{0}=-1'.format(key),
                '-c', 'copy', new_file]

        if verbose:
            print(cmd)

        try:
            process = subprocess.Popen(cmd, stderr=subprocess.PIPE)
            err = process.communicate()[1]
        except (OSError, ValueError, subprocess.CalledProcessError) as error:
            print('\tSomething went wrong splitting chapters...')
            if verbose:
                print(error)
            success = False
            continue

        # Either stderr output or a non-zero exit status counts as a failure.
        if err or process.returncode != 0:
            print('\tSomething went wrong splitting chapters...')
            if verbose:
                print(err)
            success = False
            continue

        print('\tWrote new file for {0} to {1}'.format(chapter['name'], new_file))

    return success
def get_files_from_dir(directory, recursive):
    """
    Collect the paths of the files found in a directory

    No filtering by file type happens here; every regular file is returned.

    Args:
        :str directory: directory to look in
        :bool recursive: True to descend into sub-directories (os.walk),
                         False to look at the top level only (os.listdir)

    Returns:
        list: paths of the files found, each joined onto ``directory``
    """
    print('Getting files to process...')

    if not recursive:
        # Top level only: keep the entries that are regular files.
        candidates = (os.path.join(directory, entry) for entry in os.listdir(directory))
        return [candidate for candidate in candidates if os.path.isfile(candidate)]

    found = []
    for root, _, files in os.walk(directory):
        found.extend(os.path.join(root, filename) for filename in files)
    return found
def get_files(input_list, recursive):
    """
    Get list of files to process

    Files are taken as given, directories are expanded with
    get_files_from_dir. Only paths whose extension is in ALLOWED_FILETYPES
    (compared case-insensitively) are kept, each path at most once, in the
    order they were first seen. Paths that are neither a file nor a
    directory are ignored.

    Args:
        :list input_list: list of files and folders to check
        :bool recursive: True if you want to recursively check the directories

    Returns:
        list: files to process
    """
    filepaths = []
    seen = set()  # constant-time duplicate check; filepaths keeps the order

    for path in input_list:
        if os.path.isfile(path):
            candidates = [path]
        elif os.path.isdir(path):
            candidates = get_files_from_dir(path, recursive)
        else:
            continue

        for candidate in candidates:
            # check filetype
            if os.path.splitext(candidate)[1].lower() not in ALLOWED_FILETYPES:
                continue

            # check that file is not already on our list
            if candidate in seen:
                continue

            seen.add(candidate)
            filepaths.append(candidate)

    return filepaths
def get_arguments():
    """
    Parse the command line

    Accepts one or more input paths plus the boolean switches
    -d/--delete-original, -r/--recursive and -v/--verbose.

    Returns:
        argparse.Namespace: parsed arguments with the attributes ``input``
                            (list), ``delete``, ``recursive`` and ``verbose``
    """
    parser = ArgumentParser(description='Separate an audiobook into files of chapters')
    parser.add_argument(dest='input', nargs='+', help='Input file or directory')

    # (short flag, long flag, destination attribute, help text)
    switches = (
        ('-d', '--delete-original', 'delete', 'If files are split, delete original'),
        ('-r', '--recursive', 'recursive', 'Recurse folders'),
        ('-v', '--verbose', 'verbose', 'Print errors'),
    )
    for short_flag, long_flag, destination, help_text in switches:
        parser.add_argument(short_flag, long_flag, dest=destination,
                            help=help_text, action='store_true')

    return parser.parse_args()
def check_ffbinaries(binaries=('ffprobe', 'ffmpeg')):
    """
    Check if ffprobe and ffmpeg are on system and in PATH

    Each binary is started with ``-version``; if the process cannot be
    started at all, the binary is reported as missing.

    Args:
        :iterable binaries: names of the executables to look for

    Returns:
        :boolean: True if every binary could be started, False otherwise
    """
    found = True

    for binary in binaries:
        try:
            process = subprocess.Popen([binary, '-version'],
                                       stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        except (IOError, OSError):
            # Popen raises OSError for a missing executable; on Python 2
            # that is not an IOError, so both are caught.
            print('{0} was not found on system. Please install it and make sure it\'s in your PATH.'.format(binary))
            found = False
        else:
            # Drain the pipes and reap the child so it is not left behind.
            process.communicate()

    return found
if __name__ == '__main__':
    # Nothing can be done without the ff* tools; report failure to the shell.
    if not check_ffbinaries():
        sys.exit(1)

    ARGS = get_arguments()
    FILES = get_files(ARGS.input, ARGS.recursive)
    if not FILES:
        print('No files found in: {0}'.format(ARGS.input))
        sys.exit(1)

    for FILE in FILES:
        print('Processing {0}...'.format(FILE))

        KEY, XML = get_file_data(FILE, ARGS.verbose)
        if XML is None:
            continue

        CHAPTER_DATA = process_chapter_data(XML)

        # no need to split books with one chapter
        if len(CHAPTER_DATA) < 2:
            print('\tSkipping. File only has one chapter...')
            continue

        # Only remove the original when every chapter was written successfully.
        if split_into_chapters(FILE, KEY, CHAPTER_DATA, ARGS.verbose) and ARGS.delete:
            print('\tDeleting {0}...'.format(FILE))
            os.remove(FILE)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Revised to remove the book title from the individual file names; the folder name serves that purpose instead. File names now consist of the chapter name only, the full chapter name is written to the title metadata, and the album name contains the book title.