Last active
May 17, 2025 17:34
-
-
Save nerun/8318924aa35f3f27231f86468804cc8c to your computer and use it in GitHub Desktop.
The purpose of this script is to analyze plain text files (with or without the ".txt" extension) looking for broken paragraphs, i.e., paragraphs split across more than one line, and join each of them into a single very long line. It's Markdown-friendly.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/zsh | |
################################################################################ | |
# PARAGRAPHER revision 20 | |
# By Daniel "Nerun" Rodrigues | |
# May 17, 2025 | |
# https://gist.github.com/nerun/8318924aa35f3f27231f86468804cc8c | |
# | |
# The purpose of this script is to analyze plain text files (with or without the
# ".txt" extension) looking for broken paragraphs, i.e., paragraphs split across
# more than one line, and join each of them into a single very long line.
# | |
# This program is free software; you can redistribute it and/or modify it under | |
# the terms of the Creative Commons Zero 1.0 Universal (CC0 1.0) Public Domain | |
# Dedication (https://creativecommons.org/publicdomain/zero/1.0/). | |
################################################################################ | |
# ------------------------------------------------------------------------------
# Command-line handling.
#
#   (no arguments) | -h | --help   print the help text and exit 0
#   -v | --version                 print version/license info and exit 0
#   -c N FILE | --columns N FILE   use N as the minimum line length
#   FILE                           use the default of 72 columns
#
# Sets the globals `columns` and `filename` for the rest of the script.
# Usage errors are written to stderr and exit with status 1.
# ------------------------------------------------------------------------------
if [[ -z "$*" || "$1" = "-h" || "$1" = "--help" ]]; then
  cat << EOF
Paragrapher processes plain text files to fix broken paragraphs — lines split
where they shouldn’t be — and joins them into single, long lines.
Usage: paragrapher [OPTION] <FILE>
-c, --columns Set minimum line length to detect paragraphs. Default is 72
columns. Use lower values (50 or 60) for shorter paragraphs.
Example: paragrapher -c 60 filename
-h, --help Show this help message.
-v, --version Show version, author, URL, and license info.
The output is always saved as filename_paragraphed.
Works well with Markdown — recognizes headings and lists. Run it directly on
".md" files: paragrapher filename.md.
EOF
  exit 0
elif [[ "$1" = "-v" || "$1" = "--version" ]]; then
  cat << EOF
PARAGRAPHER revision 20 (May 17th, 2025)
By Daniel "Nerun" Rodrigues
https://gist.github.com/nerun/8318924aa35f3f27231f86468804cc8c
This program is free software; you can redistribute it and/or modify it under
the terms of the Creative Commons Zero 1.0 Universal (CC0 1.0) Public Domain
Dedication (https://creativecommons.org/publicdomain/zero/1.0/).
EOF
  exit 0
elif [[ "$1" = "-c" || "$1" = "--columns" ]]; then
  # $2 must be a non-empty string of decimal digits and $3 the file name.
  # "!= *[!0-9]*" (no non-digit anywhere) is used instead of zsh's "<->"
  # numeric glob so that the test also parses under bash.
  if [[ -n "$2" && "$2" != *[!0-9]* && -n "$3" ]]; then
    columns=$2
    filename="$3"
  else
    printf '%s\n' "Error: incorrect usage. Example: paragrapher -c 72 file.txt" >&2
    exit 1
  fi
else
  # No option given: the first argument is the file, with the default width.
  columns=72
  filename="$1"
fi
# ------------------------------------------------------------------------------
# Input validation.
#
# Aborts with status 1 (message on stderr) when no file was given, when the
# path is not a regular file, or when file(1) does not describe it as text.
# The description reported by file(1) is printed on stdout.
# ------------------------------------------------------------------------------
if [[ -z "$filename" ]]; then
  printf '%s\n' "Error: no input file specified." >&2
  exit 1
fi

# Check if file exists
if [[ ! -f "$filename" ]]; then
  printf '%s\n' "Error: there's no file \"$filename\"." >&2
  exit 1
fi

# Check if file is a plain text file. "--" keeps a file name that starts with
# "-" from being parsed as an option.
file_test=$(file -b -- "$filename")
printf '%s\n' "$file_test"
if [[ "$file_test" != *"text"* ]]; then
  printf '%s\n' "Error: \"$filename\" is not a plain text file." >&2
  printf '%s\n' "(detected: \"$file_test\")." >&2
  exit 1
fi

# When file(1) does not report UTF-8, or reports CRLF line terminators, run
# dos2unix on the input file itself if the tool is installed; otherwise only
# suggest installing it and carry on.
if [[ "$file_test" != *"UTF-8"* || "$file_test" == *"with CRLF line terminators"* ]]; then
  if command -v dos2unix >/dev/null 2>&1; then
    dos2unix -- "$filename"
  else
    printf '%s\n' "Consider installing \"dos2unix\"."
  fi
fi
################################################################################
# Derive the output path and the extension from an input path.
# Globals:   ext (written), output (written)
# Arguments: $1 - path of the input file
#
# Only the last path component is inspected for an extension, so dots in
# directory names ("./notes", "my.dir/notes") are not mistaken for one. A name
# whose only dot is the leading one (".bashrc") is treated as having no
# extension. Files without an extension get "txt".
################################################################################
_set_output_name() {
  local base="${1##*/}"   # last path component
  if [[ "$base" == ?*.* ]]; then
    ext="${1##*.}"
    ext="${ext:l}"        # zsh modifier: lower-case the extension
    output="${1%.*}_paragraphed.$ext"
  else # filename has no extension
    ext="txt"
    output="${1}_paragraphed.$ext"
  fi
}
_set_output_name "$filename"
# Append a newline to the INPUT file when its last line is not empty, so that
# the reading loop below does not miss the last line. Note that this modifies
# the input file in place.
last_line=$(tail -n 1 -- "$filename")
if [[ -n "$last_line" ]]; then
  printf '\n' >> "$filename"
fi

# Create or truncate the output file. The expansion is quoted so that paths
# containing spaces also work when the script is run with bash.
: > "$output"

# Accumulator for the paragraph currently being joined.
paragraph=''
################################################################################
# Write the accumulated paragraph, followed by one empty line, to the output
# file and reset the accumulator. Does nothing when the accumulator is empty.
# Globals:   paragraph (read, then cleared), output (read)
# Arguments: none
################################################################################
_dump_paragraph() {
  if [[ -n "$paragraph" ]]; then
    # printf '%s' writes the text verbatim; zsh's echo would interpret
    # backslash sequences such as \n or \t found in the paragraph.
    printf '%s\n\n' "$paragraph" >> "$output"
    paragraph=''
  fi
}
################################################################################
# Read lines from stdin and write the re-joined text to the output file.
#
# A line at least `columns` characters long starts a paragraph. Following
# lines are appended to it (separated by one space) until the paragraph ends
# with "." or ":"; the next line read after that flushes the paragraph.
# For Markdown input (ext = md) an ATX heading or a list item ("•" or "-"
# followed by a space or tab) also flushes the paragraph being built.
# Shorter lines met while no paragraph is open are copied unchanged.
#
# Globals:   columns (read), ext (read), output (read),
#            paragraph (read/written, shared with _dump_paragraph)
# Arguments: none (input comes from stdin)
################################################################################
_join_paragraphs() {
  local line size last_char

  # https://mywiki.wooledge.org/BashFAQ/001#Trimming
  # IFS is deliberately left at its default so that read trims leading and
  # trailing whitespace. "|| [[ -n $line ]]" also processes a final line that
  # is not terminated by a newline.
  while read -r line || [[ -n "$line" ]]; do
    size=${#line}

    # No paragraph open: start one with a long line, copy anything shorter
    # (empty lines included) straight to the output.
    if [[ -z "$paragraph" ]]; then
      if [[ "$size" -ge "$columns" ]]; then
        paragraph="$line"
      else
        printf '%s\n' "$line" >> "$output"
      fi
      continue
    fi

    last_char="${paragraph: -1}"

    # The open paragraph is complete: flush it, then treat the current line
    # as above, except that an empty line is dropped because the flush has
    # already written a separator.
    if [[ "$last_char" == "." || "$last_char" == ":" ]]; then
      _dump_paragraph
      if [[ "$size" -ge "$columns" ]]; then
        paragraph="$line"
      elif [[ "$size" -ne 0 ]]; then
        printf '%s\n' "$line" >> "$output"
      fi
      continue
    fi

    # The open paragraph is still incomplete.
    if [[ "$ext" != "md" ]]; then
      paragraph+=" $line"
      continue
    fi

    case "${line:0:7}" in
      '# '* | '## '* | '### '* | '#### '* | '##### '* | '###### '* | \
      '•'[$'\t ']* | '-'[$'\t ']*)
        _dump_paragraph
        if [[ "${line:0:1}" = "#" ]]; then
          # Heading: written on its own, followed by an empty line.
          printf '%s\n\n' "$line" >> "$output"
        else
          # List item: becomes the start of a new paragraph.
          paragraph="$line"
        fi
        ;;
      *)
        paragraph+=" $line"
        ;;
    esac
  done

  # Flush whatever is still buffered at end of input; without this a final
  # paragraph that does not end with "." or ":" would never be written.
  _dump_paragraph
}
_join_paragraphs < "$filename"
################################################################################
# Normalise typography in the output file (edited in place with sed -i) and
# append a final empty line.
# Globals:   output (read), ext (read)
# Arguments: none
#
# All substitutions are applied in one sed pass, in the listed order. The
# patterns use extended regular expressions (-r).
################################################################################
_clean_output() {
  local -a sed_script
  sed_script=(
    -e "s/’|‘/'/g"                  # curly single quotes -> '
    -e 's/“|”/"/g'                  # curly double quotes -> "
    -e $'s/\f//g'                   # Form feed (U+000C)
    -e 's/ -- / – /g'               # En Dash (U+2013)
    -e '/ --[[:digit:]]/ s/--/–/g'  # En Dash (U+2013) representing a minus sign
  )
  if [[ "$ext" = "md" ]]; then
    # Bullet (U+2022) or list. Written as an alternation rather than a bracket
    # expression so the multi-byte bullet also matches in a byte-oriented
    # locale.
    sed_script+=(-e 's/^(•|-) [ \t]*/ - /g')
  fi
  sed -r -i "${sed_script[@]}" -- "$output"

  printf '\n' >> "$output" # i like to end files with empty line
}
_clean_output
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It is fully compatible with BASH 5+. Just change the shebang to
#!/usr/bin/env bash