Created
June 3, 2025 16:03
-
-
Save cicloid/9fb3a143d82cd98884586c3d03f607a1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
"""
Remove invisible characters that could be used as digital fingerprints from markdown text.
"""
import sys | |
import argparse | |
import re | |
def clean_invisible_chars(text):
    """Remove invisible Unicode characters that could be used for fingerprinting.

    The cleaning steps run in this order:

    1. Delete every zero-width / directional / invisible-operator code point
       listed below.
    2. Collapse each run of two or more ASCII spaces into a single space.
    3. Normalize ``\\r\\n`` and bare ``\\r`` line endings to ``\\n``.
    4. Strip trailing whitespace from every line.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    invisible_chars = (
        '\u200b',  # Zero-width space
        '\u200c',  # Zero-width non-joiner
        '\u200d',  # Zero-width joiner
        '\u200e',  # Left-to-right mark
        '\u200f',  # Right-to-left mark
        '\u202a',  # Left-to-right embedding
        '\u202b',  # Right-to-left embedding
        '\u202c',  # Pop directional formatting
        '\u202d',  # Left-to-right override
        '\u202e',  # Right-to-left override
        '\u2060',  # Word joiner
        '\u2061',  # Function application
        '\u2062',  # Invisible times
        '\u2063',  # Invisible separator
        '\u2064',  # Invisible plus
        '\u2066',  # Left-to-right isolate
        '\u2067',  # Right-to-left isolate
        '\u2068',  # First strong isolate
        '\u2069',  # Pop directional isolate
        '\u206a',  # Inhibit symmetric swapping
        '\u206b',  # Activate symmetric swapping
        '\u206c',  # Inhibit Arabic form shaping
        '\u206d',  # Activate Arabic form shaping
        '\u206e',  # National digit shapes
        '\u206f',  # Nominal digit shapes
        '\ufeff',  # Zero-width no-break space (BOM)
        '\u180e',  # Mongolian vowel separator
        '\u061c',  # Arabic letter mark
        '\u00ad',  # Soft hyphen
    )

    # A translation table that maps a code point to None deletes it, so all
    # listed characters are removed in a single pass over the text.
    deletion_table = dict.fromkeys(map(ord, invisible_chars))
    cleaned_text = text.translate(deletion_table)

    # Runs of spaces can encode information too; keep a single space.
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)

    # Normalize line endings to \n (CRLF first so it does not become two \n).
    cleaned_text = cleaned_text.replace('\r\n', '\n').replace('\r', '\n')

    # Remove trailing whitespace from each line.
    return '\n'.join(line.rstrip() for line in cleaned_text.split('\n'))
def main():
    """Command-line entry point: read text, clean it, and write the result.

    Input comes from the positional ``input`` file, or from stdin when the
    argument is omitted or is ``-``. The cleaned text goes to the input file
    itself (``--in-place``), to ``--output``, or to stdout. Status and error
    messages are written to stderr; any failure exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description='Remove invisible characters from markdown text that could be used for digital fingerprinting.'
    )
    parser.add_argument('input', nargs='?', help='Input file (use - for stdin)')
    parser.add_argument('-o', '--output', help='Output file (default: stdout)')
    parser.add_argument('-i', '--in-place', action='store_true',
                        help='Modify the input file in place')
    args = parser.parse_args()

    reading_stdin = args.input is None or args.input == '-'

    # Reject the impossible combination up front, before blocking on stdin
    # and before reporting any work as done.
    if args.in_place and reading_stdin:
        print("Error: Cannot use --in-place with stdin", file=sys.stderr)
        sys.exit(1)

    # Read input
    if reading_stdin:
        text = sys.stdin.read()
    else:
        try:
            with open(args.input, 'r', encoding='utf-8') as f:
                text = f.read()
        except FileNotFoundError:
            print(f"Error: File '{args.input}' not found", file=sys.stderr)
            sys.exit(1)
        except Exception as e:
            print(f"Error reading file: {e}", file=sys.stderr)
            sys.exit(1)

    # Clean the text
    cleaned_text = clean_invisible_chars(text)

    # Report how much shorter the text became. The difference is a plain
    # length delta, so it reflects every change the cleaner made.
    removed_count = len(text) - len(cleaned_text)
    if removed_count > 0:
        print(f"Removed {removed_count} invisible characters", file=sys.stderr)

    # Write output: --in-place takes precedence over --output; with neither,
    # the result goes to stdout.
    target = args.input if args.in_place else args.output
    if target:
        try:
            with open(target, 'w', encoding='utf-8') as f:
                f.write(cleaned_text)
        except Exception as e:
            print(f"Error writing file: {e}", file=sys.stderr)
            sys.exit(1)
        if args.in_place:
            print(f"File '{target}' cleaned in place", file=sys.stderr)
    else:
        # end='' so no extra newline is appended to the cleaned text.
        print(cleaned_text, end='')
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment