Skip to content

Instantly share code, notes, and snippets.

@cicloid
Created June 3, 2025 16:03
Show Gist options
  • Save cicloid/9fb3a143d82cd98884586c3d03f607a1 to your computer and use it in GitHub Desktop.
Save cicloid/9fb3a143d82cd98884586c3d03f607a1 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Remove invisible characters that could be used as digital fingerprints from markdown text.
"""
import sys
import argparse
import re
def clean_invisible_chars(text):
    """Remove invisible Unicode characters that could be used for fingerprinting.

    Besides deleting the invisible code points listed below, the text is
    normalized so that whitespace cannot carry a hidden signal either:
    runs of two or more spaces become a single space, CRLF / CR line
    endings become LF, and trailing whitespace is stripped from every line.

    Args:
        text: The text to clean.

    Returns:
        The cleaned text.
    """
    # Invisible characters to remove
    invisible_chars = [
        '\u200b',  # Zero-width space
        '\u200c',  # Zero-width non-joiner
        '\u200d',  # Zero-width joiner
        '\u200e',  # Left-to-right mark
        '\u200f',  # Right-to-left mark
        '\u202a',  # Left-to-right embedding
        '\u202b',  # Right-to-left embedding
        '\u202c',  # Pop directional formatting
        '\u202d',  # Left-to-right override
        '\u202e',  # Right-to-left override
        '\u2060',  # Word joiner
        '\u2061',  # Function application
        '\u2062',  # Invisible times
        '\u2063',  # Invisible separator
        '\u2064',  # Invisible plus
        '\u2066',  # Left-to-right isolate
        '\u2067',  # Right-to-left isolate
        '\u2068',  # First strong isolate
        '\u2069',  # Pop directional isolate
        '\u206a',  # Inhibit symmetric swapping
        '\u206b',  # Activate symmetric swapping
        '\u206c',  # Inhibit Arabic form shaping
        '\u206d',  # Activate Arabic form shaping
        '\u206e',  # National digit shapes
        '\u206f',  # Nominal digit shapes
        '\ufeff',  # Zero-width no-break space (BOM)
        '\u180e',  # Mongolian vowel separator
        '\u061c',  # Arabic letter mark
        '\u00ad',  # Soft hyphen
    ]

    # Delete every listed character in a single pass over the text
    # (the third argument of str.maketrans maps characters to None).
    deletion_table = str.maketrans('', '', ''.join(invisible_chars))
    cleaned_text = text.translate(deletion_table)

    # Collapse runs of spaces into one space. This runs after the deletion
    # above so that spaces separated only by invisible characters are merged
    # too. NOTE: the pattern matches anywhere on a line, so leading
    # indentation made of several spaces is collapsed as well.
    cleaned_text = re.sub(r' {2,}', ' ', cleaned_text)

    # Normalize line endings to \n (CRLF first, then any remaining lone CR)
    cleaned_text = cleaned_text.replace('\r\n', '\n').replace('\r', '\n')

    # Remove trailing whitespace from every line
    return '\n'.join(line.rstrip() for line in cleaned_text.split('\n'))
def main():
    """Command-line entry point: read text, clean it, and write the result.

    Input is read from the file named by the positional argument, or from
    stdin when that argument is omitted or is ``-``. The cleaned text is
    written back to the input file with ``--in-place``, otherwise to the
    ``--output`` file if one is given, otherwise to stdout. Diagnostics are
    printed to stderr, and any failure exits with status 1.
    """
    parser = argparse.ArgumentParser(
        description='Remove invisible characters from markdown text that could be used for digital fingerprinting.'
    )
    parser.add_argument('input', nargs='?', help='Input file (use - for stdin)')
    parser.add_argument('-o', '--output', help='Output file (default: stdout)')
    parser.add_argument('-i', '--in-place', action='store_true',
                        help='Modify the input file in place')
    args = parser.parse_args()

    # None means "read from stdin".
    input_file = None if args.input is None or args.input == '-' else args.input

    # Reject the impossible combination before reading anything, so the user
    # is not asked to supply stdin input that would only be thrown away.
    if args.in_place and input_file is None:
        print("Error: Cannot use --in-place with stdin", file=sys.stderr)
        sys.exit(1)

    # Read input
    if input_file is None:
        text = sys.stdin.read()
    else:
        try:
            with open(input_file, 'r', encoding='utf-8') as f:
                text = f.read()
        except FileNotFoundError:
            print(f"Error: File '{args.input}' not found", file=sys.stderr)
            sys.exit(1)
        except (OSError, ValueError) as e:
            # OSError covers I/O failures; ValueError covers decoding errors
            # (UnicodeDecodeError is a ValueError subclass).
            print(f"Error reading file: {e}", file=sys.stderr)
            sys.exit(1)

    # Clean the text
    cleaned_text = clean_invisible_chars(text)

    # Report how much shorter the text became. This is a length difference,
    # so it counts everything the cleaner dropped, not only invisible
    # characters.
    removed_count = len(text) - len(cleaned_text)
    if removed_count > 0:
        print(
            f"Removed {removed_count} characters "
            "(invisible characters and redundant whitespace)",
            file=sys.stderr,
        )

    # Pick the destination; --in-place takes precedence over --output.
    if args.in_place:
        output_file = input_file
    elif args.output:
        output_file = args.output
    else:
        print(cleaned_text, end='')
        return

    # Write output
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(cleaned_text)
    except (OSError, ValueError) as e:
        print(f"Error writing file: {e}", file=sys.stderr)
        sys.exit(1)

    if args.in_place:
        print(f"File '{input_file}' cleaned in place", file=sys.stderr)
# Run the command-line interface only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment