|
import sys |
|
import os |
|
import argparse |
|
|
|
# Text written into a newly created file so the user can see where to paste
# the garbled text (English prompt on the first line, Japanese on the second).
placeholder = (
    'Paste mojibake here\n'
    'ここで文字化けをペースト下さい'
)

# Encoding assumed for the mojibake file when --from is not given.
default_source_enc = 'sjis'
|
|
|
def write_file(filename, encoding, text):
    """Overwrite *filename* with *text* encoded in *encoding*, plus a final newline.

    The file is opened in binary mode, so the bytes on disk are exactly the
    encoded text with no platform newline translation.

    Args:
        filename: Path of the file to create or truncate.
        encoding: Name of the codec used to encode the text (e.g. ``'sjis'``).
        text: The string to write.

    Raises:
        LookupError: If *encoding* is not a known codec.
        UnicodeEncodeError: If *text* cannot be represented in *encoding*.
        OSError: If the file cannot be opened or written.
    """
    # The newline is encoded together with the text: a raw b'\n' is only valid
    # for ASCII-compatible codecs and would corrupt e.g. UTF-16 output.
    # Encoding happens before the file is opened so that a codec failure does
    # not truncate an existing file.
    data = (text + '\n').encode(encoding)
    with open(filename, 'wb') as f:
        f.write(data)
|
|
|
def read_file(filename, encoding):
    """Return the entire contents of *filename* decoded with *encoding*.

    The file is opened in text mode for reading and closed before returning.

    Args:
        filename: Path of the file to read.
        encoding: Name of the codec used to decode the file's bytes.

    Returns:
        The decoded file contents as a single string.
    """
    with open(filename, encoding=encoding) as source:
        contents = source.read()
    return contents
|
|
|
def main():
    """Command-line entry point: print a mojibake file decoded correctly.

    Without ``--create`` the given file is read using the source encoding and
    its decoded contents are printed.

    With ``--create`` the file is first prepared for pasting: placeholder text
    is written to it in the source encoding (unless the file already exists
    and ``--overwrite`` was not given), the file is opened in TextEdit, and
    the script waits for the user to press enter before reading it back.
    """
    # Imported here because only this entry point launches an external program.
    import subprocess

    parser = argparse.ArgumentParser(description='Fix mojibake characters')
    parser.add_argument('filename', metavar='file',
                        help='file to convert')
    parser.add_argument('--create', dest='create_file', action='store_true',
                        help='create the file to paste mojibake text into, and open it in TextEdit')
    parser.add_argument('--overwrite', dest='overwrite_file', action='store_true',
                        help='(only with --create) overwrite file')
    parser.add_argument('--from', dest='from_encoding',
                        metavar='source_encoding',
                        default=default_source_enc,
                        help=f'source encoding (default: {default_source_enc})')
    args = parser.parse_args()

    filename = args.filename
    from_encoding = args.from_encoding

    if args.create_file:
        # 1. Write placeholder text to the file, in the original encoding.
        #    An existing file is only replaced when --overwrite was given;
        #    a missing file is always created.
        if os.path.exists(filename) and not args.overwrite_file:
            print('File already exists, skipping...')
        else:
            print('Writing placeholder text to file')
            write_file(filename, from_encoding, placeholder)

        # 2. Let the user paste the garbled text manually in TextEdit.
        print('Please paste the text manually into TextEdit')
        print('Press enter when ready to proceed')
        # An argument list (no shell) keeps file names containing spaces or
        # shell metacharacters intact. The exit status is deliberately not
        # checked: failing to launch the editor should not abort the script.
        subprocess.run(['open', '-a', 'TextEdit', filename])
        input()

    # 3. Output the correctly decoded file contents. The text is printed
    #    using the encoding of standard output (commonly UTF-8).
    print(read_file(filename, from_encoding))


if __name__ == '__main__':
    main()