-
-
Save amerberg/a273ca1e579ab573b499 to your computer and use it in GitHub Desktop.
| import ply.lex, argparse, io | |
| #Usage | |
| # python stripcomments.py input.tex > output.tex | |
| # python stripcomments.py input.tex -e encoding > output.tex | |
| #This utility is released under the WTFPL license: http://www.wtfpl.net/about/ | |
def strip_comments(source):
    r"""Return the LaTeX text *source* with its comments removed.

    What is removed:
      * everything from an unescaped ``%`` up to and including the newline
        that ends that line;
      * ``\begin{comment}`` ... ``\end{comment}`` environments, together with
        the remainder of the line following ``\end{comment}``.

    What is copied through unchanged:
      * escaped backslashes (``\\``) and escaped percent signs (``\%``);
      * the contents of ``verbatim`` environments, including any ``%``.

    The lexer is rebuilt on every call. ``ply.lex.lex()`` discovers the
    ``tokens``, ``states`` and ``t_*`` names below by inspecting this
    function's local variables, so those names must not be changed.

    NOTE: the docstring of each ``t_*`` rule function is its regular
    expression (that is how PLY reads the pattern), so rule functions are
    documented with ``#`` comments rather than docstrings.
    """
    tokens = (
        'PERCENT', 'BEGINCOMMENT', 'ENDCOMMENT', 'BACKSLASH',
        'CHAR', 'BEGINVERBATIM', 'ENDVERBATIM', 'NEWLINE', 'ESCPCT',
    )
    states = (
        ('linecomment', 'exclusive'),
        ('commentenv', 'exclusive'),
        ('verbatim', 'exclusive'),
    )

    # Deal with escaped backslashes, so we don't think they're escaping %.
    # This rule is limited to the INITIAL state: it returns its token, so
    # if it were active inside a line comment or comment environment the
    # "\\" would be written to the output instead of being stripped. Inside
    # the exclusive states the per-character rules handle each backslash.
    def t_BACKSLASH(t):
        r"\\\\"
        return t

    # One-line comments: drop the % and switch to the state that discards
    # the rest of the line.
    def t_PERCENT(t):
        r"\%"
        t.lexer.begin("linecomment")

    # Escaped percent signs are ordinary text.
    def t_ESCPCT(t):
        r"\\\%"
        return t

    # Comment environment, as defined by the verbatim package.
    def t_BEGINCOMMENT(t):
        r"\\begin\s*{\s*comment\s*}"
        t.lexer.begin("commentenv")

    # Verbatim environment (different treatment of comments within).
    def t_BEGINVERBATIM(t):
        r"\\begin\s*{\s*verbatim\s*}"
        t.lexer.begin("verbatim")
        return t

    # Any other character in the initial state we leave alone.
    def t_CHAR(t):
        r"."
        return t

    def t_NEWLINE(t):
        r"\n"
        return t

    # End of comment environment.
    def t_commentenv_ENDCOMMENT(t):
        r"\\end\s*{\s*comment\s*}"
        # Anything after \end{comment} on a line is ignored!
        t.lexer.begin('linecomment')

    # Discard the contents of a comment environment.
    def t_commentenv_CHAR(t):
        r"."
        pass

    def t_commentenv_NEWLINE(t):
        r"\n"
        pass

    # End of verbatim environment.
    def t_verbatim_ENDVERBATIM(t):
        r"\\end\s*{\s*verbatim\s*}"
        t.lexer.begin('INITIAL')
        return t

    # Leave contents of verbatim environment alone.
    def t_verbatim_CHAR(t):
        r"."
        return t

    def t_verbatim_NEWLINE(t):
        r"\n"
        return t

    # End a % comment when we get to a new line.
    # The newline at the end of a line comment is stripped too (no token
    # is returned).
    def t_linecomment_ENDCOMMENT(t):
        r"\n"
        t.lexer.begin("INITIAL")

    # Ignore anything after a % on a line.
    def t_linecomment_CHAR(t):
        r"."
        pass

    # Error rule for every state. Defining it silences PLY's
    # "No error rule is defined" warnings; it raises the same LexError
    # PLY raises by default, so failure behaviour is unchanged.
    def t_ANY_error(t):
        raise ply.lex.LexError(
            "Illegal character %r at index %d" % (t.value[0], t.lexpos),
            t.value)

    lexer = ply.lex.lex()
    lexer.input(source)
    # Only rules that return their token contribute to the output.
    return u"".join(tok.value for tok in lexer)
def main():
    """Command-line entry point: strip comments from a file, write to stdout.

    Arguments (parsed from ``sys.argv``):
      filename        the file to strip comments from
      --encoding/-e   encoding used to read the file and to write the
                      result (default ``utf-8``)
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import line.
    import sys

    parser = argparse.ArgumentParser()
    parser.add_argument('filename', help='the file to strip comments from')
    parser.add_argument('--encoding', '-e', default='utf-8')
    args = parser.parse_args()

    with io.open(args.filename, encoding=args.encoding) as f:
        source = f.read()
    stripped = strip_comments(source)

    # print() encodes with whatever codec stdout happens to have, which can
    # be ASCII when output is redirected and then fails on non-ASCII text.
    # Write through a text wrapper that uses the requested encoding instead.
    try:
        stdout_fd = sys.stdout.fileno()
    except (AttributeError, io.UnsupportedOperation):
        # stdout has been replaced by an object without a file descriptor;
        # fall back to the plain print().
        print(stripped)
        return

    sys.stdout.flush()
    # closefd=False: leave the real stdout descriptor open when the wrapper
    # is closed at the end of the with-block.
    with io.open(stdout_fd, 'w', encoding=args.encoding, closefd=False) as out:
        out.write(stripped)
        # print() appended a newline; keep it so the output is unchanged.
        out.write(u"\n")
# Run the command-line interface only when executed as a script,
# not when this file is imported as a module.
if __name__ == '__main__':
    main()
Console says:
WARNING: No t_error rule is defined
WARNING: No error rule is defined for exclusive state 'verbatim'
WARNING: No error rule is defined for exclusive state 'commentenv'
WARNING: No error rule is defined for exclusive state 'linecomment'
Traceback (most recent call last):
File "/Users/evlogii/Downloads/strip_comments.py", line 111, in
main()
File "/Users/evlogii/Downloads/strip_comments.py", line 108, in main
print(strip_comments(source))
UnicodeEncodeError: 'ascii' codec can't encode characters in position 279-285: ordinal not in range(128)
My file is in UTF-8 (it contains some Cyrillic characters) and I just run python stripcomments.py input.tex > output.tex
any suggestions? =/
The % at the end of the 2nd line needs to be preserved or the code can't run. How can I solve that?
\makeatletter
\def\alloc@#1#2#3#4#5%
{\ifnum\count1#1<#4% make sure there's still room
\allocationnumber\count1#1
\global\advance\count1#1\@ne
\global#3#5\allocationnumber
\wlog{\string#5=\string#2\the\allocationnumber}%
\else\ifnum#1<6
\def\etex@dummy@definition{}% <-- code added
\begingroup \escapechar\m@ne
\expandafter\alloc@@\expandafter{\string#2}#5%
\else\errmessage{No room for a new #2}\fi\fi
}
\makeatother
UPDATE:
I've modified the snippet to solve my problem, along with @m3phisto's suggestion.
https://gist.github.com/dzhuang/dc34cdd7efa43e5ecc1dc981cc906c85
Thank you for the useful code.
For anyone stumbling over this in the future: latexpand can reliably remove comments, too.
Wow, I completely forgot about this and didn't see all these comments. Thanks to everyone who has made improvements. I've added a comment to clarify the licensing situation.
To remove all the comments from a LaTeX file, another option is to use arxiv-latex-cleaner. It is actively maintained, has 1.2k GitHub stars, and is written in Python, but you don't need to know Python to use it.
I believe the line "def t_ANY_BACKSLASH(t):" should be changed to "def t_BACKSLASH(t):". Otherwise, double backslashes appearing in a linecomment are written to the output. Apart from that, very useful and easily extensible. Thank you!