Created
November 28, 2020 16:42
-
-
Save aGHz/5e7091d07442ec6e72731813576a6f77 to your computer and use it in GitHub Desktop.
Parse vim syntax scripts
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import pprint
import re
import sys
# Separator used to split a logical `:syntax` line into tokens.
TOK = re.compile(r'\s+')

# Flag-style arguments: when one of these appears as a token, the parser
# records it on the spec as `spec[name] = True`.
SYN_ARGS = [
    'conceal', 'concealends',
    'display', 'transparent', 'oneline', 'fold',
    'contained', 'keepend', 'extend',
    'skipwhite', 'skipnl', 'skipempty',
    # Bad:
    'skipwhitecontained',  # lex.vim
]

# `name=value[,value...]` arguments: the parser stores the comma-separated
# values as a list under `spec[name]`.
SYN_OPTS = [
    'contains', 'containedin', 'nextgroup', 'cchar',
    'add', 'remove',
    # Bad:
    'nextGroup',  # make.vim
]

# Flags that are attached to the pattern(s) parsed after them rather than to
# the spec itself.
SYN_PATTERN_ARGS = ['excludenl']

# `name=value` settings that are attached to the region patterns parsed
# after them; the value `NONE` clears the setting again.
SYN_PATTERN_OPTS = [
    'matchgroup',
    # Bad:
    'matchGroup',  # sh.vim
]
def syn_args_match(a, b):
    """Return True when two parsed specs carry the same syntax arguments.

    Only the keys listed in ``SYN_ARGS`` and ``SYN_OPTS`` are compared; a key
    missing from a spec compares equal to ``None``. Any other key (keywords,
    patterns, ...) is ignored.

    Args:
        a: A spec dict.
        b: Another spec dict.

    Returns:
        ``True`` if every compared key has equal values in both specs.
    """
    return all(a.get(key) == b.get(key) for key in (*SYN_ARGS, *SYN_OPTS))
# One pattern offset as it may trail a Vim syntax pattern, e.g. `hs=s+1`.
_RE_OFFSET = r'(?:ms|me|hs|he|rs|re|lc)=(?:[se](?:[+-]\d+)?|\d+)'
# A full comma-separated offset list, e.g. `ms=s+1,me=e-1`.
_RE_OFFSETS = re.compile(rf'{_RE_OFFSET}(?:,{_RE_OFFSET})*')


def _unescaped_delimiters(s, delimiter):
    """Return the indexes, after the opening one, of unescaped delimiters."""
    found = []
    i = 1
    while i < len(s):
        if s[i] == '\\':
            # A backslash escapes the next character, whatever it is.
            i += 2
            continue
        if s[i] == delimiter:
            found.append(i)
        i += 1
    return found


def parse_re(s, tokens):
    """Parse a delimited Vim pattern such as ``/re/`` or ``+re+hs=s+1``.

    The first character of *s* is the delimiter. If *s* holds no closing
    delimiter yet, the pattern contained whitespace and was split by the
    tokenizer, so further tokens are pulled from *tokens* (joined with a
    single space) until one appears.

    Args:
        s: The token that starts the pattern.
        tokens: Remaining tokens of the line; consumed from the front as
            needed (the list is mutated).

    Returns:
        A dict with key ``'pattern'`` (the text between the delimiters, with
        escaped delimiters unescaped) and, when something trails the closing
        delimiter, ``'options'`` mapping each comma-separated ``k=v`` entry
        to its value.

    Raises:
        ValueError: If *s* is empty.
        IndexError: If *tokens* runs out before a closing delimiter is found.
    """
    if not s:
        raise ValueError('empty pattern token')
    re_limit = s[0]

    closers = _unescaped_delimiters(s, re_limit)
    while not closers:
        # There was a space in the regex, pull in the next token
        # !!! Might have been some other whitespace character but we don't care
        try:
            s += ' ' + tokens.pop(0)
        except IndexError:
            print(f'Regex: "{s}"', file=sys.stderr)
            raise
        closers = _unescaped_delimiters(s, re_limit)

    # The delimiter may also occur inside the trailing offsets (`+x+hs=s+1`),
    # so prefer the last delimiter whose tail is empty or a valid offset
    # list; fall back to the last unescaped delimiter otherwise.
    re_end = closers[-1]
    for index in reversed(closers):
        tail = s[index + 1:]
        if not tail or _RE_OFFSETS.fullmatch(tail):
            re_end = index
            break

    pattern = s[1:re_end].replace('\\' + re_limit, re_limit)
    result = {'pattern': pattern}

    re_options = s[re_end + 1:]
    if re_options:
        options = {}
        for opt in re_options.split(','):
            key, _, value = opt.partition('=')
            options[key] = value
        result['options'] = options
    return result
def error(s):
    """Print the diagnostic *s* to stderr and flush immediately."""
    print(s, file=sys.stderr, flush=True)
def load_syn(f):
    """Parse the ``syn ...`` commands found in the file *f*.

    Lines starting with ``syn`` are collected together with their
    ``\\`` continuation lines and parsed one logical line at a time.
    Subcommands other than ``keyword``, ``match``, ``region`` and ``cluster``
    are reported through ``error()`` and skipped.

    Args:
        f: Path of the file to read (passed to ``open()``).

    Returns:
        A dict with the keys ``'keyword'``, ``'match'``, ``'region'`` and
        ``'cluster'``. The first three map a group name to a list of spec
        dicts; ``'cluster'`` maps a cluster name to a sorted list of member
        names. A region spec keeps its ``start`` and ``end`` patterns in
        lists (a region may declare several of each).

    Raises:
        IndexError: If a line ends in the middle of an option or pattern.
    """
    prev_line = ''
    syntax = {
        'keyword': {},
        'match': {},
        'region': {},
        'cluster': {},
    }

    def next_value(tokens, line):
        """Pop the value of an option written as `opt= value`."""
        try:
            return tokens.pop(0)
        except IndexError:
            error(line)
            raise

    def parse_pattern(text, tokens, line):
        """Run parse_re, reporting the offending line if it fails."""
        try:
            return parse_re(text, tokens)
        except Exception:
            error(line)
            error(tokens)
            raise

    def parse_prev_line():
        nonlocal prev_line
        # Consume the pending line up front so every exit path clears it.
        line, prev_line = prev_line, ''
        if not line:
            return

        # Drop the leading `syn`/`syntax` word. Empty tokens (left behind by
        # e.g. an empty continuation line) carry no information.
        tokens = [t for t in TOK.split(line)[1:] if t]
        if not tokens:
            error(f'Incomplete syntax command: {line}')
            return
        kind = tokens.pop(0)
        if kind not in syntax:
            error(f'Unhandled syntax subcommand {kind}')
            return
        if not tokens:
            error(f'Missing group name: {line}')
            return

        is_keyword = kind == 'keyword'
        is_match = kind == 'match'
        is_region = kind == 'region'
        is_cluster = kind == 'cluster'

        name = tokens.pop(0)
        spec = {}
        if is_keyword:
            spec['keywords'] = []
        elif is_region:
            spec['start'] = []
            spec['end'] = []

        # Parse the tokens in the line
        pattern_args = []
        pattern_opts = {}
        seen_match = False
        while tokens:
            token = tokens.pop(0)

            if token in SYN_ARGS:
                spec[token] = True
                continue

            opt, sep, val = token.partition('=')
            if sep and opt in SYN_OPTS:
                if not val:
                    # Sometimes people like writing `start= /re/`
                    val = next_value(tokens, line)
                # `contains=a, b` was split after the comma; glue it back.
                while val.endswith(',') and tokens:
                    val += tokens.pop(0)
                spec[opt] = val.split(',')  # !!! wrong for cchar=,
                continue

            if is_keyword:
                if token.startswith('"'):
                    # Comment to the end of the line
                    break
                spec['keywords'].append(token)
            elif is_match:
                if token in SYN_PATTERN_ARGS:
                    pattern_args.append(token)
                    continue
                # no pattern opts for match
                if seen_match:
                    # Anything after the pattern that is not a known
                    # argument ends the command.
                    break
                regex = parse_pattern(token, tokens, line)
                for arg in pattern_args:
                    regex[arg] = True
                spec['match'] = regex
                seen_match = True
            elif is_region:
                if token.startswith('"'):
                    # Comment to the end of the line
                    break
                if token in SYN_PATTERN_ARGS:
                    pattern_args.append(token)
                    continue
                if not val:
                    # Sometimes people like writing `start= /re/`
                    val = next_value(tokens, line)
                if opt in SYN_PATTERN_OPTS:
                    if val == 'NONE':
                        pattern_opts.pop(opt, None)
                    else:
                        pattern_opts[opt] = val
                    continue
                # opt is one of start, skip or end
                regex = parse_pattern(val, tokens, line)
                for arg in pattern_args:
                    regex[arg] = True
                regex.update(pattern_opts)
                if opt in ('start', 'end'):
                    # Keep every start/end pattern instead of overwriting
                    # the list with the last one seen.
                    spec[opt].append(regex)
                else:
                    spec[opt] = regex
            elif is_cluster:
                pass  # nothing to do, {contains,add,remove} handled by SYN_OPTS

        # Save the parsed syntax spec
        group = syntax[kind]
        if is_cluster:
            # `contains=` replaces the member list; `add=`/`remove=` amend
            # the existing one.
            members = set(spec.get('contains') or group.get(name, []))
            members |= set(spec.get('add', []))
            members -= set(spec.get('remove', []))
            group[name] = sorted(members)
        elif name not in group:
            group[name] = [spec]
        elif is_keyword:
            # Merge keywords into an earlier spec with identical arguments.
            for prev_spec in group[name]:
                if syn_args_match(spec, prev_spec):
                    prev_spec['keywords'] += spec['keywords']
                    break
            else:
                group[name].append(spec)
        else:
            group[name].append(spec)

    def handle_line(line):
        nonlocal prev_line
        # Sometimes people like writing `opt = val`
        # !!! Might change regexes but we don't care
        content = line.strip().replace(' = ', '=')

        # Empty line
        if not content:
            return
        # Comment
        if content[0] == '"':
            return
        # Line continuation
        if content[0] == '\\':
            if prev_line:
                if not prev_line.endswith(','):
                    prev_line += ' '
                prev_line += content[1:].strip()
            return

        # A new command starts: flush whatever was pending.
        parse_prev_line()
        if content.startswith('syn'):
            prev_line = content

    with open(f) as fp:
        for line in fp:
            handle_line(line)
    # Flush the last pending command.
    parse_prev_line()
    return syntax
def dereference_clusters(syntax):
    """Replace ``@cluster`` references with the groups they stand for.

    Works in place on *syntax*: every cluster whose member list contains a
    reference, and every ``contains`` list of a match or region spec that
    contains one, is replaced by the sorted list of plain group names
    reachable through the references. References to clusters that are not
    defined in ``syntax['cluster']`` are dropped. Lists without references
    are left untouched. Cyclic or self-referential clusters are handled
    (each cluster is expanded at most once per resolution).

    Args:
        syntax: Dict with at least the keys ``'cluster'``, ``'match'`` and
            ``'region'``, shaped like the result of ``load_syn``.
    """
    clusters = syntax['cluster']

    def has_refs(members):
        return any(member.startswith('@') for member in members)

    def resolve(members, seen):
        """Collect the plain groups reachable from *members*.

        *seen* holds the cluster names already expanded, which is what
        keeps reference cycles from looping forever.
        """
        groups = set()
        pending = list(members)
        while pending:
            member = pending.pop()
            if not member.startswith('@'):
                groups.add(member)
                continue
            ref = member[1:]
            if ref in seen or ref not in clusters:
                continue
            seen.add(ref)
            pending.extend(clusters[ref])
        return groups

    # Resolve against the untouched definitions first, then store, so the
    # dict is not modified while it is being read.
    resolved = {
        name: sorted(resolve(members, {name}))
        for name, members in clusters.items()
        if has_refs(members)
    }
    clusters.update(resolved)

    for kind in ('match', 'region'):
        for specs in syntax[kind].values():
            for spec in specs:
                members = spec.get('contains')
                if members and has_refs(members):
                    spec['contains'] = sorted(resolve(members, set()))
def main(*files):
    """Report matches that contain a transparent region, for each file.

    Every file is parsed with ``load_syn`` and its cluster references are
    expanded with ``dereference_clusters``. For each match whose
    ``contains`` list names a region with at least one ``transparent``
    spec, one line is printed to stdout.

    Args:
        *files: Paths of the syntax files to check.
    """
    for f in files:
        syntax = load_syn(f)
        dereference_clusters(syntax)
        # print(json.dumps(syntax, indent=2, ensure_ascii=False))

        regions = syntax['region']
        for matchname, matches in syntax['match'].items():
            for match in matches:
                for member in match.get('contains') or ():
                    # Members that are not regions have no specs to inspect.
                    if any(r.get('transparent') for r in regions.get(member, ())):
                        print(f'{f}: match {matchname} contains transparent region {member}')

        # Disabled: the same check for regions containing transparent regions.
        # for regionname, regiones in syntax['region'].items():
        #     for region in regiones:
        #         members = region.get('contains')
        #         if not members:
        #             continue
        #         for member in members:
        #             if member not in syntax['region']:
        #                 continue
        #             if any(r for r in syntax['region'][member] if r.get('transparent')):
        #                 print(f'{f}: region {regionname} contains transparent region {member}')
# Usage: python syntree.py *.vim 2>/dev/null
if __name__ == '__main__':
    main(*sys.argv[1:])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment