aGHz · November 28, 2020 16:42
diff --git a/syntree.py b/syntree.py
 import json
 import pprint
 import re
 import sys


 # Whitespace run that separates the tokens of a `:syntax` command line.
 TOK = re.compile(r'\s+')

 # Valueless arguments of `:syntax` items; load_syn records each one it
 # meets as `spec[name] = True`.
 SYN_ARGS = [
    'conceal',
    'concealends',
    'display',
    'transparent',
    'oneline',
    'fold',
    'contained',
    'keepend',
    'extend',
    'skipwhite',
    'skipnl',
    'skipempty',
    # Bad spelling, tolerated because it is seen in lex.vim:
    'skipwhitecontained',
 ]

 # `name=value` options of `:syntax` items; load_syn splits the value on
 # commas.
 SYN_OPTS = [
    'contains',
    'containedin',
    'nextgroup',
    'cchar',
    'add',
    'remove',
    # Bad spelling, tolerated because it is seen in make.vim:
    'nextGroup',
 ]

 # Valueless arguments that are attached to the patterns following them.
 SYN_PATTERN_ARGS = ['excludenl']

 # `name=value` options that are attached to the region patterns following
 # them.
 SYN_PATTERN_OPTS = [
    'matchgroup',
    # Bad spelling, tolerated because it is seen in sh.vim:
    'matchGroup',
 ]

 def syn_args_match(a, b):
    """Tell whether two parsed specs agree on every syntax argument/option.

    Every name listed in ``SYN_ARGS`` and ``SYN_OPTS`` is looked up in both
    mappings; a missing key counts as ``None``.  The first difference makes
    the result ``False``.
    """
    return all(
        a.get(key) == b.get(key)
        for names in (SYN_ARGS, SYN_OPTS)
        for key in names
    )

 # One pattern offset such as ``ms=s+1``, ``he=e-2`` or ``lc=3``.
 _RE_OFFSET = r'(?:ms|me|hs|he|rs|re|lc)=(?:[se](?:[+-]\d+)?|\d+)'
 # A complete comma-separated offset list, as it may follow a pattern.
 _RE_OFFSETS = re.compile(_RE_OFFSET + r'(?:,' + _RE_OFFSET + r')*')


 def _closing_delimiter(s, delimiter):
    """Return the index of the delimiter closing the pattern in *s*, or -1.

    ``s[0]`` is the opening delimiter.  Delimiters preceded by an escaping
    backslash are ignored.  Among the remaining candidates the first one
    followed by nothing or by a well-formed offset list wins; this keeps a
    delimiter character inside an offset (``+re+hs=s+1``) from being taken
    for the end of the pattern.  If no candidate qualifies, the last one is
    used.
    """
    candidates = []
    i = 1
    while i < len(s):
        ch = s[i]
        if ch == '\\' and delimiter != '\\':
            # A backslash escapes the next character, whatever it is.
            i += 2
            continue
        if ch == delimiter:
            candidates.append(i)
        i += 1

    for index in candidates:
        tail = s[index + 1:]
        if not tail or _RE_OFFSETS.fullmatch(tail):
            return index
    return candidates[-1] if candidates else -1


 def parse_re(s, tokens):
    """Parse a delimited Vim pattern and whatever trails it.

    Args:
        s: Token that starts the pattern; its first character is the
            delimiter (as in ``/re/`` or ``+re+``).
        tokens: Remaining tokens of the command.  Tokens are popped from the
            front and re-joined with single spaces for as long as the
            pattern is not closed, i.e. when it contained whitespace.

    Returns:
        ``{'pattern': text}`` with escaped delimiters unescaped, plus an
        ``'options'`` dict (name -> value) when anything follows the closing
        delimiter.

    Raises:
        IndexError: If *s* is empty, or *tokens* runs out before a closing
            delimiter is found.
    """
    delimiter = s[0]
    end = _closing_delimiter(s, delimiter)
    while end < 0:
        # There was a space in the regex, pull in the next token
        # !!! Might have been some other whitespace character but we don't care
        try:
            s += ' ' + tokens.pop(0)
        except IndexError:
            print(f'Regex: "{s}"')
            raise
        end = _closing_delimiter(s, delimiter)

    pattern = s[1:end].replace('\\' + delimiter, delimiter)
    result = {'pattern': pattern}

    trailer = s[end + 1:]
    if trailer:
        options = {}
        for opt in trailer.split(','):
            key, _, value = opt.partition('=')
            options[key] = value
        result['options'] = options
    return result

 def error(s):
    """Write *s* as one line to standard error and flush it immediately."""
    stream = sys.stderr
    print(s, file=stream)
    stream.flush()


 def load_syn(f):
    """Parse the ``syn...`` commands of a Vim syntax file.

    The file is read line by line.  A line starting with ``syn`` opens a new
    command, following lines that start with a backslash are joined onto it,
    and the assembled command is parsed once the next ordinary line (or the
    end of the file) is reached.  Subcommands other than ``keyword``,
    ``match``, ``region`` and ``cluster`` are reported through ``error`` and
    skipped, as are commands lacking a subcommand or a group name.

    Args:
        f: Path of the syntax file.

    Returns:
        A dict with the keys ``'keyword'``, ``'match'``, ``'region'`` and
        ``'cluster'``.  The first three map a group name to a list of parsed
        specs (dicts); every pattern of a region is collected in a list under
        its ``start``/``skip``/``end`` key.  ``'cluster'`` maps a cluster
        name to the sorted list of its members.

    Raises:
        IndexError: If a command ends in the middle of an option or pattern;
            the offending command is printed first.
    """
    prev_line = ''
    syntax = {
        'keyword': {},
        'match': {},
        'region': {},
        'cluster': {},
    }

    def next_value(tokens):
        """Pop the value that follows a dangling ``opt=`` token."""
        # Sometimes people like writing `start= /re/`
        try:
            return tokens.pop(0)
        except IndexError:
            print(prev_line)
            raise

    def parse_prev_line():
        """Parse the buffered command into ``syntax`` and clear the buffer."""
        nonlocal prev_line
        if not prev_line:
            return

        tokens = TOK.split(prev_line)[1:]

        # A bare `syn` has no subcommand to dispatch on.
        if not tokens:
            error(f'Missing syntax subcommand: {prev_line}')
            prev_line = ''
            return
        kind = tokens.pop(0)
        if kind not in syntax:
            error(f'Unhandled syntax subcommand {kind}')
            prev_line = ''
            return
        if not tokens:
            error(f'Missing group name: {prev_line}')
            prev_line = ''
            return
        name = tokens.pop(0)

        spec = {}
        if kind == 'keyword':
            spec['keywords'] = []
        elif kind == 'region':
            spec['start'] = []
            spec['end'] = []

        # Parse the tokens in the line
        pattern_args = []
        pattern_opts = {}
        seen_match = False
        while tokens:
            token = tokens.pop(0)
            if token in SYN_ARGS:
                spec[token] = True
                continue

            opt, _, val = token.partition('=')
            if '=' in token and opt in SYN_OPTS:
                if not val:
                    val = next_value(tokens)
                # A list broken after a comma continues in the next token.
                while val.endswith(',') and tokens:
                    val += tokens.pop(0)
                spec[opt] = val.split(',')  # !!! wrong for cchar=,
                continue

            if kind == 'keyword':
                if token.startswith('"'):
                    # Comment to the end of the line
                    break
                spec['keywords'].append(token)
            elif kind == 'match':
                if token in SYN_PATTERN_ARGS:
                    pattern_args.append(token)
                    continue
                # no pattern opts for match

                if seen_match:
                    break

                try:
                    regex = parse_re(token, tokens)
                except Exception:
                    print(prev_line)
                    raise
                for arg in pattern_args:
                    regex[arg] = True
                spec['match'] = regex
                seen_match = True
            elif kind == 'region':
                if token.startswith('"'):
                    # Comment to the end of the line
                    break

                if token in SYN_PATTERN_ARGS:
                    pattern_args.append(token)
                    continue

                if not val:
                    val = next_value(tokens)

                if opt in SYN_PATTERN_OPTS:
                    if val == 'NONE':
                        pattern_opts.pop(opt, None)
                    else:
                        pattern_opts[opt] = val
                    continue

                # opt is one of start, skip or end
                try:
                    regex = parse_re(val, tokens)
                except Exception:
                    print(prev_line)
                    print(tokens)
                    raise
                for arg in pattern_args:
                    regex[arg] = True
                regex.update(pattern_opts)
                # Append instead of assigning so that several start/end
                # patterns of one region are all kept.
                spec.setdefault(opt, []).append(regex)
            # cluster: nothing to do, {contains,add,remove} handled by SYN_OPTS

        # Save the parsed syntax spec
        group = syntax[kind]
        if kind == 'cluster':
            members = set(spec.get('contains') or group.get(name, []))
            members |= set(spec.get('add', []))
            members -= set(spec.get('remove', []))
            # Sorted so the result does not depend on set iteration order.
            group[name] = sorted(members)
        elif name not in group:
            group[name] = [spec]
        elif kind == 'keyword':
            # Merge keywords into an earlier spec with identical arguments.
            for prev_spec in group[name]:
                if syn_args_match(spec, prev_spec):
                    prev_spec['keywords'] += spec['keywords']
                    break
            else:
                group[name].append(spec)
        else:
            group[name].append(spec)

        prev_line = ''

    def handle_line(line):
        """Buffer, extend or flush the pending command for one input line."""
        nonlocal prev_line
        # Sometimes people like writing `opt = val`
        # !!! Might change regexes but we don't care
        content = line.strip().replace(' = ', '=')

        # Empty line or comment
        if not content or content[0] == '"':
            return
        # Line continuation
        if content[0] == '\\':
            if prev_line:
                if not prev_line.endswith(','):
                    prev_line += ' '
                prev_line += content[1:].strip()
            return

        parse_prev_line()

        if content.startswith('syn'):
            prev_line = content

    with open(f) as fp:
        for line in fp:
            handle_line(line)
    # Flush the command that was still pending at end of file.
    parse_prev_line()

    return syntax

 def dereference_clusters(syntax):
    """Replace ``@cluster`` references by the groups they stand for.

    Works in place on the structure returned by ``load_syn``:

    * every cluster that mentions other clusters is replaced by the sorted
      list of plain group names reachable through those references;
    * every ``contains`` list of a match or region that mentions clusters is
      replaced likewise.

    References to unknown clusters are dropped.  Self-referencing or
    mutually referencing clusters are tolerated: a cluster already being
    expanded is not entered again.  Lists without any ``@`` reference are
    left untouched.

    Args:
        syntax: Dict with at least the keys ``'cluster'``, ``'match'`` and
            ``'region'``.

    Returns:
        None.
    """
    clusters = syntax['cluster']

    def has_refs(members):
        """Tell whether *members* mentions at least one cluster."""
        return any(member.startswith('@') for member in members)

    def expand(members, seen):
        """Return the plain group names reachable from *members*."""
        groups = set()
        for member in members:
            if not member.startswith('@'):
                groups.add(member)
                continue
            ref = member[1:]
            # `seen` holds the clusters on the current expansion path, so
            # cyclic definitions cannot recurse forever.
            if ref in clusters and ref not in seen:
                groups |= expand(clusters[ref], seen | {ref})
        return groups

    # Compute every expansion from the original definitions before storing
    # any of them.
    expanded = {
        name: sorted(expand(members, {name}))
        for name, members in clusters.items()
        if has_refs(members)
    }
    clusters.update(expanded)

    for kind in ('match', 'region'):
        for specs in syntax[kind].values():
            for spec in specs:
                members = spec.get('contains')
                if members and has_refs(members):
                    spec['contains'] = sorted(expand(members, frozenset()))


 def main(*files):
    """Report matches that contain a transparent region, file by file.

    Each file is loaded with ``load_syn`` and its cluster references are
    resolved with ``dereference_clusters``.  For every ``contains`` member
    of a match that names a region with at least one ``transparent`` spec,
    one line is printed to standard output.
    """
    for f in files:
        syntax = load_syn(f)
        dereference_clusters(syntax)
        regions = syntax['region']

        # Debug aid: print(json.dumps(syntax, indent=2, ensure_ascii=False))

        for matchname, matches in syntax['match'].items():
            for match in matches:
                for member in match.get('contains') or ():
                    if member not in regions:
                        continue
                    if any(r.get('transparent') for r in regions[member]):
                        print(f'{f}: match {matchname} contains transparent region {member}')

        # NOTE: the same check for regions containing transparent regions
        # (reported as 'region ... contains transparent region ...') is
        # currently disabled.


 # Usage: python syntree.py *.vim 2>/dev/null
 # (redirecting stderr hides the messages written by error()).
 if __name__ == '__main__':
    cli_files = sys.argv[1:]
    main(*cli_files)
	import json
	import pprint
	import re
	import sys


	# Whitespace run that separates the tokens of a `:syntax` command line.
	TOK = re.compile(r'\s+')

	# Valueless arguments of `:syntax` items; load_syn records each one it
	# meets as `spec[name] = True`.
	SYN_ARGS = [
	    'conceal',
	    'concealends',
	    'display',
	    'transparent',
	    'oneline',
	    'fold',
	    'contained',
	    'keepend',
	    'extend',
	    'skipwhite',
	    'skipnl',
	    'skipempty',
	    # Bad spelling, tolerated because it is seen in lex.vim:
	    'skipwhitecontained',
	]

	# `name=value` options of `:syntax` items; load_syn splits the value on
	# commas.
	SYN_OPTS = [
	    'contains',
	    'containedin',
	    'nextgroup',
	    'cchar',
	    'add',
	    'remove',
	    # Bad spelling, tolerated because it is seen in make.vim:
	    'nextGroup',
	]

	# Valueless arguments that are attached to the patterns following them.
	SYN_PATTERN_ARGS = ['excludenl']

	# `name=value` options that are attached to the region patterns following
	# them.
	SYN_PATTERN_OPTS = [
	    'matchgroup',
	    # Bad spelling, tolerated because it is seen in sh.vim:
	    'matchGroup',
	]

	def syn_args_match(a, b):
	    """Tell whether two parsed specs agree on every syntax argument/option.

	    Every name listed in ``SYN_ARGS`` and ``SYN_OPTS`` is looked up in both
	    mappings; a missing key counts as ``None``.  The first difference makes
	    the result ``False``.
	    """
	    return all(
	        a.get(key) == b.get(key)
	        for names in (SYN_ARGS, SYN_OPTS)
	        for key in names
	    )

	# One pattern offset such as ``ms=s+1``, ``he=e-2`` or ``lc=3``.
	_RE_OFFSET = r'(?:ms|me|hs|he|rs|re|lc)=(?:[se](?:[+-]\d+)?|\d+)'
	# A complete comma-separated offset list, as it may follow a pattern.
	_RE_OFFSETS = re.compile(_RE_OFFSET + r'(?:,' + _RE_OFFSET + r')*')


	def _closing_delimiter(s, delimiter):
	    """Return the index of the delimiter closing the pattern in *s*, or -1.

	    ``s[0]`` is the opening delimiter.  Delimiters preceded by an escaping
	    backslash are ignored.  Among the remaining candidates the first one
	    followed by nothing or by a well-formed offset list wins; this keeps a
	    delimiter character inside an offset (``+re+hs=s+1``) from being taken
	    for the end of the pattern.  If no candidate qualifies, the last one is
	    used.
	    """
	    candidates = []
	    i = 1
	    while i < len(s):
	        ch = s[i]
	        if ch == '\\' and delimiter != '\\':
	            # A backslash escapes the next character, whatever it is.
	            i += 2
	            continue
	        if ch == delimiter:
	            candidates.append(i)
	        i += 1

	    for index in candidates:
	        tail = s[index + 1:]
	        if not tail or _RE_OFFSETS.fullmatch(tail):
	            return index
	    return candidates[-1] if candidates else -1


	def parse_re(s, tokens):
	    """Parse a delimited Vim pattern and whatever trails it.

	    Args:
	        s: Token that starts the pattern; its first character is the
	            delimiter (as in ``/re/`` or ``+re+``).
	        tokens: Remaining tokens of the command.  Tokens are popped from the
	            front and re-joined with single spaces for as long as the
	            pattern is not closed, i.e. when it contained whitespace.

	    Returns:
	        ``{'pattern': text}`` with escaped delimiters unescaped, plus an
	        ``'options'`` dict (name -> value) when anything follows the closing
	        delimiter.

	    Raises:
	        IndexError: If *s* is empty, or *tokens* runs out before a closing
	            delimiter is found.
	    """
	    delimiter = s[0]
	    end = _closing_delimiter(s, delimiter)
	    while end < 0:
	        # There was a space in the regex, pull in the next token
	        # !!! Might have been some other whitespace character but we don't care
	        try:
	            s += ' ' + tokens.pop(0)
	        except IndexError:
	            print(f'Regex: "{s}"')
	            raise
	        end = _closing_delimiter(s, delimiter)

	    pattern = s[1:end].replace('\\' + delimiter, delimiter)
	    result = {'pattern': pattern}

	    trailer = s[end + 1:]
	    if trailer:
	        options = {}
	        for opt in trailer.split(','):
	            key, _, value = opt.partition('=')
	            options[key] = value
	        result['options'] = options
	    return result

	def error(s):
	    """Write *s* as one line to standard error and flush it immediately."""
	    stream = sys.stderr
	    print(s, file=stream)
	    stream.flush()


	def load_syn(f):
	    """Parse the ``syn...`` commands of a Vim syntax file.

	    The file is read line by line.  A line starting with ``syn`` opens a new
	    command, following lines that start with a backslash are joined onto it,
	    and the assembled command is parsed once the next ordinary line (or the
	    end of the file) is reached.  Subcommands other than ``keyword``,
	    ``match``, ``region`` and ``cluster`` are reported through ``error`` and
	    skipped, as are commands lacking a subcommand or a group name.

	    Args:
	        f: Path of the syntax file.

	    Returns:
	        A dict with the keys ``'keyword'``, ``'match'``, ``'region'`` and
	        ``'cluster'``.  The first three map a group name to a list of parsed
	        specs (dicts); every pattern of a region is collected in a list under
	        its ``start``/``skip``/``end`` key.  ``'cluster'`` maps a cluster
	        name to the sorted list of its members.

	    Raises:
	        IndexError: If a command ends in the middle of an option or pattern;
	            the offending command is printed first.
	    """
	    prev_line = ''
	    syntax = {
	        'keyword': {},
	        'match': {},
	        'region': {},
	        'cluster': {},
	    }

	    def next_value(tokens):
	        """Pop the value that follows a dangling ``opt=`` token."""
	        # Sometimes people like writing `start= /re/`
	        try:
	            return tokens.pop(0)
	        except IndexError:
	            print(prev_line)
	            raise

	    def parse_prev_line():
	        """Parse the buffered command into ``syntax`` and clear the buffer."""
	        nonlocal prev_line
	        if not prev_line:
	            return

	        tokens = TOK.split(prev_line)[1:]

	        # A bare `syn` has no subcommand to dispatch on.
	        if not tokens:
	            error(f'Missing syntax subcommand: {prev_line}')
	            prev_line = ''
	            return
	        kind = tokens.pop(0)
	        if kind not in syntax:
	            error(f'Unhandled syntax subcommand {kind}')
	            prev_line = ''
	            return
	        if not tokens:
	            error(f'Missing group name: {prev_line}')
	            prev_line = ''
	            return
	        name = tokens.pop(0)

	        spec = {}
	        if kind == 'keyword':
	            spec['keywords'] = []
	        elif kind == 'region':
	            spec['start'] = []
	            spec['end'] = []

	        # Parse the tokens in the line
	        pattern_args = []
	        pattern_opts = {}
	        seen_match = False
	        while tokens:
	            token = tokens.pop(0)
	            if token in SYN_ARGS:
	                spec[token] = True
	                continue

	            opt, _, val = token.partition('=')
	            if '=' in token and opt in SYN_OPTS:
	                if not val:
	                    val = next_value(tokens)
	                # A list broken after a comma continues in the next token.
	                while val.endswith(',') and tokens:
	                    val += tokens.pop(0)
	                spec[opt] = val.split(',')  # !!! wrong for cchar=,
	                continue

	            if kind == 'keyword':
	                if token.startswith('"'):
	                    # Comment to the end of the line
	                    break
	                spec['keywords'].append(token)
	            elif kind == 'match':
	                if token in SYN_PATTERN_ARGS:
	                    pattern_args.append(token)
	                    continue
	                # no pattern opts for match

	                if seen_match:
	                    break

	                try:
	                    regex = parse_re(token, tokens)
	                except Exception:
	                    print(prev_line)
	                    raise
	                for arg in pattern_args:
	                    regex[arg] = True
	                spec['match'] = regex
	                seen_match = True
	            elif kind == 'region':
	                if token.startswith('"'):
	                    # Comment to the end of the line
	                    break

	                if token in SYN_PATTERN_ARGS:
	                    pattern_args.append(token)
	                    continue

	                if not val:
	                    val = next_value(tokens)

	                if opt in SYN_PATTERN_OPTS:
	                    if val == 'NONE':
	                        pattern_opts.pop(opt, None)
	                    else:
	                        pattern_opts[opt] = val
	                    continue

	                # opt is one of start, skip or end
	                try:
	                    regex = parse_re(val, tokens)
	                except Exception:
	                    print(prev_line)
	                    print(tokens)
	                    raise
	                for arg in pattern_args:
	                    regex[arg] = True
	                regex.update(pattern_opts)
	                # Append instead of assigning so that several start/end
	                # patterns of one region are all kept.
	                spec.setdefault(opt, []).append(regex)
	            # cluster: nothing to do, {contains,add,remove} handled by SYN_OPTS

	        # Save the parsed syntax spec
	        group = syntax[kind]
	        if kind == 'cluster':
	            members = set(spec.get('contains') or group.get(name, []))
	            members |= set(spec.get('add', []))
	            members -= set(spec.get('remove', []))
	            # Sorted so the result does not depend on set iteration order.
	            group[name] = sorted(members)
	        elif name not in group:
	            group[name] = [spec]
	        elif kind == 'keyword':
	            # Merge keywords into an earlier spec with identical arguments.
	            for prev_spec in group[name]:
	                if syn_args_match(spec, prev_spec):
	                    prev_spec['keywords'] += spec['keywords']
	                    break
	            else:
	                group[name].append(spec)
	        else:
	            group[name].append(spec)

	        prev_line = ''

	    def handle_line(line):
	        """Buffer, extend or flush the pending command for one input line."""
	        nonlocal prev_line
	        # Sometimes people like writing `opt = val`
	        # !!! Might change regexes but we don't care
	        content = line.strip().replace(' = ', '=')

	        # Empty line or comment
	        if not content or content[0] == '"':
	            return
	        # Line continuation
	        if content[0] == '\\':
	            if prev_line:
	                if not prev_line.endswith(','):
	                    prev_line += ' '
	                prev_line += content[1:].strip()
	            return

	        parse_prev_line()

	        if content.startswith('syn'):
	            prev_line = content

	    with open(f) as fp:
	        for line in fp:
	            handle_line(line)
	    # Flush the command that was still pending at end of file.
	    parse_prev_line()

	    return syntax

	def dereference_clusters(syntax):
	    """Replace ``@cluster`` references by the groups they stand for.

	    Works in place on the structure returned by ``load_syn``:

	    * every cluster that mentions other clusters is replaced by the sorted
	      list of plain group names reachable through those references;
	    * every ``contains`` list of a match or region that mentions clusters is
	      replaced likewise.

	    References to unknown clusters are dropped.  Self-referencing or
	    mutually referencing clusters are tolerated: a cluster already being
	    expanded is not entered again.  Lists without any ``@`` reference are
	    left untouched.

	    Args:
	        syntax: Dict with at least the keys ``'cluster'``, ``'match'`` and
	            ``'region'``.

	    Returns:
	        None.
	    """
	    clusters = syntax['cluster']

	    def has_refs(members):
	        """Tell whether *members* mentions at least one cluster."""
	        return any(member.startswith('@') for member in members)

	    def expand(members, seen):
	        """Return the plain group names reachable from *members*."""
	        groups = set()
	        for member in members:
	            if not member.startswith('@'):
	                groups.add(member)
	                continue
	            ref = member[1:]
	            # `seen` holds the clusters on the current expansion path, so
	            # cyclic definitions cannot recurse forever.
	            if ref in clusters and ref not in seen:
	                groups |= expand(clusters[ref], seen | {ref})
	        return groups

	    # Compute every expansion from the original definitions before storing
	    # any of them.
	    expanded = {
	        name: sorted(expand(members, {name}))
	        for name, members in clusters.items()
	        if has_refs(members)
	    }
	    clusters.update(expanded)

	    for kind in ('match', 'region'):
	        for specs in syntax[kind].values():
	            for spec in specs:
	                members = spec.get('contains')
	                if members and has_refs(members):
	                    spec['contains'] = sorted(expand(members, frozenset()))


	def main(*files):
	    """Report matches that contain a transparent region, file by file.

	    Each file is loaded with ``load_syn`` and its cluster references are
	    resolved with ``dereference_clusters``.  For every ``contains`` member
	    of a match that names a region with at least one ``transparent`` spec,
	    one line is printed to standard output.
	    """
	    for f in files:
	        syntax = load_syn(f)
	        dereference_clusters(syntax)
	        regions = syntax['region']

	        # Debug aid: print(json.dumps(syntax, indent=2, ensure_ascii=False))

	        for matchname, matches in syntax['match'].items():
	            for match in matches:
	                for member in match.get('contains') or ():
	                    if member not in regions:
	                        continue
	                    if any(r.get('transparent') for r in regions[member]):
	                        print(f'{f}: match {matchname} contains transparent region {member}')

	        # NOTE: the same check for regions containing transparent regions
	        # (reported as 'region ... contains transparent region ...') is
	        # currently disabled.


	# Usage: python syntree.py *.vim 2>/dev/null
	# (redirecting stderr hides the messages written by error()).
	if __name__ == '__main__':
	    main(*sys.argv[1:])