gwenzek · April 1, 2019 16:18
diff --git a/flexible_json_decoder.py b/flexible_json_decoder.py
 """Implementation of a flexible JSON decoder

 import json
 from json_decoder import FlexibleJSONDecoder

 j = json.loads(raw, cls=FlexibleJSONDecoder)
 """
 import json
 import re

 from json import scanner, JSONDecodeError
 try:
    from _json import scanstring as c_scanstring
 except ImportError:
    c_scanstring = None

 # Flags passed to the STRINGCHUNK and WHITESPACE patterns of this module.
 FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL

 # Non-finite float values.
 # NOTE(review): these three names and _CONSTANTS are not referenced in the
 # visible part of this file -- confirm whether they are still needed.
 NaN = float('nan')
 PosInf = float('inf')
 NegInf = float('-inf')

 # Maps the textual spelling of each non-finite constant to its float value.
 _CONSTANTS = {
    '-Infinity': NegInf,
    'Infinity': PosInf,
    'NaN': NaN,
 }


 # Lazily matches a run of characters (group 1) up to the next double quote,
 # backslash or control character in the range 0x00-0x1f (group 2).
 STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
 # Single-character backslash escapes and the character each one stands for.
 BACKSLASH = {
    '"': '"', '\\': '\\', '/': '/',
    'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
 }

 def _decode_uXXXX(s, pos):
    """Decode the four hex digits of a ``\\uXXXX`` escape.

    ``pos`` is the index of the ``u`` in ``s``; the digits are read from
    ``s[pos + 1:pos + 5]``.

    Returns the decoded value as an int.

    Raises JSONDecodeError (reported at ``pos``) if fewer than four
    characters are available or any of them is not an ASCII hex digit.
    """
    esc = s[pos + 1:pos + 5]
    # int(..., 16) on its own is too lenient for JSON: it also accepts a
    # sign, surrounding whitespace, "_" separators and non-ASCII digits.
    # Check every character explicitly before converting.
    if len(esc) == 4 and all(c in '0123456789abcdefABCDEF' for c in esc):
        return int(esc, 16)
    msg = "Invalid \\uXXXX escape"
    raise JSONDecodeError(msg, s, pos)

 def py_scanstring(s, end, strict=True,
        _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Decode the JSON string literal in ``s`` whose body starts at ``end``.

    ``end`` is the index just past the opening quote.  Escape sequences are
    translated while the string is collected chunk by chunk.  When
    ``strict`` is false, literal control characters are kept instead of
    being rejected.

    Returns ``(decoded_text, index_after_closing_quote)``.

    Raises JSONDecodeError (a ValueError) for an unterminated string, an
    unknown or malformed escape, or -- in strict mode -- a literal control
    character.
    """
    pieces = []
    opening_quote = end - 1
    while True:
        match = _m(s, end)
        if match is None:
            raise JSONDecodeError(
                "Unterminated string starting at", s, opening_quote)
        end = match.end()
        text, stop = match.groups()
        # Plain characters that preceded the stop character, if any.
        if text:
            pieces.append(text)
        # The stop character is the closing quote, a backslash opening an
        # escape sequence, or a literal control character.
        if stop == '"':
            return ''.join(pieces), end
        if stop != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(stop)
                raise JSONDecodeError(msg, s, end)
            pieces.append(stop)
            continue
        try:
            esc = s[end]
        except IndexError:
            raise JSONDecodeError("Unterminated string starting at",
                                  s, opening_quote) from None
        if esc == 'u':
            code = _decode_uXXXX(s, end)
            end += 5
            # A high surrogate directly followed by an escaped low surrogate
            # is merged into a single code point.
            if 0xd800 <= code <= 0xdbff and s[end:end + 2] == '\\u':
                low = _decode_uXXXX(s, end + 1)
                if 0xdc00 <= low <= 0xdfff:
                    code = 0x10000 + (((code - 0xd800) << 10) | (low - 0xdc00))
                    end += 6
            decoded = chr(code)
        else:
            # Any other escape must be one of the single-character escapes.
            try:
                decoded = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: {0!r}".format(esc)
                raise JSONDecodeError(msg, s, end)
            end += 1
        pieces.append(decoded)


 # Use speedup if available
 scanstring = c_scanstring or py_scanstring

 # Insignificant input: JSON whitespace and ``//`` line comments.  A comment
 # runs up to -- but does not include -- the next newline, so a comment on
 # the last line of the input is recognised even without a final newline.
 WHITESPACE = re.compile(r'([ \t\n\r]|//[^\n]*)*', FLAGS)
 # Characters that may start a WHITESPACE match; '/' is the start of a comment.
 WHITESPACE_STR = ' \t\n\r/'


 def FlexibleJSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
               memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON object, allowing ``//`` comments and a trailing comma.

    Installed as ``parse_object`` on FlexibleJSONDecoder.

    Args:
        s_and_end: ``(s, end)`` pair; ``end`` is the index just past the
            opening ``{``.
        strict: forwarded to ``scanstring`` when decoding keys.
        scan_once: callable ``(s, idx) -> (value, end)`` used to decode each
            member value; it raises StopIteration(idx) when no value starts
            at ``idx``.
        object_hook: if not None, called with the finished dict; its result
            is returned instead of the dict.
        object_pairs_hook: if not None, called with the list of
            ``(key, value)`` pairs; takes priority over ``object_hook``.
        memo: optional dict used to share equal key strings.

    Returns:
        ``(obj, end)`` where ``end`` is the index just past the closing ``}``.

    Raises:
        JSONDecodeError: if a key, ``:``, value, ``,`` or ``}`` is missing.
    """
    s, end = s_and_end
    pairs = []
    pairs_append = pairs.append
    # Backwards compatibility
    if memo is None:
        memo = {}
    memo_get = memo.setdefault
    # Slicing (rather than indexing) yields '' at end of input, so the checks
    # below raise a JSONDecodeError instead of an IndexError.
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            if object_pairs_hook is not None:
                result = object_pairs_hook(pairs)
                return result, end + 1
            pairs = {}
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise JSONDecodeError(
                "Expecting property name enclosed in double quotes", s, end)
    end += 1
    while True:
        key, end = scanstring(s, end, strict)
        key = memo_get(key, key)
        # Fast path: the separator directly follows the key.
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise JSONDecodeError("Expecting ':' delimiter", s, end)
        end += 1

        # Skip whitespace/comments before the value.  The match has to start
        # at ``end`` itself: '/' is in _ws, so starting one character later
        # would drop the first slash of a comment that directly follows ':'.
        if s[end:end + 1] in _ws:
            end = _w(s, end).end()

        try:
            value, end = scan_once(s, end)
        except StopIteration as err:
            raise JSONDecodeError("Expecting value", s, err.value) from None
        pairs_append((key, value))

        # Skip whitespace/comments after the value (same reasoning as above).
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            raise JSONDecodeError("Expecting ',' delimiter", s, end - 1)
        # After a comma either the next key or -- for a trailing comma -- the
        # closing brace may follow.
        end = _w(s, end).end()
        nextchar = s[end:end + 1]
        end += 1
        if nextchar == '}':
            break
        elif nextchar != '"':
            raise JSONDecodeError(
                "Expecting property name enclosed in double quotes", s, end - 1)
    if object_pairs_hook is not None:
        result = object_pairs_hook(pairs)
        return result, end
    pairs = dict(pairs)
    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end


 def FlexibleJSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON array, allowing ``//`` comments and a trailing comma.

    Installed as ``parse_array`` on FlexibleJSONDecoder.

    Args:
        s_and_end: ``(s, end)`` pair; ``end`` is the index just past the
            opening ``[``.
        scan_once: callable ``(s, idx) -> (value, end)`` used to decode each
            element; it raises StopIteration(idx) when no value starts at
            ``idx``.

    Returns:
        ``(values, end)`` where ``values`` is a list and ``end`` is the index
        just past the closing ``]``.

    Raises:
        JSONDecodeError: if a value, ``,`` or ``]`` is missing.
    """
    s, end = s_and_end
    values = []
    # Whitespace matches start at ``end`` itself: '/' is in _ws, so starting
    # one character later would drop the first slash of a comment that
    # directly follows '[', ',' or a value.
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration as err:
            # No value here: a closing bracket means the previous comma was
            # a trailing one.  The slice keeps a truncated input such as
            # '["foo",' from raising IndexError instead of JSONDecodeError.
            if s[err.value:err.value + 1] == ']':
                return values, err.value + 1
            raise JSONDecodeError("Expecting value", s, err.value) from None
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            raise JSONDecodeError("Expecting ',' delimiter", s, end - 1)
        if s[end:end + 1] in _ws:
            end = _w(s, end).end()

    return values, end


 class FlexibleJSONDecoder(json.JSONDecoder):
    """JSON decoder that allows trailing commas and ``//`` line comments.

    Accepts the same keyword arguments as ``json.JSONDecoder``
    (``object_hook``, ``parse_float``, ``strict``, ...), so it can be used as
    ``json.loads(raw, cls=FlexibleJSONDecoder, object_hook=...)``.
    """

    def __init__(self, **kwargs):
        """Initialise the base decoder, then plug in the flexible parsers.

        ``kwargs`` are forwarded unchanged to ``json.JSONDecoder.__init__``.
        """
        super().__init__(**kwargs)
        self.parse_object = FlexibleJSONObject
        self.parse_array = FlexibleJSONArray
        # Build the scanner only after the parsers have been replaced so
        # that it picks up the flexible versions.
        self.scan_once = scanner.py_make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` instance
        containing a JSON document).

        Leading and trailing whitespace/comments are skipped.

        Raises JSONDecodeError("Extra data") if anything else follows the
        document.
        """
        obj, end = self.raw_decode(s, idx=_w(s, 0).end())
        end = _w(s, end).end()
        if end != len(s):
            raise JSONDecodeError("Extra data", s, end)
        return obj

    def raw_decode(self, s, idx=0):
        """Decode a JSON document from ``s`` (a ``str`` beginning with
        a JSON document) and return a 2-tuple of the Python
        representation and the index in ``s`` where the document ended.

        This can be used to decode a JSON document from a string that may
        have extraneous data at the end.

        Raises JSONDecodeError("Expecting value") if no value starts at
        ``idx``.
        """
        try:
            obj, end = self.scan_once(s, idx)
        except StopIteration as err:
            raise JSONDecodeError("Expecting value", s, err.value) from None
        return obj, end

      
 if __name__ == '__main__':
    def test(raw, expected):
        """Check that ``raw`` decodes to ``expected`` with FlexibleJSONDecoder."""
        assert json.loads(raw, cls=FlexibleJSONDecoder) == expected

    # Plain documents next to variants with trailing commas, plus one
    # document that contains a // comment.
    cases = [
        ('{"foo": "bar"}', {"foo": "bar"}),
        ('{"foo": "bar", }', {"foo": "bar"}),
        ('["foo", "bar"]', ["foo", "bar"]),
        ('["foo", "bar", ]', ["foo", "bar"]),
        ('["foo", {"bar": "baz"}]', ["foo", {"bar": "baz"}]),
        ('["foo", {"bar": "baz",},]', ["foo", {"bar": "baz"}]),
        ('''[
        "foo",
        // Note the comment and the trailing comma
        "bar",
    ]''', ["foo", "bar"]),
    ]
    for raw, expected in cases:
        test(raw, expected)
	"""Implementation of a flexible JSON decoder

	import json
	from json_decoder import FlexibleJSONDecoder

	j = json.loads(raw, cls=FlexibleJSONDecoder)
	"""
	import json
	import re

	from json import scanner, JSONDecodeError
	# Optional C implementation of the string scanner; None when unavailable.
	try:
	    from _json import scanstring as c_scanstring
	except ImportError:
	    c_scanstring = None

	# Flags passed to the regular expressions compiled with FLAGS.
	FLAGS = re.VERBOSE | re.MULTILINE | re.DOTALL

	# Non-finite float values.
	NaN = float('nan')
	PosInf = float('inf')
	NegInf = float('-inf')

	# Maps the textual spelling of each non-finite constant to its float value.
	_CONSTANTS = {
	    '-Infinity': NegInf,
	    'Infinity': PosInf,
	    'NaN': NaN,
	}


	# Lazily matches a run of characters (group 1) up to the next double quote,
	# backslash or control character in the range 0x00-0x1f (group 2).
	STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)
	# Single-character backslash escapes and the character each one stands for.
	BACKSLASH = {
	    '"': '"', '\\': '\\', '/': '/',
	    'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
	}

	def _decode_uXXXX(s, pos):
	    """Decode the four hex digits of a ``\\uXXXX`` escape.

	    ``pos`` is the index of the ``u`` in ``s``; the digits are read from
	    ``s[pos + 1:pos + 5]``.

	    Returns the decoded value as an int.

	    Raises JSONDecodeError (reported at ``pos``) if fewer than four
	    characters are available or any of them is not an ASCII hex digit.
	    """
	    esc = s[pos + 1:pos + 5]
	    # int(..., 16) on its own is too lenient for JSON: it also accepts a
	    # sign, surrounding whitespace, "_" separators and non-ASCII digits.
	    # Check every character explicitly before converting.
	    if len(esc) == 4 and all(c in '0123456789abcdefABCDEF' for c in esc):
	        return int(esc, 16)
	    msg = "Invalid \\uXXXX escape"
	    raise JSONDecodeError(msg, s, pos)

	def py_scanstring(s, end, strict=True,
	        _b=BACKSLASH, _m=STRINGCHUNK.match):
	    """Decode the JSON string literal in ``s`` whose body starts at ``end``.

	    ``end`` is the index just past the opening quote.  Escape sequences are
	    translated while the string is collected chunk by chunk.  When
	    ``strict`` is false, literal control characters are kept instead of
	    being rejected.

	    Returns ``(decoded_text, index_after_closing_quote)``.

	    Raises JSONDecodeError (a ValueError) for an unterminated string, an
	    unknown or malformed escape, or -- in strict mode -- a literal control
	    character.
	    """
	    pieces = []
	    opening_quote = end - 1
	    while True:
	        match = _m(s, end)
	        if match is None:
	            raise JSONDecodeError(
	                "Unterminated string starting at", s, opening_quote)
	        end = match.end()
	        text, stop = match.groups()
	        # Plain characters that preceded the stop character, if any.
	        if text:
	            pieces.append(text)
	        # The stop character is the closing quote, a backslash opening an
	        # escape sequence, or a literal control character.
	        if stop == '"':
	            return ''.join(pieces), end
	        if stop != '\\':
	            if strict:
	                msg = "Invalid control character {0!r} at".format(stop)
	                raise JSONDecodeError(msg, s, end)
	            pieces.append(stop)
	            continue
	        try:
	            esc = s[end]
	        except IndexError:
	            raise JSONDecodeError("Unterminated string starting at",
	                                  s, opening_quote) from None
	        if esc == 'u':
	            code = _decode_uXXXX(s, end)
	            end += 5
	            # A high surrogate directly followed by an escaped low surrogate
	            # is merged into a single code point.
	            if 0xd800 <= code <= 0xdbff and s[end:end + 2] == '\\u':
	                low = _decode_uXXXX(s, end + 1)
	                if 0xdc00 <= low <= 0xdfff:
	                    code = 0x10000 + (((code - 0xd800) << 10) | (low - 0xdc00))
	                    end += 6
	            decoded = chr(code)
	        else:
	            # Any other escape must be one of the single-character escapes.
	            try:
	                decoded = _b[esc]
	            except KeyError:
	                msg = "Invalid \\escape: {0!r}".format(esc)
	                raise JSONDecodeError(msg, s, end)
	            end += 1
	        pieces.append(decoded)


	# Use speedup if available
	scanstring = c_scanstring or py_scanstring

	# Insignificant input: JSON whitespace and ``//`` line comments.  A comment
	# runs up to -- but does not include -- the next newline, so a comment on
	# the last line of the input is recognised even without a final newline.
	WHITESPACE = re.compile(r'([ \t\n\r]|//[^\n]*)*', FLAGS)
	# Characters that may start a WHITESPACE match; '/' is the start of a comment.
	WHITESPACE_STR = ' \t\n\r/'


	def FlexibleJSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
	               memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
	    """Parse a JSON object, allowing ``//`` comments and a trailing comma.

	    Installed as ``parse_object`` on FlexibleJSONDecoder.

	    Args:
	        s_and_end: ``(s, end)`` pair; ``end`` is the index just past the
	            opening ``{``.
	        strict: forwarded to ``scanstring`` when decoding keys.
	        scan_once: callable ``(s, idx) -> (value, end)`` used to decode each
	            member value; it raises StopIteration(idx) when no value starts
	            at ``idx``.
	        object_hook: if not None, called with the finished dict; its result
	            is returned instead of the dict.
	        object_pairs_hook: if not None, called with the list of
	            ``(key, value)`` pairs; takes priority over ``object_hook``.
	        memo: optional dict used to share equal key strings.

	    Returns:
	        ``(obj, end)`` where ``end`` is the index just past the closing ``}``.

	    Raises:
	        JSONDecodeError: if a key, ``:``, value, ``,`` or ``}`` is missing.
	    """
	    s, end = s_and_end
	    pairs = []
	    pairs_append = pairs.append
	    # Backwards compatibility
	    if memo is None:
	        memo = {}
	    memo_get = memo.setdefault
	    # Slicing (rather than indexing) yields '' at end of input, so the checks
	    # below raise a JSONDecodeError instead of an IndexError.
	    nextchar = s[end:end + 1]
	    # Normally we expect nextchar == '"'
	    if nextchar != '"':
	        if nextchar in _ws:
	            end = _w(s, end).end()
	            nextchar = s[end:end + 1]
	        # Trivial empty object
	        if nextchar == '}':
	            if object_pairs_hook is not None:
	                result = object_pairs_hook(pairs)
	                return result, end + 1
	            pairs = {}
	            if object_hook is not None:
	                pairs = object_hook(pairs)
	            return pairs, end + 1
	        elif nextchar != '"':
	            raise JSONDecodeError(
	                "Expecting property name enclosed in double quotes", s, end)
	    end += 1
	    while True:
	        key, end = scanstring(s, end, strict)
	        key = memo_get(key, key)
	        # Fast path: the separator directly follows the key.
	        if s[end:end + 1] != ':':
	            end = _w(s, end).end()
	            if s[end:end + 1] != ':':
	                raise JSONDecodeError("Expecting ':' delimiter", s, end)
	        end += 1

	        # Skip whitespace/comments before the value.  The match has to start
	        # at ``end`` itself: '/' is in _ws, so starting one character later
	        # would drop the first slash of a comment that directly follows ':'.
	        if s[end:end + 1] in _ws:
	            end = _w(s, end).end()

	        try:
	            value, end = scan_once(s, end)
	        except StopIteration as err:
	            raise JSONDecodeError("Expecting value", s, err.value) from None
	        pairs_append((key, value))

	        # Skip whitespace/comments after the value (same reasoning as above).
	        nextchar = s[end:end + 1]
	        if nextchar in _ws:
	            end = _w(s, end).end()
	            nextchar = s[end:end + 1]
	        end += 1

	        if nextchar == '}':
	            break
	        elif nextchar != ',':
	            raise JSONDecodeError("Expecting ',' delimiter", s, end - 1)
	        # After a comma either the next key or -- for a trailing comma -- the
	        # closing brace may follow.
	        end = _w(s, end).end()
	        nextchar = s[end:end + 1]
	        end += 1
	        if nextchar == '}':
	            break
	        elif nextchar != '"':
	            raise JSONDecodeError(
	                "Expecting property name enclosed in double quotes", s, end - 1)
	    if object_pairs_hook is not None:
	        result = object_pairs_hook(pairs)
	        return result, end
	    pairs = dict(pairs)
	    if object_hook is not None:
	        pairs = object_hook(pairs)
	    return pairs, end


	def FlexibleJSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
	    """Parse a JSON array, allowing ``//`` comments and a trailing comma.

	    Installed as ``parse_array`` on FlexibleJSONDecoder.

	    Args:
	        s_and_end: ``(s, end)`` pair; ``end`` is the index just past the
	            opening ``[``.
	        scan_once: callable ``(s, idx) -> (value, end)`` used to decode each
	            element; it raises StopIteration(idx) when no value starts at
	            ``idx``.

	    Returns:
	        ``(values, end)`` where ``values`` is a list and ``end`` is the index
	        just past the closing ``]``.

	    Raises:
	        JSONDecodeError: if a value, ``,`` or ``]`` is missing.
	    """
	    s, end = s_and_end
	    values = []
	    # Whitespace matches start at ``end`` itself: '/' is in _ws, so starting
	    # one character later would drop the first slash of a comment that
	    # directly follows '[', ',' or a value.
	    nextchar = s[end:end + 1]
	    if nextchar in _ws:
	        end = _w(s, end).end()
	        nextchar = s[end:end + 1]
	    # Look-ahead for trivial empty array
	    if nextchar == ']':
	        return values, end + 1
	    _append = values.append
	    while True:
	        try:
	            value, end = scan_once(s, end)
	        except StopIteration as err:
	            # No value here: a closing bracket means the previous comma was
	            # a trailing one.  The slice keeps a truncated input such as
	            # '["foo",' from raising IndexError instead of JSONDecodeError.
	            if s[err.value:err.value + 1] == ']':
	                return values, err.value + 1
	            raise JSONDecodeError("Expecting value", s, err.value) from None
	        _append(value)
	        nextchar = s[end:end + 1]
	        if nextchar in _ws:
	            end = _w(s, end).end()
	            nextchar = s[end:end + 1]
	        end += 1
	        if nextchar == ']':
	            break
	        elif nextchar != ',':
	            raise JSONDecodeError("Expecting ',' delimiter", s, end - 1)
	        if s[end:end + 1] in _ws:
	            end = _w(s, end).end()

	    return values, end


	class FlexibleJSONDecoder(json.JSONDecoder):
	    """JSON decoder that allows trailing commas and ``//`` line comments.

	    Accepts the same keyword arguments as ``json.JSONDecoder``
	    (``object_hook``, ``parse_float``, ``strict``, ...), so it can be used as
	    ``json.loads(raw, cls=FlexibleJSONDecoder, object_hook=...)``.
	    """

	    def __init__(self, **kwargs):
	        """Initialise the base decoder, then plug in the flexible parsers.

	        ``kwargs`` are forwarded unchanged to ``json.JSONDecoder.__init__``.
	        """
	        super().__init__(**kwargs)
	        self.parse_object = FlexibleJSONObject
	        self.parse_array = FlexibleJSONArray
	        # Build the scanner only after the parsers have been replaced so
	        # that it picks up the flexible versions.
	        self.scan_once = scanner.py_make_scanner(self)

	    def decode(self, s, _w=WHITESPACE.match):
	        """Return the Python representation of ``s`` (a ``str`` instance
	        containing a JSON document).

	        Leading and trailing whitespace/comments are skipped.

	        Raises JSONDecodeError("Extra data") if anything else follows the
	        document.
	        """
	        obj, end = self.raw_decode(s, idx=_w(s, 0).end())
	        end = _w(s, end).end()
	        if end != len(s):
	            raise JSONDecodeError("Extra data", s, end)
	        return obj

	    def raw_decode(self, s, idx=0):
	        """Decode a JSON document from ``s`` (a ``str`` beginning with
	        a JSON document) and return a 2-tuple of the Python
	        representation and the index in ``s`` where the document ended.

	        This can be used to decode a JSON document from a string that may
	        have extraneous data at the end.

	        Raises JSONDecodeError("Expecting value") if no value starts at
	        ``idx``.
	        """
	        try:
	            obj, end = self.scan_once(s, idx)
	        except StopIteration as err:
	            raise JSONDecodeError("Expecting value", s, err.value) from None
	        return obj, end


	if __name__ == '__main__':
	    def test(raw, expected):
	        """Check that ``raw`` decodes to ``expected`` with FlexibleJSONDecoder."""
	        decoded = json.loads(raw, cls=FlexibleJSONDecoder)
	        assert decoded == expected

	    # Plain documents next to variants with trailing commas, plus one
	    # document that contains a // comment.
	    test('{"foo": "bar"}', {"foo": "bar"})
	    test('{"foo": "bar", }', {"foo": "bar"})
	    test('["foo", "bar"]', ["foo", "bar"])
	    test('["foo", "bar", ]', ["foo", "bar"])
	    test('["foo", {"bar": "baz"}]', ["foo", {"bar": "baz"}])
	    test('["foo", {"bar": "baz",},]', ["foo", {"bar": "baz"}])

	    test('''[
	        "foo",
	        // Note the comment and the trailing comma
	        "bar",
	    ]''', ["foo", "bar"])