Created
April 1, 2019 16:18
-
-
Save gwenzek/800e29521a0fa926aec0cd9db60ce6b6 to your computer and use it in GitHub Desktop.
Flexible Json decoder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Implementation of a flexible JSON decoder.

Usage:

    import json
    from json_decoder import FlexibleJSONDecoder
    j = json.loads(raw, cls=FlexibleJSONDecoder)
"""
import json | |
import re | |
from json import scanner, JSONDecodeError | |
try: | |
from _json import scanstring as c_scanstring | |
except ImportError: | |
c_scanstring = None | |
# Flags applied to every regular expression compiled in this module.
FLAGS = re.DOTALL | re.MULTILINE | re.VERBOSE

# Float objects for the non-finite values named in _CONSTANTS below.
NaN, PosInf, NegInf = float('nan'), float('inf'), float('-inf')

# Maps the spelling of each non-finite literal to its float value.
_CONSTANTS = {'-Infinity': NegInf, 'Infinity': PosInf, 'NaN': NaN}

# Group 1: the shortest run of characters up to group 2.
# Group 2: a double quote, a backslash, or a control character (0x00-0x1f).
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', FLAGS)

# Single-character escapes: the character after the backslash -> its value.
BACKSLASH = {
    '"': '"',
    '\\': '\\',
    '/': '/',
    'b': '\b',
    'f': '\f',
    'n': '\n',
    'r': '\r',
    't': '\t',
}
def _decode_uXXXX(s, pos):
    r"""Return the integer encoded by the four hex digits following ``s[pos]``.

    ``pos`` is the index of the ``u`` in a ``\uXXXX`` escape, so the digits
    are read from ``s[pos + 1:pos + 5]``.

    Raises JSONDecodeError (reported at ``pos``) when fewer than four
    characters remain or when any of them is not an ASCII hex digit.
    """
    esc = s[pos + 1:pos + 5]
    # int(esc, 16) on its own is too lenient: it also accepts a sign,
    # surrounding whitespace, underscores, a "0x" prefix and non-ASCII
    # digits, none of which are valid here. Check each character explicitly.
    if len(esc) == 4 and all(c in '0123456789abcdefABCDEF' for c in esc):
        return int(esc, 16)
    msg = "Invalid \\uXXXX escape"
    raise JSONDecodeError(msg, s, pos)
def py_scanstring(s, end, strict=True,
                  _b=BACKSLASH, _m=STRINGCHUNK.match):
    """Decode the JSON string literal whose body starts at index ``end`` of ``s``.

    ``end`` is the index just after the opening double quote. Escape
    sequences are replaced by the characters they stand for. When ``strict``
    is true a literal control character inside the string is an error;
    otherwise it is kept as-is.

    Returns ``(decoded, index)`` where ``index`` is the position just after
    the closing quote.

    Raises JSONDecodeError for an unterminated string, an unknown escape,
    a malformed unicode escape, or (in strict mode) a control character.
    """
    start = end - 1  # index of the opening quote, used in error reports
    pieces = []
    add_piece = pieces.append
    while True:
        match = _m(s, end)
        if match is None:
            raise JSONDecodeError("Unterminated string starting at", s, start)
        end = match.end()
        literal, stop = match.groups()
        # `literal` is the (possibly empty) run of plain characters.
        if literal:
            add_piece(literal)
        # `stop` is the closing quote, a backslash opening an escape,
        # or a literal control character.
        if stop == '"':
            break
        if stop != '\\':
            if strict:
                msg = "Invalid control character {0!r} at".format(stop)
                raise JSONDecodeError(msg, s, end)
            add_piece(stop)
            continue
        try:
            esc = s[end]
        except IndexError:
            raise JSONDecodeError("Unterminated string starting at",
                                  s, start) from None
        if esc == 'u':
            code = _decode_uXXXX(s, end)
            end += 5
            # A high surrogate immediately followed by another unicode
            # escape holding a low surrogate is merged into one code point.
            if 0xd800 <= code <= 0xdbff and s[end:end + 2] == '\\u':
                low = _decode_uXXXX(s, end + 1)
                if 0xdc00 <= low <= 0xdfff:
                    code = 0x10000 + (((code - 0xd800) << 10) | (low - 0xdc00))
                    end += 6
            char = chr(code)
        else:
            # Any other escape must be present in the lookup table.
            try:
                char = _b[esc]
            except KeyError:
                msg = "Invalid \\escape: {0!r}".format(esc)
                raise JSONDecodeError(msg, s, end)
            end += 1
        add_piece(char)
    return ''.join(pieces), end
# Use speedup if available
scanstring = c_scanstring or py_scanstring

# Filler allowed between tokens: JSON whitespace and `//` line comments.
# A comment runs up to, but not including, the next newline (the newline is
# then consumed as ordinary whitespace), so a comment on the very last line
# of the input is skipped even when no newline follows it.
WHITESPACE = re.compile(r'([ \t\n\r]|//[^\n]*)*', FLAGS)
# Characters that may begin such filler; used as a cheap pre-check before
# running the regex.
WHITESPACE_STR = ' \t\n\r/'
def FlexibleJSONObject(s_and_end, strict, scan_once, object_hook, object_pairs_hook,
                       memo=None, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON object, tolerating a trailing comma before ``}``.

    ``s_and_end`` is ``(s, end)`` where ``end`` is the index just after the
    opening ``{``. ``strict`` is forwarded to ``scanstring`` for the keys,
    and ``scan_once(s, idx)`` parses one value. ``memo`` is used to reuse
    equal key strings.

    Returns ``(obj, index)`` where ``index`` is the position just after the
    closing ``}``. ``obj`` is ``object_pairs_hook(pairs)`` when that hook is
    given; otherwise a dict, passed through ``object_hook`` when given.

    Raises JSONDecodeError on a missing key, ``:`` or ``,`` delimiter, or
    value.
    """
    s, end = s_and_end
    pairs = []
    pairs_append = pairs.append
    # Backwards compatibility
    if memo is None:
        memo = {}
    memo_get = memo.setdefault
    # Slicing (rather than indexing) yields '' at end of input instead of
    # raising IndexError; the checks below then report a precise error.
    nextchar = s[end:end + 1]
    # Normally we expect nextchar == '"'
    if nextchar != '"':
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        # Trivial empty object
        if nextchar == '}':
            if object_pairs_hook is not None:
                result = object_pairs_hook(pairs)
                return result, end + 1
            pairs = {}
            if object_hook is not None:
                pairs = object_hook(pairs)
            return pairs, end + 1
        elif nextchar != '"':
            raise JSONDecodeError(
                "Expecting property name enclosed in double quotes", s, end)
    end += 1
    while True:
        key, end = scanstring(s, end, strict)
        key = memo_get(key, key)
        # Fast path: the separator directly follows the key.
        if s[end:end + 1] != ':':
            end = _w(s, end).end()
            if s[end:end + 1] != ':':
                raise JSONDecodeError("Expecting ':' delimiter", s, end)
        end += 1

        # Skip filler before the value. The match must start AT `end`, not
        # `end + 1`: `_ws` includes '/', so s[end] may be the first slash of
        # a comment (which has to be matched whole) or a stray '/' (which
        # must be left in place so that it is reported as an error).
        if s[end:end + 1] in _ws:
            end = _w(s, end).end()

        try:
            value, end = scan_once(s, end)
        except StopIteration as err:
            raise JSONDecodeError("Expecting value", s, err.value) from None
        pairs_append((key, value))

        # Skip filler after the value (same start-at-`end` rule as above).
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        end += 1

        if nextchar == '}':
            break
        elif nextchar != ',':
            raise JSONDecodeError("Expecting ',' delimiter", s, end - 1)
        end = _w(s, end).end()
        nextchar = s[end:end + 1]
        end += 1
        # A '}' right after the comma is a trailing comma: accept it.
        if nextchar == '}':
            break
        elif nextchar != '"':
            raise JSONDecodeError(
                "Expecting property name enclosed in double quotes", s, end - 1)
    if object_pairs_hook is not None:
        result = object_pairs_hook(pairs)
        return result, end
    pairs = dict(pairs)
    if object_hook is not None:
        pairs = object_hook(pairs)
    return pairs, end
def FlexibleJSONArray(s_and_end, scan_once, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
    """Parse a JSON array, tolerating a trailing comma before ``]``.

    ``s_and_end`` is ``(s, end)`` where ``end`` is the index just after the
    opening ``[``. ``scan_once(s, idx)`` parses one value and raises
    ``StopIteration(idx)`` when no value starts at ``idx``.

    Returns ``(values, index)`` where ``index`` is the position just after
    the closing ``]``.

    Raises JSONDecodeError on a missing value or ``,`` delimiter, including
    when the input ends before the array is closed.
    """
    s, end = s_and_end
    values = []
    # Filler is always matched starting AT `end`, not `end + 1`: `_ws`
    # includes '/', so s[end] may be the first slash of a comment (which has
    # to be matched whole) or a stray '/' (which must stay in place so that
    # it is reported as an error rather than silently skipped).
    nextchar = s[end:end + 1]
    if nextchar in _ws:
        end = _w(s, end).end()
        nextchar = s[end:end + 1]
    # Look-ahead for trivial empty array
    if nextchar == ']':
        return values, end + 1
    _append = values.append
    while True:
        try:
            value, end = scan_once(s, end)
        except StopIteration as err:
            # No value here. If the array closes at this point the previous
            # comma was a trailing comma. The slice avoids an IndexError
            # when the input ends right here.
            if s[err.value:err.value + 1] == ']':
                return values, err.value + 1
            raise JSONDecodeError("Expecting value", s, err.value) from None
        _append(value)
        nextchar = s[end:end + 1]
        if nextchar in _ws:
            end = _w(s, end).end()
            nextchar = s[end:end + 1]
        end += 1
        if nextchar == ']':
            break
        elif nextchar != ',':
            raise JSONDecodeError("Expecting ',' delimiter", s, end - 1)
        # Skip filler between the comma and the next value.
        if s[end:end + 1] in _ws:
            end = _w(s, end).end()
    return values, end
class FlexibleJSONDecoder(json.JSONDecoder):
    """JSON decoder that allows trailing commas and ``//`` line comments.

    Intended to be passed as ``cls`` to ``json.loads``. It accepts the same
    keyword options as ``json.JSONDecoder`` and forwards them unchanged.
    """

    def __init__(self, **kwargs):
        """Set up the base decoder, then install the flexible parsers.

        ``kwargs`` are passed to ``json.JSONDecoder.__init__``, so options
        given to ``json.loads`` alongside ``cls=FlexibleJSONDecoder`` are
        accepted instead of raising TypeError.
        """
        super().__init__(**kwargs)
        self.parse_object = FlexibleJSONObject
        self.parse_array = FlexibleJSONArray
        # Rebuild the scanner with the pure-Python factory so that it is
        # created after parse_object / parse_array have been replaced.
        self.scan_once = scanner.py_make_scanner(self)

    def decode(self, s, _w=WHITESPACE.match):
        """Return the Python representation of ``s`` (a ``str`` instance
        containing a JSON document).

        Leading and trailing filler matched by ``WHITESPACE`` is skipped.
        Raises JSONDecodeError("Extra data") if anything else follows the
        document.
        """
        obj, end = self.raw_decode(s, idx=_w(s, 0).end())
        end = _w(s, end).end()
        if end != len(s):
            raise JSONDecodeError("Extra data", s, end)
        return obj

    def raw_decode(self, s, idx=0):
        """Decode a JSON document from ``s`` (a ``str`` beginning with
        a JSON document) and return a 2-tuple of the Python
        representation and the index in ``s`` where the document ended.

        This can be used to decode a JSON document from a string that may
        have extraneous data at the end.

        Raises JSONDecodeError("Expecting value") when no value starts at
        ``idx``.
        """
        try:
            obj, end = self.scan_once(s, idx)
        except StopIteration as err:
            raise JSONDecodeError("Expecting value", s, err.value) from None
        return obj, end
if __name__ == '__main__':
    def test(raw, expected):
        """Decode ``raw`` with FlexibleJSONDecoder and assert it equals ``expected``."""
        decoded = json.loads(raw, cls=FlexibleJSONDecoder)
        assert decoded == expected

    # (input text, expected decoded value), run in order.
    cases = [
        ('{"foo": "bar"}', {"foo": "bar"}),
        ('{"foo": "bar", }', {"foo": "bar"}),
        ('["foo", "bar"]', ["foo", "bar"]),
        ('["foo", "bar", ]', ["foo", "bar"]),
        ('["foo", {"bar": "baz"}]', ["foo", {"bar": "baz"}]),
        ('["foo", {"bar": "baz",},]', ["foo", {"bar": "baz"}]),
        ('''[
"foo",
// Note the comment and the trailing comma
"bar",
]''', ["foo", "bar"]),
    ]
    for raw, expected in cases:
        test(raw, expected)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment