#!/usr/bin/env python # -*- coding: utf-8 -*- # r""" A JSON data encoder and decoder. This Python module implements the JSON (http://json.org/) data encoding format; a subset of ECMAScript (aka JavaScript) for encoding primitive data types (numbers, strings, booleans, lists, and associative arrays) in a language-neutral simple text-based syntax. It can encode or decode between JSON formatted strings and native Python data types. Normally you would use the encode() and decode() functions defined by this module, but if you want more control over the processing you can use the JSON class. This implementation tries to be as completely cormforming to all intricacies of the standards as possible. It can operate in strict mode (which only allows JSON-compliant syntax) or a non-strict mode (which allows much more of the whole ECMAScript permitted syntax). This includes complete support for Unicode strings (including surrogate-pairs for non-BMP characters), and all number formats including negative zero and IEEE 754 non-numbers such a NaN or Infinity. The JSON/ECMAScript to Python type mappings are: ---JSON--- ---Python--- null None undefined undefined (note 1) Boolean (true,false) bool (True or False) Integer int or long (note 2) Float float String str or unicode ( "..." or u"..." ) Array [a, ...] list ( [...] ) Object {a:b, ...} dict ( {...} ) -- Note 1. an 'undefined' object is declared in this module which represents the native Python value for this type when in non-strict mode. -- Note 2. some ECMAScript integers may be up-converted to Python floats, such as 1e+40. Also integer -0 is converted to float -0, so as to preserve the sign (which ECMAScript requires). -- Note 3. numbers requiring more significant digits than can be represented by the Python float type will be converted into a Python Decimal type, from the standard 'decimal' module. In addition, when operating in non-strict mode, several IEEE 754 non-numbers are also handled, and are mapped to specific Python objects declared in this module: NaN (not a number) nan (float('nan')) Infinity, +Infinity inf (float('inf')) -Infinity neginf (float('-inf')) When encoding Python objects into JSON, you may use types other than native lists or dictionaries, as long as they support the minimal interfaces required of all sequences or mappings. This means you can use generators and iterators, tuples, UserDict subclasses, etc. To make it easier to produce JSON encoded representations of user defined classes, if the object has a method named json_equivalent(), then it will call that method and attempt to encode the object returned from it instead. It will do this recursively as needed and before any attempt to encode the object using it's default strategies. Note that any json_equivalent() method should return "equivalent" Python objects to be encoded, not an already-encoded JSON-formatted string. There is no such aid provided to decode JSON back into user-defined classes as that would dramatically complicate the interface. When decoding strings with this module it may operate in either strict or non-strict mode. The strict mode only allows syntax which is conforming to RFC 7159 (JSON), while the non-strict allows much more of the permissible ECMAScript syntax. The following are permitted when processing in NON-STRICT mode: * Unicode format control characters are allowed anywhere in the input. * All Unicode line terminator characters are recognized. * All Unicode white space characters are recognized. * The 'undefined' keyword is recognized. * Hexadecimal number literals are recognized (e.g., 0xA6, 0177). * String literals may use either single or double quote marks. * Strings may contain \x (hexadecimal) escape sequences, as well as the \v and \0 escape sequences. * Lists may have omitted (elided) elements, e.g., [,,,,,], with missing elements interpreted as 'undefined' values. * Object properties (dictionary keys) can be of any of the types: string literals, numbers, or identifiers (the later of which are treated as if they are string literals)---as permitted by ECMAScript. JSON only permits strings literals as keys. Concerning non-strict and non-ECMAScript allowances: * Octal numbers: If you allow the 'octal_numbers' behavior (which is never enabled by default), then you can use octal integers and octal character escape sequences (per the ECMAScript standard Annex B.1.2). This behavior is allowed, if enabled, because it was valid JavaScript at one time. * Multi-line string literals: Strings which are more than one line long (contain embedded raw newline characters) are never permitted. This is neither valid JSON nor ECMAScript. Some other JSON implementations may allow this, but this module considers that behavior to be a mistake. References: * JSON (JavaScript Object Notation) * RFC 7159. The application/json Media Type for JavaScript Object Notation (JSON) * ECMA-262 3rd edition (1999) * IEEE 754-1985: Standard for Binary Floating-Point Arithmetic. """ __author__ = "Deron Meranda " __homepage__ = "http://deron.meranda.us/python/demjson/" __date__ = "2015-12-22" __version__ = "2.2.4" __version_info__ = ( 2, 2, 4 ) # Will be converted into a namedtuple below __credits__ = """Copyright (c) 2006-2015 Deron E. Meranda Licensed under GNU LGPL (GNU Lesser General Public License) version 3.0 or later. See LICENSE.txt included with this software. This program is free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this program. If not, see or . """ # ---------------------------------------------------------------------- # Set demjson version try: from collections import namedtuple as _namedtuple __version_info__ = _namedtuple('version_info', ['major', 'minor', 'micro'])( *__version_info__ ) except ImportError: raise ImportError("demjson %s requires a Python 2.6 or later" % __version__ ) version, version_info = __version__, __version_info__ # Determine Python version _py_major, _py_minor = None, None def _get_pyver(): global _py_major, _py_minor import sys vi = sys.version_info try: _py_major, _py_minor = vi.major, vi.minor except AttributeError: _py_major, _py_minor = vi[0], vi[1] _get_pyver() # ---------------------------------------------------------------------- # Useful global constants content_type = 'application/json' file_ext = 'json' class _dummy_context_manager(object): """A context manager that does nothing on entry or exit.""" def __enter__(self): pass def __exit__(self, exc_type, exc_val, exc_tb): return False _dummy_context_manager = _dummy_context_manager() # ---------------------------------------------------------------------- # Decimal and float types. # # If a JSON number can not be stored in a Python float without loosing # precision and the Python has the decimal type, then we will try to # use decimal instead of float. To make this determination we need to # know the limits of the float type, but Python doesn't have an easy # way to tell what the largest floating-point number it supports. So, # we detemine the precision and scale of the float type by testing it. try: # decimal module was introduced in Python 2.4 import decimal except ImportError: decimal = None def determine_float_limits( number_type=float ): """Determines the precision and range of the given float type. The passed in 'number_type' argument should refer to the type of floating-point number. It should either be the built-in 'float', or decimal context or constructor; i.e., one of: # 1. FLOAT TYPE determine_float_limits( float ) # 2. DEFAULT DECIMAL CONTEXT determine_float_limits( decimal.Decimal ) # 3. CUSTOM DECIMAL CONTEXT ctx = decimal.Context( prec=75 ) determine_float_limits( ctx ) Returns a named tuple with components: ( significant_digits, max_exponent, min_exponent ) Where: * significant_digits -- maximum number of *decimal* digits that can be represented without any loss of precision. This is conservative, so if there are 16 1/2 digits, it will return 16, not 17. * max_exponent -- The maximum exponent (power of 10) that can be represented before an overflow (or rounding to infinity) occurs. * min_exponent -- The minimum exponent (negative power of 10) that can be represented before either an underflow (rounding to zero) or a subnormal result (loss of precision) occurs. Note this is conservative, as subnormal numbers are excluded. """ if decimal: numeric_exceptions = (ValueError,decimal.Overflow,decimal.Underflow) else: numeric_exceptions = (ValueError,) if decimal and number_type == decimal.Decimal: number_type = decimal.DefaultContext if decimal and isinstance(number_type, decimal.Context): # Passed a decimal Context, extract the bound creator function. create_num = number_type.create_decimal decimal_ctx = decimal.localcontext(number_type) is_zero_or_subnormal = lambda n: n.is_zero() or n.is_subnormal() elif number_type == float: create_num = number_type decimal_ctx = _dummy_context_manager is_zero_or_subnormal = lambda n: n==0 else: raise TypeError("Expected a float type, e.g., float or decimal context") with decimal_ctx: zero = create_num('0.0') # Find signifianct digits by comparing floats of increasing # number of digits, differing in the last digit only, until # they numerically compare as being equal. sigdigits = None n = 0 while True: n = n + 1 pfx = '0.' + '1'*n a = create_num( pfx + '0') for sfx in '123456789': # Check all possible last digits to # avoid any partial-decimal. b = create_num( pfx + sfx ) if (a+zero) == (b+zero): sigdigits = n break if sigdigits: break # Find exponent limits. First find order of magnitude and # then use a binary search to find the exact exponent. base = '1.' + '1'*(sigdigits-1) base0 = '1.' + '1'*(sigdigits-2) minexp, maxexp = None, None for expsign in ('+','-'): minv = 0; maxv = 10 # First find order of magnitude of exponent limit while True: try: s = base + 'e' + expsign + str(maxv) s0 = base0 + 'e' + expsign + str(maxv) f = create_num( s ) + zero f0 = create_num( s0 ) + zero except numeric_exceptions: f = None if not f or not str(f)[0].isdigit() or is_zero_or_subnormal(f) or f==f0: break else: minv = maxv maxv = maxv * 10 # Now do a binary search to find exact limit while True: if minv+1 == maxv: if expsign=='+': maxexp = minv else: minexp = minv break elif maxv < minv: if expsign=='+': maxexp = None else: minexp = None break m = (minv + maxv) // 2 try: s = base + 'e' + expsign + str(m) s0 = base0 + 'e' + expsign + str(m) f = create_num( s ) + zero f0 = create_num( s0 ) + zero except numeric_exceptions: f = None else: if not f or not str(f)[0].isdigit(): f = None elif is_zero_or_subnormal(f) or f==f0: f = None if not f: # infinite maxv = m else: minv = m return _namedtuple('float_limits', ['significant_digits', 'max_exponent', 'min_exponent'])( sigdigits, maxexp, -minexp ) float_sigdigits, float_maxexp, float_minexp = determine_float_limits( float ) # For backwards compatibility with older demjson versions: def determine_float_precision(): v = determine_float_limits( float ) return ( v.significant_digits, v.max_exponent ) # ---------------------------------------------------------------------- # The undefined value. # # ECMAScript has an undefined value (similar to yet distinct from null). # Neither Python or strict JSON have support undefined, but to allow # JavaScript behavior we must simulate it. class _undefined_class(object): """Represents the ECMAScript 'undefined' value.""" __slots__ = [] def __repr__(self): return self.__module__ + '.undefined' def __str__(self): return 'undefined' def __nonzero__(self): return False undefined = _undefined_class() syntax_error = _undefined_class() # same as undefined, but has separate identity del _undefined_class # ---------------------------------------------------------------------- # Non-Numbers: NaN, Infinity, -Infinity # # ECMAScript has official support for non-number floats, although # strict JSON does not. Python doesn't either. So to support the # full JavaScript behavior we must try to add them into Python, which # is unfortunately a bit of black magic. If our python implementation # happens to be built on top of IEEE 754 we can probably trick python # into using real floats. Otherwise we must simulate it with classes. def _nonnumber_float_constants(): """Try to return the Nan, Infinity, and -Infinity float values. This is necessarily complex because there is no standard platform-independent way to do this in Python as the language (opposed to some implementation of it) doesn't discuss non-numbers. We try various strategies from the best to the worst. If this Python interpreter uses the IEEE 754 floating point standard then the returned values will probably be real instances of the 'float' type. Otherwise a custom class object is returned which will attempt to simulate the correct behavior as much as possible. """ try: # First, try (mostly portable) float constructor. Works under # Linux x86 (gcc) and some Unices. nan = float('nan') inf = float('inf') neginf = float('-inf') except ValueError: try: # Try the AIX (PowerPC) float constructors nan = float('NaNQ') inf = float('INF') neginf = float('-INF') except ValueError: try: # Next, try binary unpacking. Should work under # platforms using IEEE 754 floating point. import struct, sys xnan = '7ff8000000000000'.decode('hex') # Quiet NaN xinf = '7ff0000000000000'.decode('hex') xcheck = 'bdc145651592979d'.decode('hex') # -3.14159e-11 # Could use float.__getformat__, but it is a new python feature, # so we use sys.byteorder. if sys.byteorder == 'big': nan = struct.unpack('d', xnan)[0] inf = struct.unpack('d', xinf)[0] check = struct.unpack('d', xcheck)[0] else: nan = struct.unpack('d', xnan[::-1])[0] inf = struct.unpack('d', xinf[::-1])[0] check = struct.unpack('d', xcheck[::-1])[0] neginf = - inf if check != -3.14159e-11: raise ValueError('Unpacking raw IEEE 754 floats does not work') except (ValueError, TypeError): # Punt, make some fake classes to simulate. These are # not perfect though. For instance nan * 1.0 == nan, # as expected, but 1.0 * nan == 0.0, which is wrong. class nan(float): """An approximation of the NaN (not a number) floating point number.""" def __repr__(self): return 'nan' def __str__(self): return 'nan' def __add__(self,x): return self def __radd__(self,x): return self def __sub__(self,x): return self def __rsub__(self,x): return self def __mul__(self,x): return self def __rmul__(self,x): return self def __div__(self,x): return self def __rdiv__(self,x): return self def __divmod__(self,x): return (self,self) def __rdivmod__(self,x): return (self,self) def __mod__(self,x): return self def __rmod__(self,x): return self def __pow__(self,exp): return self def __rpow__(self,exp): return self def __neg__(self): return self def __pos__(self): return self def __abs__(self): return self def __lt__(self,x): return False def __le__(self,x): return False def __eq__(self,x): return False def __neq__(self,x): return True def __ge__(self,x): return False def __gt__(self,x): return False def __complex__(self,*a): raise NotImplementedError('NaN can not be converted to a complex') if decimal: nan = decimal.Decimal('NaN') else: nan = nan() class inf(float): """An approximation of the +Infinity floating point number.""" def __repr__(self): return 'inf' def __str__(self): return 'inf' def __add__(self,x): return self def __radd__(self,x): return self def __sub__(self,x): return self def __rsub__(self,x): return self def __mul__(self,x): if x is neginf or x < 0: return neginf elif x == 0: return nan else: return self def __rmul__(self,x): return self.__mul__(x) def __div__(self,x): if x == 0: raise ZeroDivisionError('float division') elif x < 0: return neginf else: return self def __rdiv__(self,x): if x is inf or x is neginf or x is nan: return nan return 0.0 def __divmod__(self,x): if x == 0: raise ZeroDivisionError('float divmod()') elif x < 0: return (nan,nan) else: return (self,self) def __rdivmod__(self,x): if x is inf or x is neginf or x is nan: return (nan, nan) return (0.0, x) def __mod__(self,x): if x == 0: raise ZeroDivisionError('float modulo') else: return nan def __rmod__(self,x): if x is inf or x is neginf or x is nan: return nan return x def __pow__(self, exp): if exp == 0: return 1.0 else: return self def __rpow__(self, x): if -1 < x < 1: return 0.0 elif x == 1.0: return 1.0 elif x is nan or x is neginf or x < 0: return nan else: return self def __neg__(self): return neginf def __pos__(self): return self def __abs__(self): return self def __lt__(self,x): return False def __le__(self,x): if x is self: return True else: return False def __eq__(self,x): if x is self: return True else: return False def __neq__(self,x): if x is self: return False else: return True def __ge__(self,x): return True def __gt__(self,x): return True def __complex__(self,*a): raise NotImplementedError('Infinity can not be converted to a complex') if decimal: inf = decimal.Decimal('Infinity') else: inf = inf() class neginf(float): """An approximation of the -Infinity floating point number.""" def __repr__(self): return '-inf' def __str__(self): return '-inf' def __add__(self,x): return self def __radd__(self,x): return self def __sub__(self,x): return self def __rsub__(self,x): return self def __mul__(self,x): if x is self or x < 0: return inf elif x == 0: return nan else: return self def __rmul__(self,x): return self.__mul__(self) def __div__(self,x): if x == 0: raise ZeroDivisionError('float division') elif x < 0: return inf else: return self def __rdiv__(self,x): if x is inf or x is neginf or x is nan: return nan return -0.0 def __divmod__(self,x): if x == 0: raise ZeroDivisionError('float divmod()') elif x < 0: return (nan,nan) else: return (self,self) def __rdivmod__(self,x): if x is inf or x is neginf or x is nan: return (nan, nan) return (-0.0, x) def __mod__(self,x): if x == 0: raise ZeroDivisionError('float modulo') else: return nan def __rmod__(self,x): if x is inf or x is neginf or x is nan: return nan return x def __pow__(self,exp): if exp == 0: return 1.0 else: return self def __rpow__(self, x): if x is nan or x is inf or x is inf: return nan return 0.0 def __neg__(self): return inf def __pos__(self): return self def __abs__(self): return inf def __lt__(self,x): return True def __le__(self,x): return True def __eq__(self,x): if x is self: return True else: return False def __neq__(self,x): if x is self: return False else: return True def __ge__(self,x): if x is self: return True else: return False def __gt__(self,x): return False def __complex__(self,*a): raise NotImplementedError('-Infinity can not be converted to a complex') if decimal: neginf = decimal.Decimal('-Infinity') else: neginf = neginf(0) return nan, inf, neginf nan, inf, neginf = _nonnumber_float_constants() del _nonnumber_float_constants # ---------------------------------------------------------------------- # Integers class json_int( (1L).__class__ ): # Have to specify base this way to satisfy 2to3 """A subclass of the Python int/long that remembers its format (hex,octal,etc). Initialize it the same as an int, but also accepts an additional keyword argument 'number_format' which should be one of the NUMBER_FORMAT_* values. n = json_int( x[, base, number_format=NUMBER_FORMAT_DECIMAL] ) """ def __new__(cls, *args, **kwargs): if 'number_format' in kwargs: number_format = kwargs['number_format'] del kwargs['number_format'] if number_format not in (NUMBER_FORMAT_DECIMAL, NUMBER_FORMAT_HEX, NUMBER_FORMAT_OCTAL, NUMBER_FORMAT_LEGACYOCTAL, NUMBER_FORMAT_BINARY): raise TypeError("json_int(): Invalid value for number_format argument") else: number_format = NUMBER_FORMAT_DECIMAL obj = super(json_int,cls).__new__(cls,*args,**kwargs) obj._jsonfmt = number_format return obj @property def number_format(self): """The original radix format of the number""" return self._jsonfmt def json_format(self): """Returns the integer value formatted as a JSON literal""" fmt = self._jsonfmt if fmt == NUMBER_FORMAT_HEX: return format(self, '#x') elif fmt == NUMBER_FORMAT_OCTAL: return format(self, '#o') elif fmt == NUMBER_FORMAT_BINARY: return format(self, '#b') elif fmt == NUMBER_FORMAT_LEGACYOCTAL: if self==0: return '0' # For some reason Python's int doesn't do '00' elif self < 0: return '-0%o' % (-self) else: return '0%o' % self else: return str(self) # ---------------------------------------------------------------------- # String processing helpers def skipstringsafe( s, start=0, end=None ): i = start #if end is None: # end = len(s) unsafe = helpers.unsafe_string_chars while i < end and s[i] not in unsafe: #c = s[i] #if c in unsafe_string_chars: # break i += 1 return i def skipstringsafe_slow( s, start=0, end=None ): i = start if end is None: end = len(s) while i < end: c = s[i] if c == '"' or c == "'" or c == '\\' or ord(c) <= 0x1f: break i += 1 return i def extend_list_with_sep( orig_seq, extension_seq, sepchar='' ): if not sepchar: orig_seq.extend( extension_seq ) else: for i, x in enumerate(extension_seq): if i > 0: orig_seq.append( sepchar ) orig_seq.append( x ) def extend_and_flatten_list_with_sep( orig_seq, extension_seq, separator='' ): for i, part in enumerate(extension_seq): if i > 0 and separator: orig_seq.append( separator ) orig_seq.extend( part ) # ---------------------------------------------------------------------- # Unicode UTF-32 # ---------------------------------------------------------------------- def _make_raw_bytes( byte_list ): """Takes a list of byte values (numbers) and returns a bytes (Python 3) or string (Python 2) """ if _py_major >= 3: b = bytes( byte_list ) else: b = ''.join(chr(n) for n in byte_list) return b import codecs class utf32(codecs.CodecInfo): """Unicode UTF-32 and UCS4 encoding/decoding support. This is for older Pythons whch did not have UTF-32 codecs. JSON requires that all JSON implementations must support the UTF-32 encoding (as well as UTF-8 and UTF-16). But earlier versions of Python did not provide a UTF-32 codec, so we must implement UTF-32 ourselves in case we need it. See http://en.wikipedia.org/wiki/UTF-32 """ BOM_UTF32_BE = _make_raw_bytes([ 0, 0, 0xFE, 0xFF ]) #'\x00\x00\xfe\xff' BOM_UTF32_LE = _make_raw_bytes([ 0xFF, 0xFE, 0, 0 ]) #'\xff\xfe\x00\x00' @staticmethod def lookup( name ): """A standard Python codec lookup function for UCS4/UTF32. If if recognizes an encoding name it returns a CodecInfo structure which contains the various encode and decoder functions to use. """ ci = None name = name.upper() if name in ('UCS4BE','UCS-4BE','UCS-4-BE','UTF32BE','UTF-32BE','UTF-32-BE'): ci = codecs.CodecInfo( utf32.utf32be_encode, utf32.utf32be_decode, name='utf-32be') elif name in ('UCS4LE','UCS-4LE','UCS-4-LE','UTF32LE','UTF-32LE','UTF-32-LE'): ci = codecs.CodecInfo( utf32.utf32le_encode, utf32.utf32le_decode, name='utf-32le') elif name in ('UCS4','UCS-4','UTF32','UTF-32'): ci = codecs.CodecInfo( utf32.encode, utf32.decode, name='utf-32') return ci @staticmethod def encode( obj, errors='strict', endianness=None, include_bom=True ): """Encodes a Unicode string into a UTF-32 encoded byte string. Returns a tuple: (bytearray, num_chars) The errors argument should be one of 'strict', 'ignore', or 'replace'. The endianness should be one of: * 'B', '>', or 'big' -- Big endian * 'L', '<', or 'little' -- Little endien * None -- Default, from sys.byteorder If include_bom is true a Byte-Order Mark will be written to the beginning of the string, otherwise it will be omitted. """ import sys, struct # Make a container that can store bytes if _py_major >= 3: f = bytearray() write = f.extend def tobytes(): return bytes(f) else: try: import cStringIO as sio except ImportError: import StringIO as sio f = sio.StringIO() write = f.write tobytes = f.getvalue if not endianness: endianness = sys.byteorder if endianness.upper()[0] in ('B>'): big_endian = True elif endianness.upper()[0] in ('L<'): big_endian = False else: raise ValueError("Invalid endianness %r: expected 'big', 'little', or None" % endianness) pack = struct.pack packspec = '>L' if big_endian else ' maxunicode or (0xD800 <= n <= 0xDFFF): if errors == 'strict': raise UnicodeDecodeError('utf32',obj,i,i+4,'Invalid code point U+%04X' % n) elif errors == 'replace': chars.append( unichr(0xFFFD) ) elif errors == 'backslashreplace': if n > 0xffff: esc = "\\u%04x" % (n,) else: esc = "\\U%08x" % (n,) for esc_c in esc: chars.append( esc_c ) elif errors == 'xmlcharrefreplace': esc = "&#%d;" % (n,) for esc_c in esc: chars.append( esc_c ) else: # ignore pass else: chars.append( helpers.safe_unichr(n) ) return (u''.join( chars ), num_bytes) @staticmethod def utf32le_decode( obj, errors='strict' ): """Decodes a UTF-32LE (little endian) byte string into a Unicode string.""" return utf32.decode( obj, errors=errors, endianness='L' ) @staticmethod def utf32be_decode( obj, errors='strict' ): """Decodes a UTF-32BE (big endian) byte string into a Unicode string.""" return utf32.decode( obj, errors=errors, endianness='B' ) # ---------------------------------------------------------------------- # Helper functions # ---------------------------------------------------------------------- def _make_unsafe_string_chars(): import unicodedata unsafe = [] for c in [unichr(i) for i in range(0x100)]: if c == u'"' or c == u'\\' \ or unicodedata.category( c ) in ['Cc','Cf','Zl','Zp']: unsafe.append( c ) return u''.join( unsafe ) class helpers(object): """A set of utility functions.""" hexdigits = '0123456789ABCDEFabcdef' octaldigits = '01234567' unsafe_string_chars = _make_unsafe_string_chars() import sys maxunicode = sys.maxunicode always_use_custom_codecs = False # If True use demjson's codecs # before system codecs. This # is mainly here for testing. javascript_reserved_words = frozenset([ # Keywords (plus "let") (ECMAScript 6 section 11.6.2.1) 'break','case','catch','class','const','continue', 'debugger','default','delete','do','else','export', 'extends','finally','for','function','if','import', 'in','instanceof','let','new','return','super', 'switch','this','throw','try','typeof','var','void', 'while','with','yield', # Future reserved words (ECMAScript 6 section 11.6.2.2) 'enum','implements','interface','package', 'private','protected','public','static', # null/boolean literals 'null','true','false' ]) @staticmethod def make_raw_bytes( byte_list ): """Constructs a byte array (bytes in Python 3, str in Python 2) from a list of byte values (0-255). """ return _make_raw_bytes( byte_list ) @staticmethod def is_hex_digit( c ): """Determines if the given character is a valid hexadecimal digit (0-9, a-f, A-F).""" return (c in helpers.hexdigits) @staticmethod def is_octal_digit( c ): """Determines if the given character is a valid octal digit (0-7).""" return (c in helpers.octaldigits) @staticmethod def is_binary_digit( c ): """Determines if the given character is a valid binary digit (0 or 1).""" return (c == '0' or c == '1') @staticmethod def char_is_json_ws( c ): """Determines if the given character is a JSON white-space character""" return c in ' \t\n\r' @staticmethod def safe_unichr( codepoint ): """Just like Python's unichr() but works in narrow-Unicode Pythons.""" if codepoint >= 0x10000 and codepoint > helpers.maxunicode: # Narrow-Unicode python, construct a UTF-16 surrogate pair. w1, w2 = helpers.make_surrogate_pair( codepoint ) if w2 is None: c = unichr(w1) else: c = unichr(w1) + unichr(w2) else: c = unichr(codepoint) return c @staticmethod def char_is_unicode_ws( c ): """Determines if the given character is a Unicode space character""" if not isinstance(c,unicode): c = unicode(c) if c in u' \t\n\r\f\v': return True import unicodedata return unicodedata.category(c) == 'Zs' @staticmethod def char_is_json_eol( c ): """Determines if the given character is a JSON line separator""" return c in '\n\r' @staticmethod def char_is_unicode_eol( c ): """Determines if the given character is a Unicode line or paragraph separator. These correspond to CR and LF as well as Unicode characters in the Zl or Zp categories. """ return c in u'\r\n\u2028\u2029' @staticmethod def char_is_identifier_leader( c ): """Determines if the character may be the first character of a JavaScript identifier. """ return c.isalpha() or c in '_$' @staticmethod def char_is_identifier_tail( c ): """Determines if the character may be part of a JavaScript identifier. """ return c.isalnum() or c in u'_$\u200c\u200d' @staticmethod def extend_and_flatten_list_with_sep( orig_seq, extension_seq, separator='' ): for i, part in enumerate(extension_seq): if i > 0 and separator: orig_seq.append( separator ) orig_seq.extend( part ) @staticmethod def strip_format_control_chars( txt ): """Filters out all Unicode format control characters from the string. ECMAScript permits any Unicode "format control characters" to appear at any place in the source code. They are to be ignored as if they are not there before any other lexical tokenization occurs. Note that JSON does not allow them, except within string literals. * Ref. ECMAScript section 7.1. * http://en.wikipedia.org/wiki/Unicode_control_characters There are dozens of Format Control Characters, for example: U+00AD SOFT HYPHEN U+200B ZERO WIDTH SPACE U+2060 WORD JOINER """ import unicodedata txt2 = filter( lambda c: unicodedata.category(unicode(c)) != 'Cf', txt ) # 2to3 NOTE: The following is needed to work around a broken # Python3 conversion in which filter() will be transformed # into a list rather than a string. if not isinstance(txt2,basestring): txt2 = u''.join(txt2) return txt2 @staticmethod def lookup_codec( encoding ): """Wrapper around codecs.lookup(). Returns None if codec not found, rather than raising a LookupError. """ import codecs if isinstance( encoding, codecs.CodecInfo ): return encoding encoding = encoding.lower() import codecs if helpers.always_use_custom_codecs: # Try custom utf32 first, then standard python codecs cdk = utf32.lookup(encoding) if not cdk: try: cdk = codecs.lookup( encoding ) except LookupError: cdk = None else: # Try standard python codecs first, then custom utf32 try: cdk = codecs.lookup( encoding ) except LookupError: cdk = utf32.lookup( encoding ) return cdk @staticmethod def auto_detect_encoding( s ): """Takes a string (or byte array) and tries to determine the Unicode encoding it is in. Returns the encoding name, as a string. """ if not s or len(s)==0: return "utf-8" # Get the byte values of up to the first 4 bytes ords = [] for i in range(0, min(len(s),4)): x = s[i] if isinstance(x, basestring): x = ord(x) ords.append( x ) # Look for BOM marker import sys, codecs bom2, bom3, bom4 = None, None, None if len(s) >= 2: bom2 = s[:2] if len(s) >= 3: bom3 = s[:3] if len(s) >= 4: bom4 = s[:4] # Assign values of first four bytes to: a, b, c, d; and last byte to: z a, b, c, d, z = None, None, None, None, None if len(s) >= 1: a = ords[0] if len(s) >= 2: b = ords[1] if len(s) >= 3: c = ords[2] if len(s) >= 4: d = ords[3] z = s[-1] if isinstance(z, basestring): z = ord(z) if bom4 and ( (hasattr(codecs,'BOM_UTF32_LE') and bom4 == codecs.BOM_UTF32_LE) or (bom4 == utf32.BOM_UTF32_LE) ): encoding = 'utf-32le' s = s[4:] elif bom4 and ( (hasattr(codecs,'BOM_UTF32_BE') and bom4 == codecs.BOM_UTF32_BE) or (bom4 == utf32.BOM_UTF32_BE) ): encoding = 'utf-32be' s = s[4:] elif bom2 and bom2 == codecs.BOM_UTF16_LE: encoding = 'utf-16le' s = s[2:] elif bom2 and bom2 == codecs.BOM_UTF16_BE: encoding = 'utf-16be' s = s[2:] elif bom3 and bom3 == codecs.BOM_UTF8: encoding = 'utf-8' s = s[3:] # No BOM, so autodetect encoding used by looking at first four # bytes according to RFC 4627 section 3. The first and last bytes # in a JSON document will be ASCII. The second byte will be ASCII # unless the first byte was a quotation mark. elif len(s)>=4 and a==0 and b==0 and c==0 and d!=0: # UTF-32BE (0 0 0 x) encoding = 'utf-32be' elif len(s)>=4 and a!=0 and b==0 and c==0 and d==0 and z==0: # UTF-32LE (x 0 0 0 [... 0]) encoding = 'utf-32le' elif len(s)>=2 and a==0 and b!=0: # UTF-16BE (0 x) encoding = 'utf-16be' elif len(s)>=2 and a!=0 and b==0 and z==0: # UTF-16LE (x 0 [... 0]) encoding = 'utf-16le' elif ord('\t') <= a <= 127: # First byte appears to be ASCII, so guess UTF-8. encoding = 'utf8' else: raise ValueError("Can not determine the Unicode encoding for byte stream") return encoding @staticmethod def unicode_decode( txt, encoding=None ): """Takes a string (or byte array) and tries to convert it to a Unicode string. Returns a named tuple: (string, codec, bom) The 'encoding' argument, if supplied, should either the name of a character encoding, or an instance of codecs.CodecInfo. If the encoding argument is None or "auto" then the encoding is automatically determined, if possible. Any BOM (Byte Order Mark) that is found at the beginning of the input will be stripped off and placed in the 'bom' portion of the returned value. """ if isinstance(txt, unicode): res = _namedtuple('DecodedString',['string','codec','bom'])( txt, None, None ) else: if encoding is None or encoding == 'auto': encoding = helpers.auto_detect_encoding( txt ) cdk = helpers.lookup_codec( encoding ) if not cdk: raise LookupError("Can not find codec for encoding %r" % encoding) try: # Determine if codec takes arguments; try a decode of nothing cdk.decode( helpers.make_raw_bytes([]), errors='strict' ) except TypeError: cdk_kw = {} # This coded doesn't like the errors argument else: cdk_kw = {'errors': 'strict'} unitxt, numbytes = cdk.decode( txt, **cdk_kw ) # DO THE DECODE HERE! # Remove BOM if present if len(unitxt) > 0 and unitxt[0] == u'\uFEFF': bom = cdk.encode(unitxt[0])[0] unitxt = unitxt[1:] elif len(unitxt) > 0 and unitxt[0] == u'\uFFFE': # Reversed BOM raise UnicodeDecodeError(cdk.name,txt,0,0,"Wrong byte order, found reversed BOM U+FFFE") else: bom = None res = _namedtuple('DecodedString',['string','codec','bom'])( unitxt, cdk, bom ) return res @staticmethod def surrogate_pair_as_unicode( c1, c2 ): """Takes a pair of unicode surrogates and returns the equivalent unicode character. The input pair must be a surrogate pair, with c1 in the range U+D800 to U+DBFF and c2 in the range U+DC00 to U+DFFF. """ n1, n2 = ord(c1), ord(c2) if n1 < 0xD800 or n1 > 0xDBFF or n2 < 0xDC00 or n2 > 0xDFFF: raise JSONDecodeError('illegal Unicode surrogate pair',(c1,c2)) a = n1 - 0xD800 b = n2 - 0xDC00 v = (a << 10) | b v += 0x10000 return helpers.safe_unichr(v) @staticmethod def unicode_as_surrogate_pair( c ): """Takes a single unicode character and returns a sequence of surrogate pairs. The output of this function is a tuple consisting of one or two unicode characters, such that if the input character is outside the BMP range then the output is a two-character surrogate pair representing that character. If the input character is inside the BMP then the output tuple will have just a single character...the same one. """ n = ord(c) w1, w2 = helpers.make_surrogate_pair(n) if w2 is None: return (unichr(w1),) else: return (unichr(w1), unichr(w2)) @staticmethod def make_surrogate_pair( codepoint ): """Given a Unicode codepoint (int) returns a 2-tuple of surrogate codepoints.""" if codepoint < 0x10000: return (codepoint,None) # in BMP, surrogate pair not required v = codepoint - 0x10000 vh = (v >> 10) & 0x3ff # highest 10 bits vl = v & 0x3ff # lowest 10 bits w1 = 0xD800 | vh w2 = 0xDC00 | vl return (w1, w2) @staticmethod def isnumbertype( obj ): """Is the object of a Python number type (excluding complex)?""" return isinstance(obj, (int,long,float)) \ and not isinstance(obj, bool) \ or obj is nan or obj is inf or obj is neginf \ or (decimal and isinstance(obj, decimal.Decimal)) @staticmethod def is_negzero( n ): """Is the number value a negative zero?""" if isinstance( n, float ): return n == 0.0 and repr(n).startswith('-') elif decimal and isinstance( n, decimal.Decimal ): return n.is_zero() and n.is_signed() else: return False @staticmethod def is_nan( n ): """Is the number a NaN (not-a-number)?""" if isinstance( n, float ): return n is nan or n.hex() == 'nan' or n != n elif decimal and isinstance( n, decimal.Decimal ): return n.is_nan() else: return False @staticmethod def is_infinite( n ): """Is the number infinite?""" if isinstance( n, float ): return n is inf or n is neginf or n.hex() in ('inf','-inf') elif decimal and isinstance( n, decimal.Decimal ): return n.is_infinite() else: return False @staticmethod def isstringtype( obj ): """Is the object of a Python string type?""" if isinstance(obj, basestring): return True # Must also check for some other pseudo-string types import types, UserString return isinstance(obj, types.StringTypes) \ or isinstance(obj, UserString.UserString) ## or isinstance(obj, UserString.MutableString) @staticmethod def decode_hex( hexstring ): """Decodes a hexadecimal string into it's integer value.""" # We don't use the builtin 'hex' codec in python since it can # not handle odd numbers of digits, nor raise the same type # of exceptions we want to. n = 0 for c in hexstring: if '0' <= c <= '9': d = ord(c) - ord('0') elif 'a' <= c <= 'f': d = ord(c) - ord('a') + 10 elif 'A' <= c <= 'F': d = ord(c) - ord('A') + 10 else: raise ValueError('Not a hexadecimal number', hexstring) # Could use ((n << 4 ) | d), but python 2.3 issues a FutureWarning. n = (n * 16) + d return n @staticmethod def decode_octal( octalstring ): """Decodes an octal string into it's integer value.""" n = 0 for c in octalstring: if '0' <= c <= '7': d = ord(c) - ord('0') else: raise ValueError('Not an octal number', octalstring) # Could use ((n << 3 ) | d), but python 2.3 issues a FutureWarning. n = (n * 8) + d return n @staticmethod def decode_binary( binarystring ): """Decodes a binary string into it's integer value.""" n = 0 for c in binarystring: if c == '0': d = 0 elif c == '1': d = 1 else: raise ValueError('Not an binary number', binarystring) # Could use ((n << 3 ) | d), but python 2.3 issues a FutureWarning. n = (n * 2) + d return n @staticmethod def format_timedelta_iso( td ): """Encodes a datetime.timedelta into ISO-8601 Time Period format. """ d = td.days s = td.seconds ms = td.microseconds m, s = divmod(s,60) h, m = divmod(m,60) a = ['P'] if d: a.append( '%dD' % d ) if h or m or s or ms: a.append( 'T' ) if h: a.append( '%dH' % h ) if m: a.append( '%dM' % m ) if s or ms: if ms: a.append( '%d.%06d' % (s,ms) ) else: a.append( '%d' % s ) if len(a)==1: a.append('T0S') return ''.join(a) # ---------------------------------------------------------------------- # File position indicator # ---------------------------------------------------------------------- class position_marker(object): """A position marks a specific place in a text document. It consists of the following attributes: * line - The line number, starting at 1 * column - The column on the line, starting at 0 * char_position - The number of characters from the start of the document, starting at 0 * text_after - (optional) a short excerpt of the text of document starting at the current position Lines are separated by any Unicode line separator character. As an exception a CR+LF character pair is treated as being a single line separator demarcation. Columns are simply a measure of the number of characters after the start of a new line, starting at 0. Visual effects caused by Unicode characters such as combining characters, bidirectional text, zero-width characters and so on do not affect the computation of the column regardless of visual appearance. The char_position is a count of the number of characters since the beginning of the document, starting at 0. As used within the buffered_stream class, if the document starts with a Unicode Byte Order Mark (BOM), the BOM prefix is NOT INCLUDED in the count. """ def __init__(self, offset=0, line=1, column=0, text_after=None): self.__char_position = offset self.__line = line self.__column = column self.__text_after = text_after self.__at_end = False self.__last_was_cr = False @property def line(self): """The current line within the document, starts at 1.""" return self.__line @property def column(self): """The current character column from the beginning of the document, starts at 0. """ return self.__column @property def char_position(self): """The current character offset from the beginning of the document, starts at 0. """ return self.__char_position @property def at_start(self): """Returns True if the position is at the start of the document.""" return (self.char_position == 0) @property def at_end(self): """Returns True if the position is at the end of the document. This property must be set by the user. """ return self.__at_end @at_end.setter def at_end(self, b): """Sets the at_end property to True or False. """ self.__at_end = bool(b) @property def text_after(self): """Returns a textual excerpt starting at the current position. This property must be set by the user. """ return self.__at_end @text_after.setter def text_after(self, value): """Sets the text_after property to a given string. """ self.__text_after = value def __repr__(self): s = "%s(offset=%r,line=%r,column=%r" \ % (self.__class__.__name__, self.__char_position, self.__line, self.__column) if self.text_after: s += ",text_after=%r" % (self.text_after,) s += ")" return s def describe(self, show_text=True): """Returns a human-readable description of the position, in English.""" s = "line %d, column %d, offset %d" % (self.__line, self.__column, self.__char_position) if self.at_start: s += " (AT-START)" elif self.at_end: s += " (AT-END)" if show_text and self.text_after: s += ", text %r" % (self.text_after) return s def __str__(self): """Same as the describe() function.""" return self.describe( show_text=True ) def copy( self ): """Create a copy of the position object.""" p = self.__class__() p.__char_position = self.__char_position p.__line = self.__line p.__column = self.__column p.text_after = self.__text_after p.at_end = self.at_end p.__last_was_cr = self.__last_was_cr return p def rewind( self ): """Set the position to the start of the document.""" if not self.at_start: self.text_after = None self.at_end = False self.__char_position = 0 self.__line = 1 self.__column = 0 self.__last_was_cr = False def advance( self, s ): """Advance the position from its current place according to the given string of characters. """ if s: self.text_after = None for c in s: self.__char_position += 1 if c == '\n' and self.__last_was_cr: self.__last_was_cr = False elif helpers.char_is_unicode_eol(c): self.__line += 1 self.__column = 0 self.__last_was_cr = (c == '\r') else: self.__column += 1 self.__last_was_cr = False # ---------------------------------------------------------------------- # Buffered Stream Reader # ---------------------------------------------------------------------- class buffered_stream(object): """A helper class for the JSON parser. It allows for reading an input document, while handling some low-level Unicode issues as well as tracking the current position in terms of line and column position. """ def __init__(self, txt='', encoding=None): self.reset() self.set_text( txt, encoding ) def reset(self): """Clears the state to nothing.""" self.__pos = position_marker() self.__saved_pos = [] # Stack of saved positions self.__bom = helpers.make_raw_bytes([]) # contains copy of byte-order mark, if any self.__codec = None # The CodecInfo self.__encoding = None # The name of the codec's encoding self.__input_is_bytes = False self.__rawbuf = None self.__raw_bytes = None self.__cmax = 0 self.num_ws_skipped = 0 def save_position(self): self.__saved_pos.append( self.__pos.copy() ) return True def clear_saved_position(self): if self.__saved_pos: self.__saved_pos.pop() return True else: return False def restore_position(self): try: old_pos = self.__saved_pos.pop() # Can raise IndexError except IndexError, err: raise IndexError("Attempt to restore buffer position that was never saved") else: self.__pos = old_pos return True def _find_codec(self, encoding): if encoding is None: self.__codec = None self.__encoding = None elif isinstance(encoding, codecs.CodecInfo): self.__codec = encoding self.__encoding = self.__codec.name else: self.__encoding = encoding self.__codec = helpers.lookup_codec( encoding ) if not self.__codec: raise JSONDecodeError('no codec available for character encoding',encoding) return self.__codec def set_text( self, txt, encoding=None ): """Changes the input text document and rewinds the position to the start of the new document. """ import sys self.rewind() self.__codec = None self.__bom = None self.__rawbuf = u'' self.__cmax = 0 # max number of chars in input try: decoded = helpers.unicode_decode( txt, encoding ) except JSONError: raise except Exception, err: # Re-raise as a JSONDecodeError e2 = sys.exc_info() newerr = JSONDecodeError("a Unicode decoding error occurred") # Simulate Python 3's: "raise X from Y" exception chaining newerr.__cause__ = err newerr.__traceback__ = e2[2] raise newerr else: self.__codec = decoded.codec self.__bom = decoded.bom self.__rawbuf = decoded.string self.__cmax = len(self.__rawbuf) def __repr__(self): return '<%s at %r text %r>' % (self.__class__.__name__, self.__pos, self.text_context) def rewind(self): """Resets the position back to the start of the input text.""" self.__pos.rewind() @property def codec(self): """The codec object used to perform Unicode decoding, or None.""" return self.__codec @property def bom(self): """The Unicode Byte-Order Mark (BOM), if any, that was present at the start of the input text. The returned BOM is a string of the raw bytes, and is not Unicode-decoded. """ return self.__bom @property def cpos(self): """The current character offset from the start of the document.""" return self.__pos.char_position @property def position(self): """The current position (as a position_marker object). Returns a copy. """ p = self.__pos.copy() p.text_after = self.text_context p.at_end = self.at_end return p @property def at_start(self): """Returns True if the position is currently at the start of the document, or False otherwise. """ return self.__pos.at_start @property def at_end(self): """Returns True if the position is currently at the end of the document, of False otherwise. """ c = self.peek() return (not c) def at_ws(self, allow_unicode_whitespace=True): """Returns True if the current position contains a white-space character. """ c = self.peek() if not c: return False elif allow_unicode_whitespace: return helpers.char_is_unicode_ws(c) else: return helpers.char_is_json_ws(c) def at_eol(self, allow_unicode_eol=True): """Returns True if the current position contains an end-of-line control character. """ c = self.peek() if not c: return True # End of file is treated as end of line elif allow_unicode_eol: return helpers.char_is_unicode_eol(c) else: return helpers.char_is_json_eol(c) def peek( self, offset=0 ): """Returns the character at the current position, or at a given offset away from the current position. If the position is beyond the limits of the document size, then an empty string '' is returned. """ i = self.cpos + offset if i < 0 or i >= self.__cmax: return '' return self.__rawbuf[i] def peekstr( self, span=1, offset=0 ): """Returns one or more characters starting at the current position, or at a given offset away from the current position, and continuing for the given span length. If the offset and span go outside the limit of the current document size, then the returned string may be shorter than the requested span length. """ i = self.cpos + offset j = i + span if i < 0 or i >= self.__cmax: return '' return self.__rawbuf[i : j] @property def text_context( self, context_size = 20 ): """A short human-readable textual excerpt of the document at the current position, in English. """ context_size = max( context_size, 4 ) s = self.peekstr(context_size + 1) if not s: return '' if len(s) > context_size: s = s[:context_size - 3] + "..." return s def startswith( self, s ): """Determines if the text at the current position starts with the given string. See also method: pop_if_startswith() """ s2 = self.peekstr( len(s) ) return s == s2 def skip( self, span=1 ): """Advances the current position by one (or the given number) of characters. Will not advance beyond the end of the document. Returns the number of characters skipped. """ i = self.cpos self.__pos.advance( self.peekstr(span) ) return self.cpos - i def skipuntil( self, testfn ): """Advances the current position until a given predicate test function succeeds, or the end of the document is reached. Returns the actual number of characters skipped. The provided test function should take a single unicode character and return a boolean value, such as: lambda c : c == '.' # Skip to next period See also methods: skipwhile() and popuntil() """ i = self.cpos while True: c = self.peek() if not c or testfn(c): break else: self.__pos.advance(c) return self.cpos - i def skipwhile( self, testfn ): """Advances the current position until a given predicate test function fails, or the end of the document is reached. Returns the actual number of characters skipped. The provided test function should take a single unicode character and return a boolean value, such as: lambda c : c.isdigit() # Skip all digits See also methods: skipuntil() and popwhile() """ return self.skipuntil( lambda c: not testfn(c) ) def skip_to_next_line( self, allow_unicode_eol=True ): """Advances the current position to the start of the next line. Will not advance beyond the end of the file. Note that the two-character sequence CR+LF is recognized as being just a single end-of-line marker. """ ln = self.__pos.line while True: c = self.pop() if not c or self.__pos.line > ln: if c == '\r' and self.peek() == '\n': self.skip() break def skipws( self, allow_unicode_whitespace=True ): """Advances the current position past all whitespace, or until the end of the document is reached. """ if allow_unicode_whitespace: n = self.skipwhile( helpers.char_is_unicode_ws ) else: n = self.skipwhile( helpers.char_is_json_ws ) self.num_ws_skipped += n return n def pop( self ): """Returns the character at the current position and advances the position to the next character. At the end of the document this function returns an empty string. """ c = self.peek() if c: self.__pos.advance( c ) return c def popstr( self, span=1, offset=0 ): """Returns a string of one or more characters starting at the current position, and advances the position to the following character after the span. Will not go beyond the end of the document, so the returned string may be shorter than the requested span. """ s = self.peekstr(span) if s: self.__pos.advance( s ) return s def popif( self, testfn ): """Just like the pop() function, but only returns the character if the given predicate test function succeeds. """ c = self.peek() if c and testfn(c): self.__pos.advance( c ) return c return '' def pop_while_in( self, chars ): """Pops a sequence of characters at the current position as long as each of them is in the given set of characters. """ if not isinstance( chars, (set,frozenset)): cset = set( chars ) c = self.peek() if c and c in cset: s = self.popwhile( lambda c: c and c in cset ) return s return None def pop_identifier( self, match=None ): """Pops the sequence of characters at the current position that match the syntax for a JavaScript identifier. """ c = self.peek() if c and helpers.char_is_identifier_leader(c): s = self.popwhile( helpers.char_is_identifier_tail ) return s return None def pop_if_startswith( self, s ): """Pops the sequence of characters if they match the given string. See also method: startswith() """ s2 = self.peekstr( len(s) ) if s2 != s: return NULL self.__pos.advance( s2 ) return s2 def popwhile( self, testfn, maxchars=None ): """Pops all the characters starting at the current position as long as each character passes the given predicate function test. If maxchars a numeric value instead of None then then no more than that number of characters will be popped regardless of the predicate test. See also methods: skipwhile() and popuntil() """ s = [] i = 0 while maxchars is None or i < maxchars: c = self.popif( testfn ) if not c: break s.append( c ) i += 1 return ''.join(s) def popuntil( self, testfn, maxchars=None ): """Just like popwhile() method except the predicate function should return True to stop the sequence rather than False. See also methods: skipuntil() and popwhile() """ return popwhile( lambda c: not testfn(c), maxchars=maxchars ) def __getitem__( self, index ): """Returns the character at the given index relative to the current position. If the index goes beyond the end of the input, or prior to the start when negative, then '' is returned. If the index provided is a slice object, then that range of characters is returned as a string. Note that a stride value other than 1 is not supported in the slice. To use a slice, do: s = my_stream[ 1:4 ] """ if isinstance( index, slice ): return self.peekstr( index.stop - index.start, index.start ) else: return self.peek( index ) # ---------------------------------------------------------------------- # Exception classes. # ---------------------------------------------------------------------- class JSONException(Exception): """Base class for all JSON-related exceptions. """ pass class JSONSkipHook(JSONException): """An exception to be raised by user-defined code within hook callbacks to indicate the callback does not want to handle the situation. """ pass class JSONStopProcessing(JSONException): """Can be raised by anyplace, including inside a hook function, to cause the entire encode or decode process to immediately stop with an error. """ pass class JSONAbort(JSONException): pass class JSONError(JSONException): """Base class for all JSON-related errors. In addition to standard Python exceptions, these exceptions may also have additional properties: * severity - One of: 'fatal', 'error', 'warning', 'info' * position - An indication of the position in the input where the error occured. * outer_position - A secondary position (optional) that gives the location of the outer data item in which the error occured, such as the beginning of a string or an array. * context_description - A string that identifies the context in which the error occured. Default is "Context". """ severities = frozenset(['fatal','error','warning','info']) def __init__(self, message, *args, **kwargs ): self.severity = 'error' self._position = None self.outer_position = None self.context_description = None for kw,val in kwargs.items(): if kw == 'severity': if val not in self.severities: raise TypeError("%s given invalid severity %r" % (self.__class__.__name__, val)) self.severity = val elif kw == 'position': self.position = val elif kw == 'outer_position': self.outer_position = val elif kw == 'context_description' or kw=='context': self.context_description = val else: raise TypeError("%s does not accept %r keyword argument" % (self.__class__.__name__, kw)) super( JSONError, self ).__init__( message, *args ) self.message = message @property def position(self): return self._position @position.setter def position(self, pos): if pos == 0: self._position = 0 #position_marker() # start of input else: self._position = pos def __repr__(self): s = "%s(%r" % (self.__class__.__name__, self.message) for a in self.args[1:]: s += ", %r" % (a,) if self.position: s += ", position=%r" % (self.position,) if self.outer_position: s += ", outer_position=%r" % (self.outer_position,) s += ", severity=%r)" % (self.severity,) return s def pretty_description(self, show_positions=True, filename=None): if filename: pfx = filename.rstrip().rstrip(':') + ':' else: pfx = '' # Print file position as numeric abbreviation err = pfx if self.position == 0: err += '0:0:' elif self.position: err += '%d:%d:' % (self.position.line, self.position.column) else: err += ' ' # Print severity and main error message err += " %s: %s" % (self.severity.capitalize(), self.message) if len(self.args) > 1: err += ': ' for anum, a in enumerate(self.args[1:]): if anum > 1: err += ', ' astr = repr(a) if len(astr) > 30: astr = astr[:30] + '...' err += astr # Print out exception chain e2 = self while e2: if hasattr(e2,'__cause__') and isinstance(e2.__cause__,Exception): e2 = e2.__cause__ e2desc = str(e2).strip() if not e2desc: e2desc = repr(e2).strip() err += "\n | Cause: %s" % e2desc.strip().replace('\n','\n | ') else: e2 = None # Show file position if show_positions and self.position is not None: if self.position == 0: err += "\n | At start of input" else: err += "\n | At %s" % (self.position.describe(show_text=False),) if self.position.text_after: err += "\n | near text: %r" % (self.position.text_after,) # Show context if show_positions and self.outer_position: if self.context_description: cdesc = self.context_description.capitalize() else: cdesc = "Context" err += "\n | %s started at %s" % (cdesc, self.outer_position.describe(show_text=False),) if self.outer_position.text_after: err += "\n | with text: %r" % (self.outer_position.text_after,) return err class JSONDecodeError(JSONError): """An exception class raised when a JSON decoding error (syntax error) occurs.""" pass class JSONDecodeHookError(JSONDecodeError): """An exception that occured within a decoder hook. The original exception is available in the 'hook_exception' attribute. """ def __init__(self, hook_name, exc_info, encoded_obj, *args, **kwargs): self.hook_name = hook_name if not exc_info: exc_info = (None, None, None) exc_type, self.hook_exception, self.hook_traceback = exc_info self.object_type = type(encoded_obj) msg = "Hook %s raised %r while decoding type <%s>" % (hook_name, self.hook_exception.__class__.__name__, self.object_type.__name__) if len(args) >= 1: msg += ": " + args[0] args = args[1:] super(JSONDecodeHookError,self).__init__(msg, *args,**kwargs) class JSONEncodeError(JSONError): """An exception class raised when a python object can not be encoded as a JSON string.""" pass class JSONEncodeHookError(JSONEncodeError): """An exception that occured within an encoder hook. The original exception is available in the 'hook_exception' attribute. """ def __init__(self, hook_name, exc_info, encoded_obj, *args, **kwargs): self.hook_name = hook_name if not exc_info: exc_info = (None, None, None) exc_type, self.hook_exception, self.hook_traceback = exc_info self.object_type = type(encoded_obj) msg = "Hook %s raised %r while encoding type <%s>" % (self.hook_name, self.hook_exception.__class__.__name__, self.object_type.__name__) if len(args) >= 1: msg += ": " + args[0] args = args[1:] super(JSONEncodeHookError,self).__init__(msg, *args, **kwargs) #---------------------------------------------------------------------- # Encoder state object #---------------------------------------------------------------------- class encode_state(object): """An internal transient object used during JSON encoding to record the current construction state. """ def __init__(self, jsopts=None, parent=None ): import sys self.chunks = [] if not parent: self.parent = None self.nest_level = 0 self.options = jsopts self.escape_unicode_test = False # or a function f(unichar)=>True/False else: self.parent = parent self.nest_level = parent.nest_level + 1 self.escape_unicode_test = parent.escape_unicode_test self.options = parent.options def make_substate(self): return encode_state( parent=self ) def join_substate(self, other_state): self.chunks.extend( other_state.chunks ) other_state.chunks = [] def append(self, s): """Adds a string to the end of the current JSON document""" self.chunks.append(s) def combine(self): """Returns the accumulated string and resets the state to empty""" s = ''.join( self.chunks ) self.chunks = [] return s def __eq__(self, other_state): return self.nest_level == other_state.nest_level and \ self.chunks == other_state.chunks def __lt__(self, other_state): if self.nest_level != other_state.nest_level: return self.nest_level < other_state.nest_level return self.chunks < other_state.chunks #---------------------------------------------------------------------- # Decoder statistics #---------------------------------------------------------------------- class decode_statistics(object): """An object that records various statistics about a decoded JSON document. """ int8_max = 0x7f int8_min = - 0x7f - 1 int16_max = 0x7fff int16_min = - 0x7fff - 1 int32_max = 0x7fffffff int32_min = - 0x7fffffff - 1 int64_max = 0x7fffffffffffffff int64_min = - 0x7fffffffffffffff - 1 double_int_max = 2**53 - 1 double_int_min = - (2**53 - 1) def __init__(self): # Nesting self.max_depth = 0 self.max_items_in_array = 0 self.max_items_in_object = 0 # Integer stats self.num_ints = 0 self.num_ints_8bit = 0 self.num_ints_16bit = 0 self.num_ints_32bit = 0 self.num_ints_53bit = 0 # ints which will overflow IEEE doubles self.num_ints_64bit = 0 self.num_ints_long = 0 self.num_negative_zero_ints = 0 # Floating-point stats self.num_negative_zero_floats = 0 self.num_floats = 0 self.num_floats_decimal = 0 # overflowed 'float' # String stats self.num_strings = 0 self.max_string_length = 0 self.total_string_length = 0 self.min_codepoint = None self.max_codepoint = None # Other data type stats self.num_arrays = 0 self.num_objects = 0 self.num_bools = 0 self.num_nulls = 0 self.num_undefineds = 0 self.num_nans = 0 self.num_infinities = 0 self.num_comments = 0 self.num_identifiers = 0 # JavaScript identifiers self.num_excess_whitespace = 0 @property def num_infinites(self): """Misspelled 'num_infinities' for backwards compatibility""" return self.num_infinities def pretty_description(self, prefix=''): import unicodedata lines = [ "Number of integers:", " 8-bit: %5d (%d to %d)" % (self.num_ints_8bit, self.int8_min, self.int8_max), " 16-bit: %5d (%d to %d)" % (self.num_ints_16bit, self.int16_min, self.int16_max), " 32-bit: %5d (%d to %d)" % (self.num_ints_32bit, self.int32_min, self.int32_max), " > 53-bit: %5d (%d to %d - overflows JavaScript)" % (self.num_ints_53bit, self.double_int_min, self.double_int_max), " 64-bit: %5d (%d to %d)" % (self.num_ints_64bit, self.int64_min, self.int64_max), " > 64 bit: %5d (not portable, may require a \"Big Num\" package)" % self.num_ints_long, " total ints: %5d" % self.num_ints, " Num -0: %5d (negative-zero integers are not portable)" % self.num_negative_zero_ints, "Number of floats:", " doubles: %5d" % self.num_floats, " > doubles: %5d (will overflow IEEE doubles)" % self.num_floats_decimal, " total flts: %5d" % (self.num_floats + self.num_floats_decimal), " Num -0.0: %5d (negative-zero floats are usually portable)" % self.num_negative_zero_floats, "Number of:", " nulls: %5d" % self.num_nulls, " booleans: %5d" % self.num_bools, " arrays: %5d" % self.num_arrays, " objects: %5d" % self.num_objects, "Strings:", " number: %5d strings" % self.num_strings, " max length: %5d characters" % self.max_string_length, " total chars: %5d across all strings" % self.total_string_length, ] if self.min_codepoint is not None: cp = 'U+%04X' % self.min_codepoint try: charname = unicodedata.name(unichr(self.min_codepoint)) except ValueError: charname = '? UNKNOWN CHARACTER' lines.append(" min codepoint: %6s (%s)" % (cp, charname)) else: lines.append(" min codepoint: %6s" % ('n/a',)) if self.max_codepoint is not None: cp = 'U+%04X' % self.max_codepoint try: charname = unicodedata.name(unichr(self.max_codepoint)) except ValueError: charname = '? UNKNOWN CHARACTER' lines.append(" max codepoint: %6s (%s)" % (cp, charname)) else: lines.append(" max codepoint: %6s" % ('n/a',)) lines.extend([ "Other JavaScript items:", " NaN: %5d" % self.num_nans, " Infinite: %5d" % self.num_infinities, " undefined: %5d" % self.num_undefineds, " Comments: %5d" % self.num_comments, " Identifiers: %5d" % self.num_identifiers, "Max items in any array: %5d" % self.max_items_in_array, "Max keys in any object: %5d" % self.max_items_in_object, "Max nesting depth: %5d" % self.max_depth, ]) if self.total_chars == 0: lines.append("Unnecessary whitespace: 0 of 0 characters") else: lines.append( "Unnecessary whitespace: %5d of %d characters (%.2f%%)" \ % (self.num_excess_whitespace, self.total_chars, self.num_excess_whitespace * 100.0 / self.total_chars) ) if prefix: return '\n'.join([ prefix+s for s in lines ]) + '\n' else: return '\n'.join( lines ) + '\n' #---------------------------------------------------------------------- # Decoder state object #---------------------------------------------------------------------- class decode_state(object): """An internal transient object used during JSON decoding to record the current parsing state and error messages. """ def __init__(self, options=None): self.reset() self.options = options def reset(self): """Clears all errors, statistics, and input text.""" self.buf = None self.errors = [] self.obj = None self.cur_depth = 0 # how deep in nested structures are we? self.stats = decode_statistics() self._have_warned_nonbmp = False self._have_warned_long_string = False self._have_warned_max_depth = False @property def should_stop(self): if self.has_fatal: return True return False @property def has_errors(self): """Have any errors been seen already?""" return len([err for err in self.errors if err.severity in ('fatal','error')]) > 0 @property def has_fatal(self): """Have any errors been seen already?""" return len([err for err in self.errors if err.severity in ('fatal',)]) > 0 def set_input( self, txt, encoding=None ): """Initialize the state by setting the input document text.""" import sys self.reset() try: self.buf = buffered_stream( txt, encoding=encoding ) except JSONError as err: err.position = 0 # set position to start of file err.severity = 'fatal' self.push_exception( err ) except Exception as err: # Re-raise as JSONDecodeError e2 = sys.exc_info() newerr = JSONDecodeError("Error while reading input", position=0, severity='fatal') self.push_exception( err ) self.buf = None else: if self.buf.bom: self.push_cond( self.options.bom, "JSON document was prefixed by a BOM (Byte Order Mark)", self.buf.bom ) if not self.buf: self.push_fatal( "Aborting, can not read JSON document.", position=0 ) def push_exception(self, exc): """Add an already-built exception to the error list.""" self.errors.append(exc) def push_fatal(self, message, *args, **kwargs): """Create a fatal error.""" kwargs['severity'] = 'fatal' self.__push_err( message, *args, **kwargs) def push_error(self, message, *args, **kwargs): """Create an error.""" kwargs['severity'] = 'error' self.__push_err( message, *args, **kwargs) def push_warning(self, message, *args, **kwargs): """Create a warning.""" kwargs['severity'] = 'warning' self.__push_err( message, *args, **kwargs) def push_info(self, message, *args, **kwargs): """Create a informational message.""" kwargs['severity'] = 'info' self.__push_err( message, *args, **kwargs) def push_cond(self, behavior_value, message, *args, **kwargs): """Creates an conditional error or warning message. The behavior value (from json_options) controls whether a message will be pushed and whether it is an error or warning message. """ if behavior_value == ALLOW: return elif behavior_value == WARN: kwargs['severity'] = 'warning' else: kwargs['severity'] = 'error' self.__push_err( message, *args, **kwargs ) def __push_err(self, message, *args, **kwargs): """Stores an error in the error list.""" position = None outer_position = None severity = 'error' context_description = None for kw, val in kwargs.items(): if kw == 'position': position = val elif kw == 'outer_position': outer_position = val elif kw == 'severity': severity = val elif kw == 'context_description' or kw == 'context': context_description=val else: raise TypeError('Unknown keyword argument',kw) if position is None and self.buf: position = self.buf.position # Current position err = JSONDecodeError( message, position=position, outer_position=outer_position, context_description=context_description, severity=severity, *args) self.push_exception( err ) def update_depth_stats(self, **kwargs): st = self.stats st.max_depth = max(st.max_depth, self.cur_depth) if not self._have_warned_max_depth and self.cur_depth > self.options.warn_max_depth: self._have_warned_max_depth = True self.push_cond( self.options.non_portable, "Arrays or objects nested deeper than %d levels may not be portable" \ % self.options.warn_max_depth ) def update_string_stats(self, s, **kwargs): st = self.stats st.num_strings += 1 st.max_string_length = max(st.max_string_length, len(s)) st.total_string_length += len(s) if self.options.warn_string_length and len(s) > self.options.warn_string_length and not self._have_warned_long_string: self._have_warned_long_string = True self.push_cond( self.options.non_portable, "Strings longer than %d may not be portable" % self.options.warn_string_length, **kwargs ) if len(s) > 0: mincp = ord(min(s)) maxcp = ord(max(s)) if st.min_codepoint is None: st.min_codepoint = mincp st.max_codepoint = maxcp else: st.min_codepoint = min( st.min_codepoint, mincp ) st.max_codepoint = max( st.max_codepoint, maxcp ) if maxcp > 0xffff and not self._have_warned_nonbmp: self._have_warned_nonbmp = True self.push_cond( self.options.non_portable, "Strings containing non-BMP characters (U+%04X) may not be portable" % maxcp, **kwargs ) def update_negzero_int_stats(self, **kwargs): st = self.stats st.num_negative_zero_ints += 1 if st.num_negative_zero_ints == 1: # Only warn once self.push_cond( self.options.non_portable, "Negative zero (-0) integers are usually not portable", **kwargs ) def update_negzero_float_stats(self, **kwargs): st = self.stats st.num_negative_zero_floats += 1 if st.num_negative_zero_floats == 1: # Only warn once self.push_cond( self.options.non_portable, "Negative zero (-0.0) numbers may not be portable", **kwargs) def update_float_stats(self, float_value, **kwargs): st = self.stats if 'sign' in kwargs: del kwargs['sign'] if helpers.is_negzero( float_value ): self.update_negzero_float_stats( **kwargs ) if helpers.is_infinite( float_value ): st.num_infinities += 1 if isinstance(float_value, decimal.Decimal): st.num_floats_decimal += 1 if st.num_floats_decimal == 1: # Only warn once self.push_cond( self.options.non_portable, "Floats larger or more precise than an IEEE \"double\" may not be portable", **kwargs) elif isinstance(float_value, float): st.num_floats += 1 def update_integer_stats(self, int_value, **kwargs ): sign=kwargs.get('sign', 1) if 'sign' in kwargs: del kwargs['sign'] if int_value == 0 and sign < 0: self.update_negzero_int_stats( **kwargs ) if sign < 0: int_value = - int_value st = self.stats st.num_ints += 1 if st.int8_min <= int_value <= st.int8_max: st.num_ints_8bit += 1 elif st.int16_min <= int_value <= st.int16_max: st.num_ints_16bit += 1 elif st.int32_min <= int_value <= st.int32_max: st.num_ints_32bit += 1 elif st.int64_min <= int_value <= st.int64_max: st.num_ints_64bit += 1 else: st.num_ints_long += 1 if int_value < st.double_int_min or st.double_int_max < int_value: st.num_ints_53bit += 1 if st.num_ints_53bit == 1: # Only warn once self.push_cond( self.options.non_portable, "Integers larger than 53-bits are not portable", **kwargs ) # ---------------------------------------------------------------------- # JSON strictness options # ---------------------------------------------------------------------- STRICTNESS_STRICT = 'strict' STRICTNESS_WARN = 'warn' STRICTNESS_TOLERANT = 'tolerant' ALLOW = 'allow' WARN = 'warn' FORBID = 'forbid' # For float_type option NUMBER_AUTO = 'auto' NUMBER_FLOAT = 'float' NUMBER_DECIMAL = 'decimal' # For json_int class NUMBER_FORMAT_DECIMAL = 'decimal' NUMBER_FORMAT_HEX = 'hex' NUMBER_FORMAT_LEGACYOCTAL = 'legacyoctal' NUMBER_FORMAT_OCTAL = 'octal' NUMBER_FORMAT_BINARY = 'binary' class _behaviors_metaclass(type): """Meta class used to establish a set of "behavior" options. Classes that use this meta class must defined a class-level variable called '_behaviors' that is a list of tuples, each of which describes one behavior and is like: (behavior_name, documentation). Also define a second class-level variable called '_behavior_values' which is a list of the permitted values for each behavior, each being strings. For each behavior (e.g., pretty), and for each value (e.g., yes) the following methods/properties will be created: * pretty - value of 'pretty' behavior (read-write) * ispretty_yes - returns True if 'pretty' is 'yes' For each value (e.g., pink) the following methods/properties will be created: * all_behaviors - set of all behaviors (read-only) * pink_behaviors - set of behaviors with value of 'pink' (read-only) * set_all('pink') * set_all_pink() - set all behaviors to value of 'pink' """ def __new__(cls, clsname, bases, attrs): values = attrs.get('_behavior_values') attrs['values'] = property( lambda self: set(self._behavior_values), doc='Set of possible behavior values') behaviors = attrs.get('_behaviors') def get_behavior(self, name): """Returns the value for a given behavior""" try: return getattr( self, '_behavior_'+name ) except AttributeError: raise ValueError('Unknown behavior',name) attrs['get_behavior'] = get_behavior def set_behavior(self, name, value): """Changes the value for a given behavior""" if value not in self._behavior_values: raise ValueError('Unknown value for behavior',value) varname = '_behavior_'+name if hasattr(self,varname): setattr( self, varname, value ) else: raise ValueError('Unknown behavior',name) attrs['set_behavior'] = set_behavior def describe_behavior(self,name): """Returns documentation about a given behavior.""" for n, doc in self._behaviors: if n==name: return doc else: raise AttributeError('No such behavior',name) attrs['describe_behavior'] = describe_behavior for name, doc in behaviors: attrs['_behavior_'+name] = True for v in values: vs = v + '_' + name def getx(self,name=name,forval=v): return self.get_behavior(name) == forval attrs['is_'+v+'_'+name] = property(getx,doc=v.capitalize()+' '+doc) # method value_name() fnset = lambda self,_name=name,_value=v: self.set_behavior(_name,_value) fnset.__name__ = v+'_'+name fnset.__doc__ = 'Set behavior ' + name + ' to ' + v + "." attrs[fnset.__name__] = fnset def get_value_for_behavior(self,name=name): return self.get_behavior(name) def set_value_for_behavior(self,value,name=name): self.set_behavior(name,value) attrs[name] = property(get_value_for_behavior,set_value_for_behavior,doc=doc) @property def all_behaviors(self): """Returns the names of all known behaviors.""" return set([t[0] for t in self._behaviors]) attrs['all_behaviors'] = all_behaviors def set_all(self,value): """Changes all behaviors to have the given value.""" if value not in self._behavior_values: raise ValueError('Unknown behavior',value) for name in self.all_behaviors: setattr(self, '_behavior_'+name, value) attrs['set_all'] = set_all def is_all(self,value): """Determines if all the behaviors have the given value.""" if value not in self._behavior_values: raise ValueError('Unknown behavior',value) for name in self.all_behaviors: if getattr(self, '_behavior_'+name) != value: return False return True attrs['is_all'] = is_all for v in values: # property value_behaviors def getbehaviorsfor(self,value=v): return set([name for name in self.all_behaviors if getattr(self,name)==value]) attrs[v+'_behaviors'] = property(getbehaviorsfor,doc='Return the set of behaviors with the value '+v+'.') # method set_all_value() setfn = lambda self,_value=v: set_all(self,_value) setfn.__name__ = 'set_all_'+v setfn.__doc__ = 'Set all behaviors to value ' + v + "." attrs[setfn.__name__] = setfn # property is_all_value attrs['is_all_'+v] = property( lambda self,v=v: is_all(self,v), doc='Determines if all the behaviors have the value '+v+'.') def behaviors_eq(self, other): """Determines if two options objects are equivalent.""" if self.all_behaviors != other.all_behaviors: return False return self.allowed_behaviors == other.allowed_behaviors attrs['__eq__'] = behaviors_eq return super(_behaviors_metaclass, cls).__new__(cls, clsname, bases, attrs) SORT_NONE = 'none' SORT_PRESERVE = 'preserve' SORT_ALPHA = 'alpha' SORT_ALPHA_CI = 'alpha_ci' SORT_SMART = 'smart' sorting_methods = { SORT_NONE: "Do not sort, resulting order may be random", SORT_PRESERVE: "Preserve original order when reformatting", SORT_ALPHA: "Sort strictly alphabetically", SORT_ALPHA_CI: "Sort alphabetically case-insensitive", SORT_SMART: "Sort alphabetically and numerically (DEFAULT)" } sorting_method_aliases = { 'ci': SORT_ALPHA_CI } def smart_sort_transform( key ): numfmt = '%012d' digits = '0123456789' zero = ord('0') if not key: key = '' elif isinstance( key, (int,long) ): key = numfmt % key elif isinstance( key, basestring ): keylen = len(key) words = [] i=0 while i < keylen: if key[i] in digits: num = 0 while i < keylen and key[i] in digits: num *= 10 num += ord(key[i]) - zero i += 1 words.append( numfmt % num ) else: words.append( key[i].upper() ) i += 1 key = ''.join(words) else: key = str(key) return key # Find Enum type (introduced in Python 3.4) try: from enum import Enum as _enum except ImportError: _enum = None # Find OrderedDict type try: from collections import OrderedDict as _OrderedDict except ImportError: _OrderedDict = None class json_options(object): """Options to determine how strict the decoder or encoder should be.""" __metaclass__ = _behaviors_metaclass _behavior_values = (ALLOW, WARN, FORBID) _behaviors = ( ("all_numeric_signs", "Numbers may be prefixed by any \'+\' and \'-\', e.g., +4, -+-+77"), ("any_type_at_start", "A JSON document may start with any type, not just arrays or objects"), ("comments", "JavaScript comments, both /*...*/ and //... styles"), ("control_char_in_string", "Strings may contain raw control characters without \\u-escaping"), ("hex_numbers", "Hexadecimal numbers, e.g., 0x1f"), ("binary_numbers", "Binary numbers, e.g., 0b1001"), ("octal_numbers", "New-style octal numbers, e.g., 0o731 (see leading-zeros for legacy octals)"), ("initial_decimal_point", "Floating-point numbers may start with a decimal point (no units digit)"), ("extended_unicode_escapes", "Extended Unicode escape sequence \\u{..} for non-BMP characters"), ("js_string_escapes", "All JavaScript character \\-escape sequences may be in strings"), ("leading_zeros", "Numbers may have extra leading zeros (see --leading-zero-radix option)"), ("non_numbers", "Non-numbers may be used, such as NaN or Infinity"), ("nonescape_characters", "Unknown character \\-escape sequences stand for that character (\\Q -> 'Q')"), ("identifier_keys", "JavaScript identifiers are converted to strings when used as object keys"), ("nonstring_keys", "Value types other than strings (or identifiers) may be used as object keys"), ("omitted_array_elements", "Arrays may have omitted/elided elements, e.g., [1,,3] == [1,undefined,3]"), ("single_quoted_strings", "Strings may be delimited with both double (\") and single (\') quotation marks"), ("trailing_comma", "A final comma may end the list of array or object members"), ("trailing_decimal_point", "Floating-point number may end with a decimal point and no following fractional digits"), ("undefined_values", "The JavaScript 'undefined' value may be used"), ("format_control_chars", "Unicode \"format control characters\" may appear in the input"), ("unicode_whitespace", "Treat any Unicode whitespace character as valid whitespace"), # Never legal ("leading_zeros", "Numbers may have leading zeros"), # Normally warnings ("duplicate_keys", "Objects may have repeated keys"), ("zero_byte", "Strings may contain U+0000, which may not be safe for C-based programs"), ("bom", "A JSON document may start with a Unicode BOM (Byte Order Mark)"), ("non_portable", "Anything technically valid but likely to cause data portablibity issues"), ) # end behavior list def reset_to_defaults(self): # Plain attrs (other than above behaviors) are simply copied # by value, either during initialization (via keyword # arguments) or via the copy() method. self._plain_attrs = ['leading_zero_radix', 'encode_namedtuple_as_object', 'encode_enum_as', 'encode_compactly', 'escape_unicode', 'always_escape_chars', 'warn_string_length', 'warn_max_depth', 'int_as_float', 'decimal_context', 'float_type', 'keep_format', 'date_format', 'datetime_format', 'time_format', 'timedelta_format', 'sort_keys', 'indent_amount', 'indent_tab_width', 'indent_limit', 'max_items_per_line', 'py2str_encoding' ] self.strictness = STRICTNESS_WARN self._leading_zero_radix = 8 # via property: leading_zero_radix self._sort_keys = SORT_SMART # via property: sort_keys self.int_as_float = False self.float_type = NUMBER_AUTO self.decimal_context = (decimal.DefaultContext if decimal else None) self.keep_format = False # keep track of when numbers are hex, octal, etc. self.encode_namedtuple_as_object = True self._encode_enum_as = 'name' # via property self.encode_compactly = True self.escape_unicode = False self.always_escape_chars = None # None, or a set of Unicode characters to always escape self.warn_string_length = 0xfffd # with 16-bit length prefix self.warn_max_depth = 64 self.date_format = 'iso' # or strftime format self.datetime_format = 'iso' # or strftime format self.time_format = 'iso' # or strftime format self.timedelta_format = 'iso' # or 'hms' self.sort_keys = SORT_ALPHA self.indent_amount = 2 self.indent_tab_width = 0 # 0, or number of equivalent spaces self.indent_limit = None self.max_items_per_line = 1 # When encoding how many items per array/object # before breaking into multiple lines # For interpreting Python 2 'str' types: if _py_major == 2: self.py2str_encoding = 'ascii' else: self.py2str_encoding = None def __init__(self, **kwargs): """Set JSON encoding and decoding options. If 'strict' is set to True, then only strictly-conforming JSON output will be produced. Note that this means that some types of values may not be convertable and will result in a JSONEncodeError exception. If 'compactly' is set to True, then the resulting string will have all extraneous white space removed; if False then the string will be "pretty printed" with whitespace and indentation added to make it more readable. If 'escape_unicode' is set to True, then all non-ASCII characters will be represented as a unicode escape sequence; if False then the actual real unicode character will be inserted if possible. The 'escape_unicode' can also be a function, which when called with a single argument of a unicode character will return True if the character should be escaped or False if it should not. """ self.reset_to_defaults() if 'strict' in kwargs: # Do this keyword first, so other keywords may override specific behaviors self.strictness = kwargs['strict'] for kw,val in kwargs.items(): if kw == 'compactly': # alias for 'encode_compactly' self.encode_compactly = val elif kw == 'strict': pass # Already handled elif kw == 'warnings': if val: self.suppress_warnings() elif kw == 'html_safe' or kw == 'xml_safe': if bool(val): if self.always_escape_chars is None: self.always_escape_chars = set(u'<>/&') else: self.always_escape_chars.update( set(u'<>/&') ) elif kw == 'always_escape': if val: if self.always_escape_chars is None: self.always_escape_chars = set(val) else: self.always_escape_chars.update( set(val) ) elif kw == 'int_as_float': self.int_as_float = bool(val) elif kw == 'keep_format': self.keep_format = bool(val) elif kw == 'float_type': if val in (NUMBER_AUTO, NUMBER_FLOAT, NUMBER_DECIMAL): self.float_type = val else: raise ValueError("Unknown option %r for argument %r to initialize %s" % (val,kw,self.__class__.__name__)) elif kw == 'decimal' or kw == 'decimal_context': if decimal: if not val or val == 'default': self.decimal_context = decimal.DefaultContext elif val == 'basic': self.decimal_context = decimal.BasicContext elif val == 'extended': self.decimal_context = decimal.ExtendedContext elif isinstance(val, decimal.Context): self.decimal_context = val elif isinstance(val,(int,long)) or val[0].isdigit: prec = int(val) self.decimal_context = decimal.Context( prec=prec ) else: raise ValueError("Option for %r should be a decimal.Context, a number of significant digits, or one of 'default','basic', or 'extended'." % (kw,)) elif kw in ('allow','warn','forbid','prevent','deny'): action = {'allow':ALLOW, 'warn':WARN, 'forbid':FORBID, 'prevent':FORBID, 'deny':FORBID}[ kw ] if isinstance(val,basestring): val = [b.replace('-','_') for b in val.replace(',',' ').split()] for behavior in val: self.set_behavior( behavior, action ) elif kw.startswith('allow_') or kw.startswith('forbid_') or kw.startswith('prevent_') or kw.startswith('deny_') or kw.startswith('warn_'): action, behavior = kw.split('_',1) if action == 'allow': if val: self.set_behavior( behavior, ALLOW ) else: self.set_behavior( behavior, FORBID ) elif action in ('forbid','prevent','deny'): if val: self.set_behavior( behavior, FORBID ) else: self.set_behavior( behavior, ALLOW ) elif action == 'warn': if val: self.set_behavior( behavior, WARN ) else: self.set_behavior( behavior, ALLOW ) elif kw in self._plain_attrs: setattr(self, kw, val) else: raise ValueError("Unknown keyword argument %r to initialize %s" % (kw,self.__class__.__name__)) def copy(self): other = self.__class__() other.copy_from( self ) return other def copy_from(self, other): if self is other: return # Myself! self.strictness = other.strictness # sets behaviors in bulk for name in self.all_behaviors: self.set_behavior( name, other.get_behavior(name) ) for name in self._plain_attrs: val = getattr(other,name) if isinstance(val, set): val = val.copy() elif decimal and isinstance(val, decimal.Decimal): val = val.copy() setattr(self, name, val) def spaces_to_next_indent_level( self, min_spaces=1, subtract=0 ): n = self.indent_amount - subtract if n < 0: n = 0 n = max( min_spaces, n ) return ' ' * n def indentation_for_level( self, level=0 ): """Returns a whitespace string used for indenting.""" if self.indent_limit is not None and level > self.indent_limit: n = self.indent_limit else: n = level n *= self.indent_amount if self.indent_tab_width: tw, sw = divmod(n, self.indent_tab_width) return '\t'*tw + ' '*sw else: return ' ' * n def set_indent( self, num_spaces, tab_width=0, limit=None ): """Changes the indentation properties when outputting JSON in non-compact mode. 'num_spaces' is the number of spaces to insert for each level of indentation, which defaults to 2. 'tab_width', if not 0, is the number of spaces which is equivalent to one tab character. Tabs will be output where possible rather than runs of spaces. 'limit', if not None, is the maximum indentation level after which no further indentation will be output. """ n = int(num_spaces) if n < 0: raise ValueError("indentation amount can not be negative",n) self.indent_amount = n self.indent_tab_width = tab_width self.indent_limit = limit @property def sort_keys(self): """The method used to sort dictionary keys when encoding JSON """ return self._sort_keys @sort_keys.setter def sort_keys(self, method): if not method: self._sort_keys = SORT_NONE elif callable(method): self._sort_keys = method elif method in sorting_methods: self._sort_keys = method elif method in sorting_method_aliases: # alias self._sort_keys = sorting_method_aliases[method] elif method == True: self._sort_keys = SORT_ALPHA else: raise ValueError("Not a valid sorting method: %r" % method) @property def encode_enum_as(self): """The strategy for encoding Python Enum values. """ return self._encode_enum_as @encode_enum_as.setter def encode_enum_as(self, val): if val not in ('name','qname','value'): raise ValueError("encode_enum_as must be one of 'name','qname', or 'value'") self._encode_enum_as = val @property def zero_float(self): """The numeric value 0.0, either a float or a decimal.""" if decimal and self.float_type == NUMBER_DECIMAL: return self.decimal_context.create_decimal('0.0') else: return 0.0 @property def negzero_float(self): """The numeric value -0.0, either a float or a decimal.""" if decimal and self.float_type == NUMBER_DECIMAL: return self.decimal_context.create_decimal('-0.0') else: return -0.0 @property def nan(self): """The numeric value NaN, either a float or a decimal.""" if decimal and self.float_type == NUMBER_DECIMAL: return self.decimal_context.create_decimal('NaN') else: return nan @property def inf(self): """The numeric value Infinity, either a float or a decimal.""" if decimal and self.float_type == NUMBER_DECIMAL: return self.decimal_context.create_decimal('Infinity') else: return inf @property def neginf(self): """The numeric value -Infinity, either a float or a decimal.""" if decimal and self.float_type == NUMBER_DECIMAL: return self.decimal_context.create_decimal('-Infinity') else: return neginf def make_int( self, s, sign=None, number_format=NUMBER_FORMAT_DECIMAL ): """Makes an integer value according to the current options. First argument should be a string representation of the number, or an integer. Returns a number value, which could be an int, float, or decimal. """ if isinstance(sign, (int,long)): if sign < 0: sign = '-' else: sign = '+' if isinstance(s,basestring): if s.startswith('-') or s.startswith('+'): sign = s[0] s = s[1:] if self.int_as_float: # Making a float/decimal if isinstance(s, (int,long)): if self.float_type == NUMBER_DECIMAL: n = self.decimal_context.create_decimal( s ) if sign=='-': n = n.copy_negate() elif s == 0 and sign=='-': n = self.negzero_float elif -999999999999999 <= s <= 999999999999999: n = float(s) if sign=='-': n *= -1 else: n = float(s) if (n == inf or int(n) != s) and self.float_type != NUMBER_FLOAT: n = self.decimal_context.create_decimal( s ) if sign=='-': n = n.copy_negate() elif sign=='-': n *= -1 else: # not already an int n = self.make_float( s, sign ) n2 = self.make_float( s[:-1] + ('9' if s[-1]<='5' else '0'), sign ) if (n==inf or n==n2) and self.float_type != NUMBER_FLOAT: n = self.make_decimal( s, sign ) elif isinstance( s, (int,long) ): # already an integer n = s if sign=='-': if n == 0: n = self.negzero_float else: n *= -1 else: # Making an actual integer try: n = int( s ) except ValueError: n = self.nan else: if sign=='-': if n==0: n = self.negzero_float else: n *= -1 if isinstance(n,(int,long)) and self.keep_format: n = json_int(n, number_format=number_format) return n def make_decimal( self, s, sign='+' ): """Converts a string into a decimal or float value.""" if not decimal or self.float_type == NUMBER_FLOAT: return self.make_float( s, sign ) if s.startswith('-') or s.startswith('+'): sign = s[0] s = s[1:] elif isinstance(sign, (int,long)): if sign < 0: sign = '-' else: sign = '+' try: f = self.decimal_context.create_decimal( s ) except decimal.InvalidOperation: f = self.decimal_context.create_decimal( 'NaN' ) except decimal.Overflow: if sign=='-': f = self.decimal_context.create_decimal( '-Infinity' ) else: f = self.decimal_context.create_decimal( 'Infinity' ) else: if sign=='-': f = f.copy_negate() return f def make_float( self, s, sign='+' ): """Converts a string into a float or decimal value.""" if decimal and self.float_type == NUMBER_DECIMAL: return self.make_decimal( s, sign ) if s.startswith('-') or s.startswith('+'): sign = s[0] s = s[1:] elif isinstance(sign, (int,long)): if sign < 0: sign = '-' else: sign = '+' try: f = float(s) except ValueError: f = nan else: if sign=='-': f *= -1 return f @property def leading_zero_radix(self): """The radix to be used for numbers with leading zeros. 8 or 10 """ return self._leading_zero_radix @leading_zero_radix.setter def leading_zero_radix(self, radix): if isinstance(radix,basestring): try: radix = int(radix) except ValueError: radix = radix.lower() if radix=='octal' or radix=='oct' or radix=='8': radix = 8 elif radix=='decimal' or radix=='dec': radix = 10 if radix not in (8,10): raise ValueError("Radix must either be 8 (octal) or 10 (decimal)") self._leading_zero_radix = radix @property def leading_zero_radix_as_word(self): return {8:'octal', 10:'decimal'}[ self._leading_zero_radix ] def suppress_warnings(self): for name in self.warn_behaviors: self.set_behavior(name, 'allow') @property def allow_or_warn_behaviors(self): """Returns the set of all behaviors that are not forbidden (i.e., are allowed or warned).""" return self.allow_behaviors.union( self.warn_behaviors ) @property def strictness(self): return self._strictness @strictness.setter def strictness(self, strict): """Changes whether the options should be re-configured for strict JSON conformance.""" if strict == STRICTNESS_WARN: self._strictness = STRICTNESS_WARN self.set_all_warn() elif strict == STRICTNESS_STRICT or strict is True: self._strictness = STRICTNESS_STRICT self.keep_format = False self.set_all_forbid() self.warn_duplicate_keys() self.warn_zero_byte() self.warn_bom() self.warn_non_portable() elif strict == STRICTNESS_TOLERANT or strict is False: self._strictness = STRICTNESS_TOLERANT self.set_all_allow() self.warn_duplicate_keys() self.warn_zero_byte() self.warn_leading_zeros() self.leading_zero_radix = 8 self.warn_bom() self.allow_non_portable() else: raise ValueError("Unknown strictness options %r" % strict) self.allow_any_type_at_start() # ---------------------------------------------------------------------- # The main JSON encoder/decoder class. # ---------------------------------------------------------------------- class JSON(object): """An encoder/decoder for JSON data streams. Usually you will call the encode() or decode() methods. The other methods are for lower-level processing. Whether the JSON parser runs in strict mode (which enforces exact compliance with the JSON spec) or the more forgiving non-string mode can be affected by setting the 'strict' argument in the object's initialization; or by assigning True or False to the 'strict' property of the object. You can also adjust a finer-grained control over strictness by allowing or forbidding specific behaviors. You can get a list of all the available behaviors by accessing the 'behaviors' property. Likewise the 'allowed_behaviors' and 'forbidden_behaviors' list which behaviors will be allowed and which will not. Call the allow() or forbid() methods to adjust these. """ _string_quotes = '"\'' _escapes_json = { # character escapes in JSON '"': '"', '/': '/', '\\': '\\', 'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', } _escapes_js = { # character escapes in Javascript '"': '"', '\'': '\'', '\\': '\\', 'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t', 'v': '\v', '0': '\x00' } # Following is a reverse mapping of escape characters, used when we # output JSON. Only those escapes which are always safe (e.g., in JSON) # are here. It won't hurt if we leave questionable ones out. _rev_escapes = {'\n': '\\n', '\t': '\\t', '\b': '\\b', '\r': '\\r', '\f': '\\f', '"': '\\"', '\\': '\\\\' } _optional_rev_escapes = { '/': '\\/' } # only escaped if forced to do so json_syntax_characters = u"{}[]\"\\,:0123456789.-+abcdefghijklmnopqrstuvwxyz \t\n\r" all_hook_names = ('decode_number', 'decode_float', 'decode_object', 'decode_array', 'decode_string', 'encode_value', 'encode_dict', 'encode_dict_key', 'encode_sequence', 'encode_bytes', 'encode_default') def __init__(self, **kwargs): """Creates a JSON encoder/decoder object. You may pass encoding and decoding options either by passing an argument named 'json_options' with an instance of a json_options class; or with individual keyword/values that will be used to initialize a new json_options object. You can also set hooks by using keyword arguments using the hook name; e.g., encode_dict=my_hook_func. """ import sys, unicodedata, re kwargs = kwargs.copy() # Initialize hooks for hookname in self.all_hook_names: if hookname in kwargs: self.set_hook( hookname, kwargs[hookname] ) del kwargs[hookname] else: self.set_hook( hookname, None ) # Set options if 'json_options' in kwargs: self._options = kwargs['json_options'] else: self._options = json_options(**kwargs) # The following is a boolean map of the first 256 characters # which will quickly tell us which of those characters never # need to be escaped. self._asciiencodable = \ [32 <= c < 128 \ and not self._rev_escapes.has_key(chr(c)) \ and not unicodedata.category(unichr(c)) in ['Cc','Cf','Zl','Zp'] for c in range(0,256)] @property def options(self): """The optional behaviors used, e.g., the JSON conformance strictness. Returns an instance of json_options. """ return self._options def clear_hook(self, hookname): """Unsets a hook callback, as previously set with set_hook().""" self.set_hook( hookname, None ) def clear_all_hooks(self): """Unsets all hook callbacks, as previously set with set_hook().""" for hookname in self.all_hook_names: self.clear_hook( hookname ) def set_hook(self, hookname, function): """Sets a user-defined callback function used during encoding or decoding. The 'hookname' argument must be a string containing the name of one of the available hooks, listed below. The 'function' argument must either be None, which disables the hook, or a callable function. Hooks do not stack, if you set a hook it will undo any previously set hook. Netsted values. When decoding JSON that has nested objects or arrays, the decoding hooks will be called once for every corresponding value, even if nested. Generally the decoding hooks will be called from the inner-most value outward, and then left to right. Skipping. Any hook function may raise a JSONSkipHook exception if it does not wish to handle the particular invocation. This will have the effect of skipping the hook for that particular value, as if the hook was net set. AVAILABLE HOOKS: * decode_string Called for every JSON string literal with the Python-equivalent string value as an argument. Expects to get a Python object in return. * decode_float: Called for every JSON number that looks like a float (has a "."). The string representation of the number is passed as an argument. Expects to get a Python object in return. * decode_number: Called for every JSON number. The string representation of the number is passed as an argument. Expects to get a Python object in return. NOTE: If the number looks like a float and the 'decode_float' hook is set, then this hook will not be called. * decode_array: Called for every JSON array. A Python list is passed as the argument, and expects to get a Python object back. NOTE: this hook will get called for every array, even for nested arrays. * decode_object: Called for every JSON object. A Python dictionary is passed as the argument, and expects to get a Python object back. NOTE: this hook will get called for every object, even for nested objects. * encode_value: Called for every Python object which is to be encoded into JSON. * encode_dict: Called for every Python dictionary or anything that looks like a dictionary. * encode_dict_key: Called for every dictionary key. * encode_sequence: Called for every Python sequence-like object that is not a dictionary or string. This includes lists and tuples. * encode_bytes: Called for every Python bytes or bytearray type; or for any memoryview with a byte ('B') item type. (Python 3 only) * encode_default: Called for any Python type which can not otherwise be converted into JSON, even after applying any other encoding hooks. """ if hookname in self.all_hook_names: att = hookname + '_hook' if function != None and not callable(function): raise ValueError("Hook %r must be None or a callable function" % hookname) setattr( self, att, function ) else: raise ValueError("Unknown hook name %r" % hookname) def has_hook(self, hook_name): if not hook_name or hook_name not in self.all_hook_names: return False hook = getattr( self, hook_name + '_hook' ) return callable(hook) def call_hook(self, hook_name, input_object, position=None, *args, **kwargs): """Wrapper function to invoke a user-supplied hook function. This will capture any exceptions raised by the hook and do something appropriate with it. """ import sys if hook_name not in self.all_hook_names: raise AttributeError("No such hook %r" % hook_name) hook = getattr( self, hook_name + '_hook' ) if not callable(hook): raise TypeError("Hook is not callable: %r" % (hook,)) try: rval = hook( input_object, *args, **kwargs ) except JSONSkipHook: raise # Do nothing except Exception, err: exc_info = sys.exc_info() if hook_name.startswith('encode_'): ex_class = JSONEncodeHookError else: ex_class = JSONDecodeHookError if isinstance(err, JSONStopProcessing): severity = 'fatal' else: severity = 'error' newerr = ex_class( hook_name, exc_info, input_object, *args, position=position, severity=severity ) # Simulate Python 3's: "raise X from Y" exception chaining newerr.__cause__ = err newerr.__traceback__ = exc_info[2] raise newerr return rval def isws(self, c): """Determines if the given character is considered as white space. Note that Javscript is much more permissive on what it considers to be whitespace than does JSON. Ref. ECMAScript section 7.2 """ if not self.options.unicode_whitespace: return c in ' \t\n\r' else: if not isinstance(c,unicode): c = unicode(c) if c in u' \t\n\r\f\v': return True import unicodedata return unicodedata.category(c) == 'Zs' def islineterm(self, c): """Determines if the given character is considered a line terminator. Ref. ECMAScript section 7.3 """ if c == '\r' or c == '\n': return True if c == u'\u2028' or c == u'\u2029': # unicodedata.category(c) in ['Zl', 'Zp'] return True return False def recover_parser(self, state): """Try to recover after a syntax error by locating the next "known" position.""" buf = state.buf buf.skipuntil( lambda c: c in ",:[]{}\"\';" or helpers.char_is_unicode_eol(c) ) stopchar = buf.peek() self.skipws(state) if buf.at_end: state.push_info("Could not recover parsing after previous error",position=buf.position) else: state.push_info("Recovering parsing after character %r" % stopchar, position=buf.position) return stopchar def decode_null(self, state): """Intermediate-level decoder for ECMAScript 'null' keyword. Takes a string and a starting index, and returns a Python None object and the index of the next unparsed character. """ buf = state.buf start_position = buf.position kw = buf.pop_identifier() if not kw or kw != 'null': state.push_error("Expected a 'null' keyword'", kw, position=start_position) else: state.stats.num_nulls += 1 return None def encode_undefined(self, state): """Produces the ECMAScript 'undefined' keyword.""" state.append('undefined') def encode_null(self, state): """Produces the JSON 'null' keyword.""" state.append('null') def decode_boolean(self, state): """Intermediate-level decode for JSON boolean literals. Takes a string and a starting index, and returns a Python bool (True or False) and the index of the next unparsed character. """ buf = state.buf start_position = buf.position kw = buf.pop_identifier() if not kw or kw not in ('true','false'): state.push_error("Expected a 'true' or 'false' keyword'", kw, position=start_position) else: state.stats.num_bools += 1 return (kw == 'true') def encode_boolean(self, bval, state): """Encodes the Python boolean into a JSON Boolean literal.""" state.append( 'true' if bool(bval) else 'false' ) def decode_number(self, state): """Intermediate-level decoder for JSON numeric literals. Takes a string and a starting index, and returns a Python suitable numeric type and the index of the next unparsed character. The returned numeric type can be either of a Python int, long, or float. In addition some special non-numbers may also be returned such as nan, inf, and neginf (technically which are Python floats, but have no numeric value.) Ref. ECMAScript section 8.5. """ buf = state.buf self.skipws(state) start_position = buf.position # Use external number parser hook if available if self.has_hook('decode_number') or self.has_hook('decode_float'): c = buf.peek() if c and c in '-+0123456789.': # First chars for a number-like value buf.save_position() nbr = buf.pop_while_in( '-+0123456789abcdefABCDEF' 'NaN' 'Infinity.' ) if '.' in nbr and self.has_hook('decode_float'): hook_name = 'decode_float' elif self.has_hook('decode_number'): hook_name = 'decode_number' else: hook_name = None if hook_name: try: val = self.call_hook( hook_name, nbr, position=start_position ) except JSONSkipHook: pass except JSONError, err: state.push_exception(err) val = undefined else: buf.clear_saved_position() return val # Hook didn't handle it, restore old position buf.restore_position() # Detect initial sign character(s) sign = +1 sign_count = 0 sign_saw_plus = False sign_saw_ws = False c = buf.peek() while c and c in '+-': if c == '-': sign = sign * -1 elif c == '+': sign_saw_plus = True sign_count += 1 buf.skip() if self.skipws_nocomments(state) > 0: sign_saw_ws = True c = buf.peek() if sign_count > 1 or sign_saw_plus: state.push_cond( self.options.all_numeric_signs, 'Numbers may only have a single "-" as a sign prefix', position=start_position) if sign_saw_ws: state.push_error('Spaces may not appear between a +/- number sign and the digits', position=start_position) # Check for ECMAScript symbolic non-numbers if not c: state.push_error('Missing numeric value after sign', position=start_position) self.recover_parser(state) self.stats.num_undefineds += 1 return undefined elif c.isalpha() or c in '_$': kw = buf.popwhile( lambda c: c.isalnum() or c in '_$' ) if kw == 'NaN': state.push_cond( self.options.non_numbers, 'NaN literals are not allowed in strict JSON', position=start_position) state.stats.num_nans += 1 return self.options.nan elif kw == 'Infinity': state.push_cond( self.options.non_numbers, 'Infinity literals are not allowed in strict JSON', position=start_position) state.stats.num_infinities += 1 if sign < 0: return self.options.neginf else: return self.options.inf else: state.push_error('Unknown numeric value keyword', kw, position=start_position) return undefined # Check for radix-prefixed numbers elif c == '0' and (buf.peek(1) in [u'x',u'X']): # ----- HEX NUMBERS 0x123 prefix = buf.popstr(2) digits = buf.popwhile( helpers.is_hex_digit ) state.push_cond( self.options.hex_numbers, 'Hexadecimal literals are not allowed in strict JSON', prefix+digits, position=start_position ) if len(digits)==0: state.push_error('Hexadecimal number is invalid', position=start_position) self.recover_parser(state) return undefined ival = helpers.decode_hex( digits ) state.update_integer_stats( ival, sign=sign, position=start_position ) n = state.options.make_int( ival, sign, number_format=NUMBER_FORMAT_HEX ) return n elif c == '0' and (buf.peek(1) in [u'o','O']): # ----- NEW-STYLE OCTAL NUMBERS 0o123 prefix = buf.popstr(2) digits = buf.popwhile( helpers.is_octal_digit ) state.push_cond( self.options.octal_numbers, "Octal literals are not allowed in strict JSON", prefix+digits, position=start_position ) if len(digits)==0: state.push_error("Octal number is invalid", position=start_position) self.recover_parser(state) return undefined ival = helpers.decode_octal( digits ) state.update_integer_stats( ival, sign=sign, position=start_position ) n = state.options.make_int( ival, sign, number_format=NUMBER_FORMAT_OCTAL ) return n elif c == '0' and (buf.peek(1) in [u'b','B']): # ----- NEW-STYLE BINARY NUMBERS 0b1101 prefix = buf.popstr(2) digits = buf.popwhile( helpers.is_binary_digit ) state.push_cond( self.options.binary_numbers, "Binary literals are not allowed in strict JSON", prefix+digits, position=start_position ) if len(digits)==0: state.push_error("Binary number is invalid", position=start_position) self.recover_parser(state) return undefined ival = helpers.decode_binary( digits ) state.update_integer_stats( ival, sign=sign, position=start_position ) n = state.options.make_int( ival, sign, number_format=NUMBER_FORMAT_BINARY ) return n else: # ----- DECIMAL OR LEGACY-OCTAL NUMBER. 123, 0123 # General syntax is: \d+[\.\d+][e[+-]?\d+] number = buf.popwhile( lambda c: c in '0123456789.+-eE' ) imax = len(number) if imax == 0: state.push_error('Missing numeric value', position=start_position) has_leading_zero = False units_digits = [] # digits making up whole number portion fraction_digits = [] # digits making up fractional portion exponent_digits = [] # digits making up exponent portion (excluding sign) esign = '+' # sign of exponent sigdigits = 0 # number of significant digits (approximate) saw_decimal_point = False saw_exponent = False # Break number into parts in a first pass...use a mini state machine in_part = 'units' for i, c in enumerate(number): if c == '.': if in_part != 'units': state.push_error('Bad number', number, position=start_position) self.recover_parser(state) return undefined in_part = 'fraction' saw_decimal_point = True elif c in 'eE': if in_part == 'exponent': state.push_error('Bad number', number, position=start_position) self.recover_parser(state) return undefined in_part = 'exponent' saw_exponent = True elif c in '+-': if in_part != 'exponent' or exponent_digits: state.push_error('Bad number', number, position=start_position) self.recover_parser(state) return undefined esign = c else: #digit if in_part == 'units': units_digits.append( c ) elif in_part == 'fraction': fraction_digits.append( c ) elif in_part == 'exponent': exponent_digits.append( c ) units_s = ''.join(units_digits) fraction_s = ''.join(fraction_digits) exponent_s = ''.join(exponent_digits) # Basic syntax rules checking is_integer = not (saw_decimal_point or saw_exponent) if not units_s and not fraction_s: state.push_error('Bad number', number, position=start_position) self.recover_parser(state) return undefined if saw_decimal_point and not fraction_s: state.push_cond( self.options.trailing_decimal_point, 'Bad number, decimal point must be followed by at least one digit', number, position=start_position) fraction_s = '0' if saw_exponent and not exponent_s: state.push_error('Bad number, exponent is missing', number, position=start_position) self.recover_parser(state) return undefined if not units_s: state.push_cond( self.options.initial_decimal_point, 'Bad number, decimal point must be preceded by at least one digit', number, position=start_position) units = '0' elif len(units_s) > 1 and units_s[0] == '0': has_leading_zero = True if self.options.is_forbid_leading_zeros: state.push_cond( self.options.leading_zeros, 'Numbers may not have extra leading zeros', number, position=start_position) elif self.options.is_warn_leading_zeros: state.push_cond( self.options.leading_zeros, 'Numbers may not have leading zeros; interpreting as %s' \ % self.options.leading_zero_radix_as_word, number, position=start_position) # Estimate number of significant digits sigdigits = len( (units_s + fraction_s).replace('0',' ').strip() ) # Handle legacy octal integers. if has_leading_zero and is_integer and self.options.leading_zero_radix == 8: # ----- LEGACY-OCTAL 0123 try: ival = helpers.decode_octal( units_s ) except ValueError: state.push_error('Bad number, not a valid octal value', number, position=start_position) self.recover_parser(state) return self.options.nan # undefined state.update_integer_stats( ival, sign=sign, position=start_position ) n = state.options.make_int( ival, sign, number_format=NUMBER_FORMAT_LEGACYOCTAL ) return n # Determine the exponential part if exponent_s: try: exponent = int(exponent_s) except ValueError: state.push_error('Bad number, bad exponent', number, position=start_position) self.recover_parser(state) return undefined if esign == '-': exponent = - exponent else: exponent = 0 # Try to make an int/long first. if not saw_decimal_point and exponent >= 0: # ----- A DECIMAL INTEGER ival = int(units_s) if exponent != 0: ival *= 10**exponent state.update_integer_stats( ival, sign=sign, position=start_position ) n = state.options.make_int( ival, sign ) else: # ----- A FLOATING-POINT NUMBER try: if exponent < float_minexp or exponent > float_maxexp or sigdigits > float_sigdigits: n = state.options.make_decimal( number, sign ) else: n = state.options.make_float( number, sign ) except ValueError as err: state.push_error('Bad number, %s' % err.message, number, position=start_position) n = undefined else: state.update_float_stats( n, sign=sign, position=start_position ) return n def encode_number(self, n, state): """Encodes a Python numeric type into a JSON numeric literal. The special non-numeric values of float('nan'), float('inf') and float('-inf') are translated into appropriate JSON literals. Note that Python complex types are not handled, as there is no ECMAScript equivalent type. """ if isinstance(n, complex): if n.imag: raise JSONEncodeError('Can not encode a complex number that has a non-zero imaginary part',n) n = n.real if isinstance(n, json_int): state.append( n.json_format() ) return if isinstance(n, (int,long)): state.append( str(n) ) return if decimal and isinstance(n, decimal.Decimal): if n.is_nan(): # Could be 'NaN' or 'sNaN' state.append( 'NaN' ) elif n.is_infinite(): if n.is_signed(): state.append( '-Infinity' ) else: state.append( 'Infinity' ) else: s = str(n).lower() if 'e' not in s and '.' not in s: s = s + '.0' state.append( s ) return global nan, inf, neginf if n is nan: state.append( 'NaN' ) elif n is inf: state.append( 'Infinity' ) elif n is neginf: state.append( '-Infinity' ) elif isinstance(n, float): # Check for non-numbers. # In python nan == inf == -inf, so must use repr() to distinguish reprn = repr(n).lower() if ('inf' in reprn and '-' in reprn) or n == neginf: state.append( '-Infinity' ) elif 'inf' in reprn or n is inf: state.append( 'Infinity' ) elif 'nan' in reprn or n is nan: state.append( 'NaN' ) else: # A normal float. state.append( repr(n) ) else: raise TypeError('encode_number expected an integral, float, or decimal number type',type(n)) def decode_string(self, state): """Intermediate-level decoder for JSON string literals. Takes a string and a starting index, and returns a Python string (or unicode string) and the index of the next unparsed character. """ buf = state.buf self.skipws(state) quote = buf.peek() if quote == '"': pass elif quote == "'": state.push_cond( self.options.single_quoted_strings, 'String literals must use double quotation marks in strict JSON' ) else: state.push_error('String literal must be properly quoted') return undefined string_position = buf.position buf.skip() if self.options.is_forbid_js_string_escapes: escapes = self._escapes_json else: escapes = self._escapes_js ccallowed = not self.options.is_forbid_control_char_in_string chunks = [] _append = chunks.append # Used to track the last seen high-surrogate character high_surrogate = None highsur_position = None # Used to track if errors occured so we don't keep reporting multiples had_lineterm_error = False # Start looping character by character until the final quotation mark saw_final_quote = False should_stop = False while not saw_final_quote and not should_stop: if buf.at_end: state.push_error("String literal is not terminated", outer_position=string_position, context='String') break c = buf.peek() # Make sure a high surrogate is immediately followed by a low surrogate if high_surrogate: if 0xdc00 <= ord(c) <= 0xdfff: low_surrogate = buf.pop() try: uc = helpers.surrogate_pair_as_unicode( high_surrogate, low_surrogate ) except ValueError as err: state.push_error( 'Illegal Unicode surrogate pair', (high_surrogate, low_surrogate), position=highsur_position, outer_position=string_position, context='String') should_stop = state.should_stop uc = u'\ufffd' # replacement char _append( uc ) high_surrogate = None highsur_position = None continue # ==== NEXT CHAR elif buf.peekstr(2) != '\\u': state.push_error('High unicode surrogate must be followed by a low surrogate', position=highsur_position, outer_position=string_position, context='String') should_stop = state.should_stop _append( u'\ufffd' ) # replacement char high_surrogate = None highsur_position = None if c == quote: buf.skip() # skip over closing quote saw_final_quote = True break elif c == '\\': # Escaped character escape_position = buf.position buf.skip() # skip over backslash c = buf.peek() if not c: state.push_error('Escape in string literal is incomplete', position=escape_position, outer_position=string_position, context='String') should_stop = state.should_stop break elif helpers.is_octal_digit(c): # Handle octal escape codes first so special \0 doesn't kick in yet. # Follow Annex B.1.2 of ECMAScript standard. if '0' <= c <= '3': maxdigits = 3 else: maxdigits = 2 digits = buf.popwhile( helpers.is_octal_digit, maxchars=maxdigits ) n = helpers.decode_octal(digits) if n == 0: state.push_cond( self.options.zero_byte, 'Zero-byte character (U+0000) in string may not be universally safe', "\\"+digits, position=escape_position, outer_position=string_position, context='String') else: # n != 0 state.push_cond( self.options.octal_numbers, "JSON does not allow octal character escapes other than \"\\0\"", "\\"+digits, position=escape_position, outer_position=string_position, context='String') should_stop = state.should_stop if n < 128: _append( chr(n) ) else: _append( helpers.safe_unichr(n) ) elif escapes.has_key(c): buf.skip() _append( escapes[c] ) elif c == 'u' or c == 'x': buf.skip() esc_opener = '\\' + c esc_closer = '' if c == 'u': if buf.peek() == '{': buf.skip() esc_opener += '{' esc_closer = '}' maxdigits = None state.push_cond( self.options.extended_unicode_escapes, "JSON strings do not allow \\u{...} escapes", position=escape_position, outer_position=string_position, context='String') else: maxdigits = 4 else: # c== 'x' state.push_cond( self.options.js_string_escapes, "JSON strings may not use the \\x hex-escape", position=escape_position, outer_position=string_position, context='String') should_stop = state.should_stop maxdigits = 2 digits = buf.popwhile( helpers.is_hex_digit, maxchars=maxdigits ) if esc_closer: if buf.peek() != esc_closer: state.push_error( "Unicode escape sequence is missing closing \'%s\'" % esc_closer, esc_opener+digits, position=escape_position, outer_position=string_position, context='String') should_stop = state.should_stop else: buf.skip() esc_sequence = esc_opener + digits + esc_closer if not digits: state.push_error('numeric character escape sequence is truncated', esc_sequence, position=escape_position, outer_position=string_position, context='String') should_stop = state.should_stop codepoint = 0xfffd # replacement char else: if maxdigits and len(digits) != maxdigits: state.push_error('escape sequence has too few hexadecimal digits', esc_sequence, position=escape_position, outer_position=string_position, context='String') codepoint = helpers.decode_hex( digits ) if codepoint > 0x10FFFF: state.push_error( 'Unicode codepoint is beyond U+10FFFF', esc_opener+digits+esc_closer, position=escape_position, outer_position=string_position, context='String') codepoint = 0xfffd # replacement char if high_surrogate: # Decode surrogate pair and clear high surrogate low_surrogate = unichr(codepoint) try: uc = helpers.surrogate_pair_as_unicode( high_surrogate, low_surrogate ) except ValueError as err: state.push_error( 'Illegal Unicode surrogate pair', (high_surrogate, low_surrogate), position=highsur_position, outer_position=string_position, context='String') should_stop = state.should_stop uc = u'\ufffd' # replacement char _append( uc ) high_surrogate = None highsur_position = None elif codepoint < 128: # ASCII chars always go in as a str if codepoint==0: state.push_cond( self.options.zero_byte, 'Zero-byte character (U+0000) in string may not be universally safe', position=escape_position, outer_position=string_position, context='String') should_stop = state.should_stop _append( chr(codepoint) ) elif 0xd800 <= codepoint <= 0xdbff: # high surrogate high_surrogate = unichr(codepoint) # remember until we get to the low surrogate highsur_position = escape_position.copy() elif 0xdc00 <= codepoint <= 0xdfff: # low surrogate state.push_error('Low unicode surrogate must be proceeded by a high surrogate', position=escape_position, outer_position=string_position, context='String') should_stop = state.should_stop _append( u'\ufffd' ) # replacement char else: # Other chars go in as a unicode char _append( helpers.safe_unichr(codepoint) ) else: # Unknown escape sequence state.push_cond( self.options.nonescape_characters, 'String escape code is not allowed in strict JSON', '\\'+c, position=escape_position, outer_position=string_position, context='String') should_stop = state.should_stop _append( c ) buf.skip() elif ord(c) <= 0x1f: # A control character if ord(c) == 0: state.push_cond( self.options.zero_byte, 'Zero-byte character (U+0000) in string may not be universally safe', position=buf.position, outer_position=string_position, context='String') should_stop = state.should_stop if self.islineterm(c): if not had_lineterm_error: state.push_error('Line terminator characters must be escaped inside string literals', 'U+%04X'%ord(c), position=buf.position, outer_position=string_position, context='String') should_stop = state.should_stop had_lineterm_error = True _append( c ) buf.skip() elif ccallowed: _append( c ) buf.skip() else: state.push_error('Control characters must be escaped inside JSON string literals', 'U+%04X'%ord(c), position=buf.position, outer_position=string_position, context='String') should_stop = state.should_stop buf.skip() elif 0xd800 <= ord(c) <= 0xdbff: # a raw high surrogate high_surrogate = buf.pop() # remember until we get to the low surrogate highsur_position = buf.position.copy() else: # A normal character; not an escape sequence or end-quote. # Find a whole sequence of "safe" characters so we can append them # all at once rather than one a time, for speed. chunk = buf.popwhile( lambda c: c not in helpers.unsafe_string_chars and c != quote ) if not chunk: _append( c ) buf.skip() else: _append( chunk ) # Check proper string termination if high_surrogate: state.push_error('High unicode surrogate must be followed by a low surrogate', position=highsur_position, outer_position=string_position, context='String') _append( u'\ufffd' ) # replacement char high_surrogate = None highsur_position = None if not saw_final_quote: state.push_error('String literal is not terminated with a quotation mark', position=buf.position, outer_position=string_position, context='String') if state.should_stop: return undefined # Compose the python string and update stats s = ''.join( chunks ) state.update_string_stats( s, position=string_position ) # Call string hook if self.has_hook('decode_string'): try: s = self.call_hook( 'decode_string', s, position=string_position ) except JSONSkipHook: pass except JSONError, err: state.push_exception(err) s = undefined return s def encode_string(self, s, state): """Encodes a Python string into a JSON string literal. """ # Must handle instances of UserString specially in order to be # able to use ord() on it's simulated "characters". Also # convert Python2 'str' types to unicode strings first. import unicodedata, sys import UserString py2strenc = self.options.py2str_encoding if isinstance(s, UserString.UserString): def tochar(c): c2 = c.data if py2strenc and not isinstance(c2,unicode): return c2.decode( py2strenc ) else: return c2 elif py2strenc and not isinstance(s,unicode): s = s.decode( py2strenc ) tochar = None else: # Could use "lambda c:c", but that is too slow. So we set to None # and use an explicit if test inside the loop. tochar = None chunks = [] chunks.append('"') revesc = self._rev_escapes optrevesc = self._optional_rev_escapes asciiencodable = self._asciiencodable always_escape = state.options.always_escape_chars encunicode = state.escape_unicode_test i = 0 imax = len(s) while i < imax: if tochar: c = tochar(s[i]) else: c = s[i] cord = ord(c) if cord < 256 and asciiencodable[cord] and isinstance(encunicode, bool) \ and not (always_escape and c in always_escape): # Contiguous runs of plain old printable ASCII can be copied # directly to the JSON output without worry (unless the user # has supplied a custom is-encodable function). j = i i += 1 while i < imax: if tochar: c = tochar(s[i]) else: c = s[i] cord = ord(c) if cord < 256 and asciiencodable[cord] \ and not (always_escape and c in always_escape): i += 1 else: break chunks.append( unicode(s[j:i]) ) elif revesc.has_key(c): # Has a shortcut escape sequence, like "\n" chunks.append(revesc[c]) i += 1 elif cord <= 0x1F: # Always unicode escape ASCII-control characters chunks.append(r'\u%04x' % cord) i += 1 elif 0xD800 <= cord <= 0xDFFF: # A raw surrogate character! # This should ONLY happen in "narrow" Python builds # where (sys.maxunicode == 65535) as Python itself # uses UTF-16. But for "wide" Python builds, a raw # surrogate should never happen. handled_raw_surrogates = False if sys.maxunicode == 0xFFFF and 0xD800 <= cord <= 0xDBFF and (i+1) < imax: # In a NARROW Python, output surrogate pair as-is hsurrogate = cord i += 1 if tochar: c = tochar(s[i]) else: c = s[i] cord = ord(c) i += 1 if 0xDC00 <= cord <= 0xDFFF: lsurrogate = cord chunks.append(r'\u%04x\u%04x' % (hsurrogate,lsurrogate)) handled_raw_surrogates = True if not handled_raw_surrogates: cname = 'U+%04X' % cord raise JSONEncodeError('can not include or escape a Unicode surrogate character',cname) elif cord <= 0xFFFF: # Other BMP Unicode character if always_escape and c in always_escape: doesc = True elif unicodedata.category( c ) in ['Cc','Cf','Zl','Zp']: doesc = True elif callable(encunicode): doesc = encunicode( c ) else: doesc = encunicode if doesc: if optrevesc.has_key(c): chunks.append(optrevesc[c]) else: chunks.append(r'\u%04x' % cord) else: chunks.append( c ) i += 1 else: # ord(c) >= 0x10000 # Non-BMP Unicode if always_escape and c in always_escape: doesc = True elif unicodedata.category( c ) in ['Cc','Cf','Zl','Zp']: doesc = True elif callable(encunicode): doesc = encunicode( c ) else: doesc = encunicode if doesc: for surrogate in helpers.unicode_as_surrogate_pair(c): chunks.append(r'\u%04x' % ord(surrogate)) else: chunks.append( c ) i += 1 chunks.append('"') state.append( ''.join( chunks ) ) def decode_identifier(self, state, identifier_as_string=False): """Decodes an identifier/keyword. """ buf = state.buf self.skipws(state) start_position = buf.position obj = None kw = buf.pop_identifier() if not kw: state.push_error("Expected an identifier", position=start_position) elif kw == 'null': obj = None state.stats.num_nulls += 1 elif kw == 'true': obj = True state.stats.num_bools += 1 elif kw == 'false': obj = False state.stats.num_bools += 1 elif kw == 'undefined': state.push_cond( self.options.undefined_values, "Strict JSON does not allow the 'undefined' keyword", kw, position=start_position) obj = undefined state.stats.num_undefineds += 1 elif kw == 'NaN' or kw == 'Infinity': state.push_cond( self.options.non_numbers, "%s literals are not allowed in strict JSON" % kw, kw, position=start_position) if self.has_hook('decode_float'): try: val = self.call_hook( 'decode_float', kw, position=start_position ) except JSONSkipHook: pass except JSONError, err: state.push_exception(err) return undefined else: return val elif self.has_hook('decode_number'): try: val = self.call_hook( 'decode_number', kw, position=start_position ) except JSONSkipHook: pass except JSONError, err: state.push_exception(err) return undefined else: return val if kw == 'NaN': state.stats.num_nans += 1 obj = state.options.nan else: state.stats.num_infinities += 1 obj = state.options.inf else: # Convert unknown identifiers into strings if identifier_as_string: if kw in helpers.javascript_reserved_words: state.push_warning( "Identifier is a JavaScript reserved word", kw, position=start_position) state.push_cond( self.options.identifier_keys, "JSON does not allow identifiers to be used as strings", kw, position=start_position) state.stats.num_identifiers += 1 obj = self.decode_javascript_identifier( kw ) else: state.push_error("Unknown identifier", kw, position=start_position) obj = undefined state.stats.num_identifiers += 1 return obj def skip_comment(self, state): """Skips an ECMAScript comment, either // or /* style. The contents of the comment are returned as a string, as well as the index of the character immediately after the comment. """ buf = state.buf uniws = self.options.unicode_whitespace s = buf.peekstr(2) if s != '//' and s != '/*': return None state.push_cond( self.options.comments, 'Comments are not allowed in strict JSON' ) start_position = buf.position buf.skip(2) multiline = (s == '/*') saw_close = False while not buf.at_end: if multiline: if buf.peekstr(2) == '*/': buf.skip(2) saw_close = True break elif buf.peekstr(2) == '/*': state.push_error('Multiline /* */ comments may not nest', outer_position=start_position, context='Comment') else: if buf.at_eol( uniws ): buf.skip_to_next_line( uniws ) saw_close = True break buf.pop() if not saw_close and multiline: state.push_error('Comment was never terminated', outer_position=start_position, context='Comment') state.stats.num_comments += 1 def skipws_nocomments(self, state): """Skips whitespace (will not allow comments). """ return state.buf.skipws( not self.options.is_forbid_unicode_whitespace ) def skipws(self, state): """Skips all whitespace, including comments and unicode whitespace Takes a string and a starting index, and returns the index of the next non-whitespace character. If the 'skip_comments' behavior is True and not running in strict JSON mode, then comments will be skipped over just like whitespace. """ buf = state.buf uniws = not self.options.unicode_whitespace while not buf.at_end: c = buf.peekstr(2) if c == '/*' or c == '//': cmt = self.skip_comment( state ) elif buf.at_ws( uniws ): buf.skipws( uniws ) else: break def decode_composite(self, state): """Intermediate-level JSON decoder for composite literal types (array and object). """ if state.should_stop: return None buf = state.buf self.skipws(state) opener = buf.peek() if opener not in '{[': state.push_error('Composite data must start with "[" or "{"') return None start_position = buf.position buf.skip() if opener == '[': isdict = False closer = ']' obj = [] else: isdict = True closer = '}' if state.options.sort_keys == SORT_PRESERVE and _OrderedDict: obj = _OrderedDict() else: obj = {} num_items = 0 self.skipws(state) c = buf.peek() if c == closer: # empty composite buf.skip() done = True else: saw_value = False # set to false at beginning and after commas done = False while not done and not buf.at_end and not state.should_stop: self.skipws(state) c = buf.peek() if c == '': break # will report error futher down because done==False elif c == ',': if not saw_value: # no preceeding value, an elided (omitted) element if isdict: state.push_error('Can not omit elements of an object (dictionary)', outer_position=start_position, context='Object') else: state.push_cond( self.options.omitted_array_elements, 'Can not omit elements of an array (list)', outer_position=start_position, context='Array') obj.append( undefined ) if state.stats: state.stats.num_undefineds += 1 buf.skip() # skip over comma saw_value = False continue elif c == closer: if not saw_value: if isdict: state.push_cond( self.options.trailing_comma, 'Strict JSON does not allow a final comma in an object (dictionary) literal', outer_position=start_position, context='Object') else: state.push_cond( self.options.trailing_comma, 'Strict JSON does not allow a final comma in an array (list) literal', outer_position=start_position, context='Array') buf.skip() # skip over closer done = True break elif c in ']}': if isdict: cdesc='Object' else: cdesc='Array' state.push_error("Expected a '%c' but saw '%c'" % (closer,c), outer_position=start_position, context=cdesc) done = True break if state.should_stop: break # Decode the item/value value_position = buf.position if isdict: val = self.decodeobj(state, identifier_as_string=True) else: val = self.decodeobj(state, identifier_as_string=False) if val is syntax_error: recover_c = self.recover_parser(state) if recover_c not in ':': continue if state.should_stop: break if saw_value: # Two values without a separating comma if isdict: cdesc='Object' else: cdesc='Array' state.push_error('Values must be separated by a comma', position=value_position, outer_position=start_position, context=cdesc) saw_value = True self.skipws(state) if state.should_stop: break if isdict: skip_item = False key = val # Ref 11.1.5 key_position = value_position if not helpers.isstringtype(key): if helpers.isnumbertype(key): state.push_cond( self.options.nonstring_keys, 'JSON only permits string literals as object properties (keys)', position=key_position, outer_position=start_position, context='Object') else: state.push_error('Object properties (keys) must be string literals, numbers, or identifiers', position=key_position, outer_position=start_position, context='Object') skip_item = True c = buf.peek() if c != ':': state.push_error('Missing value for object property, expected ":"', position=value_position, outer_position=start_position, context='Object') buf.skip() # skip over colon self.skipws(state) rval = self.decodeobj(state) self.skipws(state) if not skip_item: if key in obj: state.push_cond( self.options.duplicate_keys, 'Object contains duplicate key', key, position=key_position, outer_position=start_position, context='Object') if key == '': state.push_cond( self.options.non_portable, 'Using an empty string "" as an object key may not be portable', position=key_position, outer_position=start_position, context='Object') obj[ key ] = rval num_items += 1 else: # islist obj.append( val ) num_items += 1 # end while if state.stats: if isdict: state.stats.max_items_in_object = max(state.stats.max_items_in_object, num_items) else: state.stats.max_items_in_array = max(state.stats.max_items_in_array, num_items) if state.should_stop: return obj # Make sure composite value is properly terminated if not done: if isdict: state.push_error('Object literal (dictionary) is not terminated', outer_position=start_position, context='Object') else: state.push_error('Array literal (list) is not terminated', outer_position=start_position, context='Array') # Update stats and run hooks if isdict: state.stats.num_objects += 1 if self.has_hook('decode_object'): try: obj = self.call_hook( 'decode_object', obj, position=start_position ) except JSONSkipHook: pass except JSONError, err: state.push_exception(err) obj = undefined else: state.stats.num_arrays += 1 if self.has_hook('decode_array'): try: obj = self.call_hook( 'decode_array', obj, position=start_position ) except JSONSkipHook: pass except JSONError, err: state.push_exception(err) obj = undefined return obj def decode_javascript_identifier(self, name): """Convert a JavaScript identifier into a Python string object. This method can be overriden by a subclass to redefine how JavaScript identifiers are turned into Python objects. By default this just converts them into strings. """ return name def decodeobj(self, state, identifier_as_string=False, at_document_start=False): """Intermediate-level JSON decoder. Takes a string and a starting index, and returns a two-tuple consting of a Python object and the index of the next unparsed character. If there is no value at all (empty string, etc), then None is returned instead of a tuple. """ buf = state.buf obj = None self.skipws(state) if buf.at_end: state.push_error('Unexpected end of input') c = buf.peek() if c in '{[': state.cur_depth += 1 try: state.update_depth_stats() obj = self.decode_composite(state) finally: state.cur_depth -= 1 else: if at_document_start: state.push_cond( self.options.any_type_at_start, 'JSON document must start with an object or array type only' ) if c in self._string_quotes: obj = self.decode_string(state) elif c.isdigit() or c in '.+-': obj = self.decode_number(state) elif c.isalpha() or c in'_$': obj = self.decode_identifier(state, identifier_as_string=identifier_as_string) else: state.push_error('Can not decode value starting with character %r' % c) buf.skip() self.recover_parser(state) obj = syntax_error return obj def decode(self, txt, encoding=None, return_errors=False, return_stats=False): """Decodes a JSON-encoded string into a Python object. The 'return_errors' parameter controls what happens if the input JSON has errors in it. * False: the first error will be raised as a Python exception. If there are no errors then the corresponding Python object will be returned. * True: the return value is always a 2-tuple: (object, error_list) """ import sys state = decode_state( options=self.options ) # Prepare the input state.set_input( txt, encoding=encoding ) # Do the decoding if not state.has_errors: self.__sanity_check_start( state ) if not state.has_errors: try: self._do_decode( state ) # DECODE! except JSONException, err: state.push_exception( err ) except Exception, err: # Mainly here to catch maximum recursion depth exceeded e2 = sys.exc_info() raise newerr = JSONDecodeError("An unexpected failure occured", severity='fatal', position=state.buf.position) newerr.__cause__ = err newerr.__traceback__ = e2[2] state.push_exception( newerr ) if return_stats and state.buf: state.stats.num_excess_whitespace = state.buf.num_ws_skipped state.stats.total_chars = state.buf.position.char_position # Handle the errors result_type = _namedtuple('json_results',['object','errors','stats']) if return_errors: if return_stats: return result_type(state.obj, state.errors, state.stats) else: return result_type(state.obj, state.errors, None) else: # Don't cause warnings to raise an error errors = [err for err in state.errors if err.severity in ('fatal','error')] if errors: raise errors[0] if return_stats: return result_type(state.obj, None, state.stats) else: return state.obj def __sanity_check_start(self, state): """Check that the document seems sane by looking at the first couple characters. Check that the decoding seems sane. Per RFC 4627 section 3: "Since the first two characters of a JSON text will always be ASCII characters [RFC0020], ..." [WAS removed from RFC 7158, but still valid via the grammar.] This check is probably not necessary, but it allows us to raise a suitably descriptive error rather than an obscure syntax error later on. Note that the RFC requirements of two ASCII characters seems to be an incorrect statement as a JSON string literal may have as it's first character any unicode character. Thus the first two characters will always be ASCII, unless the first character is a quotation mark. And in non-strict mode we can also have a few other characters too. """ is_sane = True unitxt = state.buf.peekstr(2) if len(unitxt) >= 2: first, second = unitxt[:2] if first in self._string_quotes: pass # second can be anything inside string literal else: if ((ord(first) < 0x20 or ord(first) > 0x7f) or \ (ord(second) < 0x20 or ord(second) > 0x7f)) and \ (not self.isws(first) and not self.isws(second)): # Found non-printable ascii, must check unicode # categories to see if the character is legal. # Only whitespace, line and paragraph separators, # and format control chars are legal here. import unicodedata catfirst = unicodedata.category(unicode(first)) catsecond = unicodedata.category(unicode(second)) if catfirst not in ('Zs','Zl','Zp','Cf') or \ catsecond not in ('Zs','Zl','Zp','Cf'): state.push_fatal( 'The input is gibberish, is the Unicode encoding correct?' ) return is_sane def _do_decode(self, state): """This is the internal function that does the JSON decoding. Called by the decode() method, after it has performed any Unicode decoding, etc. """ buf = state.buf self.skipws(state) if buf.at_end: state.push_error('No value to decode') else: if state.options.decimal_context: dec_ctx = decimal.localcontext( state.options.decimal_context ) else: dec_ctx = _dummy_context_manager with dec_ctx: state.obj = self.decodeobj(state, at_document_start=True ) if not state.should_stop: # Make sure there's nothing at the end self.skipws(state) if not buf.at_end: state.push_error('Unexpected text after end of JSON value') def _classify_for_encoding( self, obj ): import datetime c = 'other' if obj is None: c = 'null' elif obj is undefined: c = 'undefined' elif isinstance(obj,bool): c = 'bool' elif isinstance(obj, (int,long,float,complex)) or\ (decimal and isinstance(obj, decimal.Decimal)): c = 'number' elif isinstance(obj, basestring) or helpers.isstringtype(obj): c = 'string' else: if isinstance(obj,dict): c = 'dict' elif isinstance(obj,tuple) and hasattr(obj,'_asdict') and callable(obj._asdict): # Have a named tuple enc_nt = self.options.encode_namedtuple_as_object if enc_nt and (enc_nt is True or (callable(enc_nt) and enc_nt(obj))): c = 'namedtuple' else: c = 'sequence' elif isinstance(obj, (list,tuple,set,frozenset)): c = 'sequence' elif hasattr(obj,'iterkeys') or (hasattr(obj,'__getitem__') and hasattr(obj,'keys')): c = 'dict' elif isinstance(obj, datetime.datetime): # Check datetime before date because it is a subclass! c = 'datetime' elif isinstance(obj, datetime.date): c = 'date' elif isinstance(obj, datetime.time): c = 'time' elif isinstance(obj, datetime.timedelta): c = 'timedelta' elif _py_major >= 3 and isinstance(obj,(bytes,bytearray)): c = 'bytes' elif _py_major >= 3 and isinstance(obj,memoryview): c = 'memoryview' elif _enum is not None and isinstance(obj,_enum): c = 'enum' else: c = 'other' return c def encode(self, obj, encoding=None ): """Encodes the Python object into a JSON string representation. This method will first attempt to encode an object by seeing if it has a json_equivalent() method. If so than it will call that method and then recursively attempt to encode the object resulting from that call. Next it will attempt to determine if the object is a native type or acts like a squence or dictionary. If so it will encode that object directly. Finally, if no other strategy for encoding the object of that type exists, it will call the encode_default() method. That method currently raises an error, but it could be overridden by subclasses to provide a hook for extending the types which can be encoded. """ import sys, codecs # Make a fresh encoding state state = encode_state( self.options ) # Find the codec to use. CodecInfo will be in 'cdk' and name in 'encoding'. # # Also set the state's 'escape_unicode_test' property which is used to # determine what characters to \u-escape. if encoding is None: cdk = None elif isinstance(encoding, codecs.CodecInfo): cdk = encoding encoding = cdk.name else: cdk = helpers.lookup_codec( encoding ) if not cdk: raise JSONEncodeError('no codec available for character encoding',encoding) if self.options.escape_unicode and callable(self.options.escape_unicode): # User-supplied repertoire test function state.escape_unicode_test = self.options.escape_unicode else: if self.options.escape_unicode==True or not cdk or cdk.name.lower() == 'ascii': # ASCII, ISO8859-1, or and Unknown codec -- \u escape anything not ASCII state.escape_unicode_test = lambda c: ord(c) >= 0x80 elif cdk.name == 'iso8859-1': state.escape_unicode_test = lambda c: ord(c) >= 0x100 elif cdk and cdk.name.lower().startswith('utf'): # All UTF-x encodings can do the whole Unicode repertoire, so # do nothing special. state.escape_unicode_test = False else: # An unusual codec. We need to test every character # to see if it is in the codec's repertoire to determine # if we should \u escape that character. enc_func = cdk.encode def escape_unicode_hardway( c ): try: enc_func( c ) except UnicodeEncodeError: return True else: return False state.escape_unicode_test = escape_unicode_hardway # Make sure the encoding is not degenerate: it can encode the minimal # number of characters needed by the JSON syntax rules. if encoding is not None: try: output, nchars = cdk.encode( JSON.json_syntax_characters ) except UnicodeError, err: raise JSONEncodeError("Output encoding %s is not sufficient to encode JSON" % cdk.name) # Do the JSON encoding! self._do_encode( obj, state ) if not self.options.encode_compactly: state.append('\n') unitxt = state.combine() # Do the final Unicode encoding if encoding is None: output = unitxt else: try: output, nchars = cdk.encode( unitxt ) except UnicodeEncodeError, err: # Re-raise as a JSONDecodeError e2 = sys.exc_info() newerr = JSONEncodeError("a Unicode encoding error occurred") # Simulate Python 3's: "raise X from Y" exception chaining newerr.__cause__ = err newerr.__traceback__ = e2[2] raise newerr return output def _do_encode(self, obj, state): """Internal encode function.""" obj_classification = self._classify_for_encoding( obj ) if self.has_hook('encode_value'): orig_obj = obj try: obj = self.call_hook( 'encode_value', obj ) except JSONSkipHook: pass if obj is not orig_obj: prev_cls = obj_classification obj_classification = self._classify_for_encoding( obj ) if obj_classification != prev_cls: # Got a different type of object, re-encode again self._do_encode( obj, state ) return if hasattr(obj, 'json_equivalent'): success = self.encode_equivalent( obj, state ) if success: return if obj_classification == 'null': self.encode_null( state ) elif obj_classification == 'undefined': if not self.options.is_forbid_undefined_values: self.encode_undefined( state ) else: raise JSONEncodeError('strict JSON does not permit "undefined" values') elif obj_classification == 'bool': self.encode_boolean( obj, state ) elif obj_classification == 'number': try: self.encode_number( obj, state ) except JSONEncodeError, err1: # Bad number, probably a complex with non-zero imaginary part. # Let the default encoders take a shot at encoding. try: self.try_encode_default(obj, state) except Exception, err2: # Default handlers couldn't deal with it, re-raise original exception. raise err1 elif obj_classification == 'string': self.encode_string( obj, state ) elif obj_classification == 'enum': # Python 3.4 enum.Enum self.encode_enum( obj, state ) elif obj_classification == 'datetime': # Python datetime.datetime self.encode_datetime( obj, state ) elif obj_classification == 'date': # Python datetime.date self.encode_date( obj, state ) elif obj_classification == 'time': # Python datetime.time self.encode_time( obj, state ) elif obj_classification == 'timedelta': # Python datetime.time self.encode_timedelta( obj, state ) else: # Anything left is probably composite, or an unconvertable type. self.encode_composite( obj, state ) def encode_enum(self, val, state): """Encode a Python Enum value into JSON.""" eas = self.options.encode_enum_as if eas == 'qname': self.encode_string( str(obj), state ) elif eas == 'value': self._do_encode( obj.value, state ) else: # eas == 'name' self.encode_string( obj.name, state ) def encode_date(self, dt, state): fmt = self.options.date_format if not fmt or fmt == 'iso': fmt = '%Y-%m-%d' self.encode_string( dt.strftime(fmt), state ) def encode_datetime(self, dt, state): fmt = self.options.datetime_format is_iso = not fmt or fmt == 'iso' if is_iso: if dt.microsecond == 0: fmt = '%Y-%m-%dT%H:%M:%S%z' else: fmt = '%Y-%m-%dT%H:%M:%S.%f%z' s = dt.strftime(fmt) if is_iso and s.endswith('-00:00') or s.endswith('+00:00'): s = s[:-6] + 'Z' # Change UTC to use 'Z' notation self.encode_string( s, state ) def encode_time(self, t, state): fmt = self.options.datetime_format is_iso = not fmt or fmt == 'iso' if is_iso: if dt.microsecond == 0: fmt = 'T%H:%M:%S%z' else: fmt = 'T%H:%M:%S.%f%z' s = t.strftime(fmt) if is_iso and s.endswith('-00:00') or s.endswith('+00:00'): s = s[:-6] + 'Z' # Change UTC to use 'Z' notation self.encode_string( s, state ) def encode_timedelta(self, td, state): fmt = self.options.timedelta_format if not fmt or fmt == 'iso': s = helpers.format_timedelta_iso( td ) elif fmt == 'hms': s = str(td) else: raise ValueError("Unknown timedelta_format %r" % fmt) self.encode_string( s, state ) def encode_composite(self, obj, state, obj_classification=None): """Encodes just composite objects: dictionaries, lists, or sequences. Basically handles any python type for which iter() can create an iterator object. This method is not intended to be called directly. Use the encode() method instead. """ import sys if not obj_classification: obj_classification = self._classify_for_encoding(obj) # Convert namedtuples to dictionaries if obj_classification == 'namedtuple': obj = obj._asdict() obj_classification = 'dict' # Convert 'unsigned byte' memory views into plain bytes if obj_classification == 'memoryview' and obj.format == 'B': obj = obj.tobytes() obj_classification = 'bytes' # Run hooks hook_name = None if obj_classification == 'dict': hook_name = 'encode_dict' elif obj_classification == 'sequence': hook_name = 'encode_sequence' elif obj_classification == 'bytes': hook_name = 'encode_bytes' if self.has_hook(hook_name): try: new_obj = self.call_hook( hook_name, obj ) except JSONSkipHook: pass else: if new_obj is not obj: obj = new_obj prev_cls = obj_classification obj_classification = self._classify_for_encoding( obj ) if obj_classification != prev_cls: # Transformed to a different kind of object, call # back to the general encode() method. self._do_encode( obj, state ) return # Else, fall through # At his point we have decided to do with an object or an array isdict = (obj_classification == 'dict') # Get iterator it = None if isdict and hasattr(obj,'iterkeys'): try: it = obj.iterkeys() except AttributeError: pass else: try: it = iter(obj) except TypeError: pass # Convert each member to JSON if it is not None: # Try to get length, but don't fail if we can't try: numitems = len(obj) except TypeError: numitems = 0 # Output the opening bracket or brace compactly = self.options.encode_compactly if not compactly: indent0 = self.options.indentation_for_level( state.nest_level ) indent = self.options.indentation_for_level( state.nest_level+1 ) spaces_after_opener = '' if isdict: opener = '{' closer = '}' if compactly: dictcolon = ':' else: dictcolon = ' : ' else: opener = '[' closer = ']' if not compactly: #opener = opener + ' ' spaces_after_opener = self.options.spaces_to_next_indent_level(subtract=len(opener)) state.append( opener ) state.append( spaces_after_opener ) # Now iterate through all the items and collect their representations parts = [] # Collects each of the members part_keys = [] # For dictionary key sorting, tuples (key,index) try: # while not StopIteration part_idx = 0 while True: obj2 = it.next() part_idx += 1 # Note, will start counting at 1 if obj2 is obj: raise JSONEncodeError('trying to encode an infinite sequence',obj) if isdict: obj3 = obj[obj2] # Dictionary key is in obj2 and value in obj3. # Let any hooks transform the key. if self.has_hook('encode_value'): try: newobj = self.call_hook( 'encode_value', obj2 ) except JSONSkipHook: pass else: obj2 = newobj if self.has_hook('encode_dict_key'): try: newkey = self.call_hook( 'encode_dict_key', obj2 ) except JSONSkipHook: pass else: obj2 = newkey # Check JSON restrictions on key types if not helpers.isstringtype(obj2): if helpers.isnumbertype(obj2): if not self.options.is_allow_nonstring_keys: raise JSONEncodeError('object properties (dictionary keys) must be strings in strict JSON',obj2) else: raise JSONEncodeError('object properties (dictionary keys) can only be strings or numbers in ECMAScript',obj2) part_keys.append( (obj2, part_idx-1) ) # Encode this item in the sequence and put into item_chunks substate = state.make_substate() self._do_encode( obj2, substate ) if isdict: substate.append( dictcolon ) substate2 = substate.make_substate() self._do_encode( obj3, substate2 ) substate.join_substate( substate2 ) parts.append( substate ) # Next item iteration except StopIteration: pass # Sort dictionary keys if isdict: srt = self.options.sort_keys if srt == SORT_PRESERVE: if _OrderedDict and isinstance(obj,_OrderedDict): srt = SORT_NONE # Will keep order else: srt = SORT_SMART if not srt or srt in (SORT_NONE, SORT_PRESERVE): srt = None elif callable(srt): part_keys.sort( key=(lambda t: (srt(t[0]),t[0])) ) elif srt == SORT_SMART: part_keys.sort( key=(lambda t: (smart_sort_transform(t[0]),t[0])) ) elif srt == SORT_ALPHA_CI: part_keys.sort( key=(lambda t: (unicode(t[0]).upper(),t[0])) ) elif srt or srt == SORT_ALPHA: part_keys.sort( key=(lambda t: unicode(t[0])) ) # Now make parts match the new sort order if srt is not None: parts = [parts[pk[1]] for pk in part_keys] if compactly: sep = ',' elif len(parts) <= self.options.max_items_per_line: sep = ', ' else: #state.append(spaces_after_opener) state.append('\n' + indent) sep = ',\n' + indent for pnum, substate in enumerate(parts): if pnum > 0: state.append( sep ) state.join_substate( substate ) if not compactly: if numitems > self.options.max_items_per_line: state.append('\n' + indent0) else: state.append(' ') state.append(closer) # final '}' or ']' else: # Can't create an iterator for the object self.try_encode_default( obj, state ) def encode_equivalent( self, obj, state ): """This method is used to encode user-defined class objects. The object being encoded should have a json_equivalent() method defined which returns another equivalent object which is easily JSON-encoded. If the object in question has no json_equivalent() method available then None is returned instead of a string so that the encoding will attempt the next strategy. If a caller wishes to disable the calling of json_equivalent() methods, then subclass this class and override this method to just return None. """ if hasattr(obj, 'json_equivalent') \ and callable(getattr(obj,'json_equivalent')): obj2 = obj.json_equivalent() if obj2 is obj: # Try to prevent careless infinite recursion raise JSONEncodeError('object has a json_equivalent() method that returns itself',obj) self._do_encode( obj2, state ) return True else: return False def try_encode_default( self, obj, state ): orig_obj = obj if self.has_hook('encode_default'): try: obj = self.call_hook( 'encode_default', obj ) except JSONSkipHook: pass else: if obj is not orig_obj: # Hook made a transformation, re-encode it return self._do_encode( obj, state ) # End of the road. raise JSONEncodeError('can not encode object into a JSON representation',obj) # ------------------------------ def encode( obj, encoding=None, **kwargs ): r"""Encodes a Python object into a JSON-encoded string. * 'strict' (Boolean, default False) If 'strict' is set to True, then only strictly-conforming JSON output will be produced. Note that this means that some types of values may not be convertable and will result in a JSONEncodeError exception. * 'compactly' (Boolean, default True) If 'compactly' is set to True, then the resulting string will have all extraneous white space removed; if False then the string will be "pretty printed" with whitespace and indentation added to make it more readable. * 'encode_namedtuple_as_object' (Boolean or callable, default True) If True, then objects of type namedtuple, or subclasses of 'tuple' that have an _asdict() method, will be encoded as an object rather than an array. If can also be a predicate function that takes a namedtuple object as an argument and returns True or False. * 'indent_amount' (Integer, default 2) The number of spaces to output for each indentation level. If 'compactly' is True then indentation is ignored. * 'indent_limit' (Integer or None, default None) If not None, then this is the maximum limit of indentation levels, after which further indentation spaces are not inserted. If None, then there is no limit. CONCERNING CHARACTER ENCODING: The 'encoding' argument should be one of: * None - The return will be a Unicode string. * encoding_name - A string which is the name of a known encoding, such as 'UTF-8' or 'ascii'. * codec - A CodecInfo object, such as as found by codecs.lookup(). This allows you to use a custom codec as well as those built into Python. If an encoding is given (either by name or by codec), then the returned value will be a byte array (Python 3), or a 'str' string (Python 2); which represents the raw set of bytes. Otherwise, if encoding is None, then the returned value will be a Unicode string. The 'escape_unicode' argument is used to determine which characters in string literals must be \u escaped. Should be one of: * True -- All non-ASCII characters are always \u escaped. * False -- Try to insert actual Unicode characters if possible. * function -- A user-supplied function that accepts a single unicode character and returns True or False; where True means to \u escape that character. Regardless of escape_unicode, certain characters will always be \u escaped. Additionaly any characters not in the output encoding repertoire for the encoding codec will be \u escaped as well. """ # Do the JSON encoding j = JSON( **kwargs ) output = j.encode( obj, encoding ) return output def decode( txt, encoding=None, **kwargs ): """Decodes a JSON-encoded string into a Python object. == Optional arguments == * 'encoding' (string, default None) This argument provides a hint regarding the character encoding that the input text is assumed to be in (if it is not already a unicode string type). If set to None then autodetection of the encoding is attempted (see discussion above). Otherwise this argument should be the name of a registered codec (see the standard 'codecs' module). * 'strict' (Boolean, default False) If 'strict' is set to True, then those strings that are not entirely strictly conforming to JSON will result in a JSONDecodeError exception. * 'return_errors' (Boolean, default False) Controls the return value from this function. If False, then only the Python equivalent object is returned on success, or an error will be raised as an exception. If True then a 2-tuple is returned: (object, error_list). The error_list will be an empty list [] if the decoding was successful, otherwise it will be a list of all the errors encountered. Note that it is possible for an object to be returned even if errors were encountered. * 'return_stats' (Boolean, default False) Controls whether statistics about the decoded JSON document are returns (and instance of decode_statistics). If True, then the stats object will be added to the end of the tuple returned. If return_errors is also set then a 3-tuple is returned, otherwise a 2-tuple is returned. * 'write_errors' (Boolean OR File-like object, default False) Controls what to do with errors. - If False, then the first decoding error is raised as an exception. - If True, then errors will be printed out to sys.stderr. - If a File-like object, then errors will be printed to that file. The write_errors and return_errors arguments can be set independently. * 'filename_for_errors' (string or None) Provides a filename to be used when writting error messages. * 'allow_xxx', 'warn_xxx', and 'forbid_xxx' (Booleans) These arguments allow for fine-adjustments to be made to the 'strict' argument, by allowing or forbidding specific syntaxes. There are many of these arguments, named by replacing the "xxx" with any number of possible behavior names (See the JSON class for more details). Each of these will allow (or forbid) the specific behavior, after the evaluation of the 'strict' argument. For example, if strict=True then by also passing 'allow_comments=True' then comments will be allowed. If strict=False then forbid_comments=True will allow everything except comments. Unicode decoding: ----------------- The input string can be either a python string or a python unicode string (or a byte array in Python 3). If it is already a unicode string, then it is assumed that no character set decoding is required. However, if you pass in a non-Unicode text string (a Python 2 'str' type or a Python 3 'bytes' or 'bytearray') then an attempt will be made to auto-detect and decode the character encoding. This will be successful if the input was encoded in any of UTF-8, UTF-16 (BE or LE), or UTF-32 (BE or LE), and of course plain ASCII works too. Note though that if you know the character encoding, then you should convert to a unicode string yourself, or pass it the name of the 'encoding' to avoid the guessing made by the auto detection, as with python_object = demjson.decode( input_bytes, encoding='utf8' ) Callback hooks: --------------- You may supply callback hooks by using the hook name as the named argument, such as: decode_float=decimal.Decimal See the hooks documentation on the JSON.set_hook() method. """ import sys # Initialize the JSON object return_errors = False return_stats = False write_errors = False filename_for_errors = None write_stats = False kwargs = kwargs.copy() todel = [] for kw,val in kwargs.items(): if kw == "return_errors": return_errors = bool(val) todel.append(kw) elif kw == 'return_stats': return_stats = bool(val) todel.append(kw) elif kw == "write_errors": write_errors = val todel.append(kw) elif kw == "filename_for_errors": filename_for_errors = val todel.append(kw) elif kw == "write_stats": write_stats = val todel.append(kw) # next keyword argument for kw in todel: del kwargs[kw] j = JSON( **kwargs ) # Now do the actual JSON decoding result = j.decode( txt, encoding=encoding, return_errors=(return_errors or write_errors), return_stats=(return_stats or write_stats) ) if write_errors: import sys if write_errors is True: write_errors = sys.stderr for err in result.errors: write_errors.write( err.pretty_description(filename=filename_for_errors) + "\n" ) if write_stats: import sys if write_stats is True: write_stats = sys.stderr if result.stats: write_stats.write( "%s----- Begin JSON statistics\n" % filename_for_errors ) write_stats.write( result.stats.pretty_description( prefix=" | " ) ) write_stats.write( "%s----- End of JSON statistics\n" % filename_for_errors ) return result def encode_to_file( filename, obj, encoding='utf-8', overwrite=False, **kwargs ): """Encodes a Python object into JSON and writes into the given file. If no encoding is given, then UTF-8 will be used. See the encode() function for a description of other possible options. If the file already exists and the 'overwrite' option is not set to True, then the existing file will not be overwritten. (Note, there is a subtle race condition in the check so there are possible conditions in which a file may be overwritten) """ import os, errno if not encoding: encoding = 'utf-8' if not isinstance(filename,basestring) or not filename: raise TypeError("Expected a file name") if not overwrite and os.path.exists(filename): raise IOError(errno.EEXIST, "File exists: %r" % filename) jsondata = encode( obj, encoding=encoding, **kwargs ) try: fp = open(filename, 'wb') except Exception: raise else: try: fp.write( jsondata ) finally: fp.close() def decode_file( filename, encoding=None, **kwargs ): """Decodes JSON found in the given file. See the decode() function for a description of other possible options. """ if isinstance(filename,basestring): try: fp = open(filename, 'rb') except Exception: raise else: try: jsondata = fp.read() finally: fp.close() else: raise TypeError("Expected a file name") return decode( jsondata, encoding=encoding, **kwargs ) # ====================================================================== class jsonlint(object): """This class contains most of the logic for the "jsonlint" command. You generally create an instance of this class, to defined the program's environment, and then call the main() method. A simple wrapper to turn this into a script might be: import sys, demjson if __name__ == '__main__': lint = demjson.jsonlint( sys.argv[0] ) return lint.main( sys.argv[1:] ) """ _jsonlint_usage = r"""Usage: %(program_name)s [ ...] [--] inputfile.json ... With no input filename, or "-", it will read from standard input. The return status will be 0 if the file is conforming JSON (per the RFC 7159 specification), or non-zero otherwise. GENERAL OPTIONS: -v | --verbose Show details of lint checking -q | --quiet Don't show any output (except for reformatting) STRICTNESS OPTIONS (WARNINGS AND ERRORS): -W | --tolerant Be tolerant, but warn about non-conformance (default) -s | --strict Be strict in what is considered conforming JSON -S | --nonstrict Be tolerant in what is considered conforming JSON --allow=... -\ --warn=... |-- These options let you pick specific behaviors. --forbid=... -/ Use --help-behaviors for more STATISTICS OPTIONS: --stats Show statistics about JSON document REFORMATTING OPTIONS: -f | --format Reformat the JSON text (if conforming) to stdout -F | --format-compactly Reformat the JSON simlar to -f, but do so compactly by removing all unnecessary whitespace -o filename | --output filename The filename to which reformatted JSON is to be written. Without this option the standard output is used. --[no-]keep-format Try to preserve numeric radix, e.g., hex, octal, etc. --html-safe Escape characters that are not safe to embed in HTML/XML. --sort How to sort object/dictionary keys, is one of: %(sort_options_help)s --indent tabs | Number of spaces to use per indentation level, or use tab characters if "tabs" given. UNICODE OPTIONS: -e codec | --encoding=codec Set both input and output encodings --input-encoding=codec Set the input encoding --output-encoding=codec Set the output encoding These options set the character encoding codec (e.g., "ascii", "utf-8", "utf-16"). The -e will set both the input and output encodings to the same thing. The output encoding is used when reformatting with the -f or -F options. Unless set, the input encoding is guessed and the output encoding will be "utf-8". OTHER OPTIONS: --recursion-limit=nnn Set the Python recursion limit to number --leading-zero-radix=8|10 The radix to use for numbers with leading zeros. 8=octal, 10=decimal. REFORMATTING / PRETTY-PRINTING: When reformatting JSON with -f or -F, output is only produced if the input passed validation. By default the reformatted JSON will be written to standard output, unless the -o option was given. The default output codec is UTF-8, unless an encoding option is provided. Any Unicode characters will be output as literal characters if the encoding permits, otherwise they will be \u-escaped. You can use "--output-encoding ascii" to force all Unicode characters to be escaped. MORE INFORMATION: Use '%(program_name)s --version [-v]' to see versioning information. Use '%(program_name)s --copyright' to see author and copyright details. Use '%(program_name)s [-W|-s|-S] --help-behaviors' for help on specific checks. %(program_name)s is distributed as part of the "demjson" Python module. See %(homepage)s """ SUCCESS_FAIL = 'E' SUCCESS_WARNING = 'W' SUCCESS_OK = 'OK' def __init__(self, program_name='jsonlint', stdin=None, stdout=None, stderr=None ): """Create an instance of a "jsonlint" program. You can optionally pass options to define the program's environment: * program_name - the name of the program, usually sys.argv[0] * stdin - the file object to use for input, default sys.stdin * stdout - the file object to use for outut, default sys.stdout * stderr - the file object to use for error output, default sys.stderr After creating an instance, you typically call the main() method. """ import os, sys self.program_path = program_name self.program_name = os.path.basename(program_name) if stdin: self.stdin = stdin else: self.stdin = sys.stdin if stdout: self.stdout = stdout else: self.stdout = sys.stdout if stderr: self.stderr = stderr else: self.stderr = sys.stderr @property def usage(self): """A multi-line string containing the program usage instructions. """ sorthelp = '\n'.join([ " %12s - %s" % (sm, sd) for sm, sd in sorted(sorting_methods.items()) if sm != SORT_NONE ]) return self._jsonlint_usage % {'program_name':self.program_name, 'homepage':__homepage__, 'sort_options_help': sorthelp } def _lintcheck_data( self, jsondata, verbose_fp=None, reformat=False, show_stats=False, input_encoding=None, output_encoding=None, escape_unicode=True, pfx='', jsonopts=None ): global decode, encode success = self.SUCCESS_FAIL reformatted = None if show_stats: stats_fp = verbose_fp else: stats_fp = None try: results = decode( jsondata, encoding=input_encoding, return_errors=True, return_stats=True, write_errors=verbose_fp, write_stats=stats_fp, filename_for_errors=pfx, json_options=jsonopts ) except JSONError, err: success = self.SUCCESS_FAIL if verbose_fp: verbose_fp.write('%s%s\n' % (pfx, err.pretty_description()) ) except Exception, err: success = self.SUCCESS_FAIL if verbose_fp: verbose_fp.write('%s%s\n' % (pfx, str(err) )) else: errors = [err for err in results.errors if err.severity in ('fatal','error')] warnings = [err for err in results.errors if err.severity in ('warning',)] if errors: success = self.SUCCESS_FAIL elif warnings: success = self.SUCCESS_WARNING else: success = self.SUCCESS_OK if reformat: encopts = jsonopts.copy() encopts.strictness = STRICTNESS_TOLERANT if reformat == 'compactly': encopts.encode_compactly = True else: encopts.encode_compactly = False reformatted = encode(results.object, encoding=output_encoding, json_options=encopts) return (success, reformatted) def _lintcheck( self, filename, output_filename, verbose=False, reformat=False, show_stats=False, input_encoding=None, output_encoding=None, escape_unicode=True, jsonopts=None ): import sys verbose_fp = None if not filename or filename == "-": pfx = ': ' jsondata = self.stdin.read() if verbose: verbose_fp = self.stderr else: pfx = '%s: ' % filename try: fp = open( filename, 'rb' ) jsondata = fp.read() fp.close() except IOError, err: self.stderr.write('%s: %s\n' % (pfx, str(err)) ) return self.SUCCESS_FAIL if verbose: verbose_fp = self.stdout success, reformatted = self._lintcheck_data( jsondata, verbose_fp=verbose_fp, reformat=reformat, show_stats=show_stats, input_encoding=input_encoding, output_encoding=output_encoding, pfx=pfx, jsonopts=jsonopts ) if success != self.SUCCESS_FAIL and reformat: if output_filename: try: fp = open( output_filename, 'wb' ) fp.write( reformatted ) except IOError, err: self.stderr.write('%s: %s\n' % (pfx, str(err)) ) success = False else: if hasattr(sys.stdout,'buffer'): # To write binary data rather than strings self.stdout.buffer.write( reformatted ) else: self.stdout.write( reformatted ) elif success == self.SUCCESS_OK and verbose_fp: verbose_fp.write('%sok\n' % pfx) elif success == self.SUCCESS_WARNING and verbose_fp: verbose_fp.write('%sok, with warnings\n' % pfx) elif verbose_fp: verbose_fp.write("%shas errors\n" % pfx) return success def main( self, argv ): """The main routine for program "jsonlint". Should be called with sys.argv[1:] as its sole argument. Note sys.argv[0] which normally contains the program name should not be passed to main(); instead this class itself is initialized with sys.argv[0]. Use "--help" for usage syntax, or consult the 'usage' member. """ import sys, os, getopt, unicodedata recursion_limit = None success = True verbose = 'auto' # one of 'auto', True, or False reformat = False show_stats = False output_filename = None input_encoding = None output_encoding = 'utf-8' kwoptions = { # Will be used to initialize json_options "sort_keys": SORT_SMART, "strict": STRICTNESS_WARN, "keep_format": True, "decimal_context": 100, } try: opts, args = getopt.getopt( argv, 'vqfFe:o:sSW', ['verbose','quiet', 'format','format-compactly', 'stats', 'output', 'strict','nonstrict','warn', 'html-safe','xml-safe', 'encoding=', 'input-encoding=','output-encoding=', 'sort=', 'recursion-limit=', 'leading-zero-radix=', 'keep-format', 'no-keep-format', 'indent=', 'indent-amount=', 'indent-limit=', 'indent-tab-width=', 'max-items-per-line=', 'allow=', 'warn=', 'forbid=', 'deny=', 'help', 'help-behaviors', 'version','copyright'] ) except getopt.GetoptError, err: self.stderr.write( "Error: %s. Use \"%s --help\" for usage information.\n" \ % (err.msg, self.program_name) ) return 1 # Set verbose before looking at any other options for opt, val in opts: if opt in ('-v', '--verbose'): verbose=True # Process all options for opt, val in opts: if opt in ('-h', '--help'): self.stdout.write( self.usage ) return 0 elif opt == '--help-behaviors': self.stdout.write(""" BEHAVIOR OPTIONS: These set of options let you control which checks are to be performed. They may be turned on or off by listing them as arguments to one of the options --allow, --warn, or --forbid ; for example: %(program_name)s --allow comments,hex-numbers --forbid duplicate-keys """ % {"program_name":self.program_name}) self.stdout.write("The default shown is for %s mode\n\n" % kwoptions['strict']) self.stdout.write('%-7s %-25s %s\n' % ("Default", "Behavior_name", "Description")) self.stdout.write('-'*7 + ' ' + '-'*25 + ' ' + '-'*50 + '\n') j = json_options( **kwoptions ) for behavior in sorted(j.all_behaviors): v = j.get_behavior( behavior ) desc = j.describe_behavior( behavior ) self.stdout.write('%-7s %-25s %s\n' % (v.lower(), behavior.replace('_','-'), desc)) return 0 elif opt == '--version': self.stdout.write( '%s (%s) version %s (%s)\n' \ % (self.program_name, __name__, __version__, __date__) ) if verbose == True: self.stdout.write( 'demjson from %r\n' % (__file__,) ) if verbose == True: self.stdout.write( 'Python version: %s\n' % (sys.version.replace('\n',' '),) ) self.stdout.write( 'This python implementation supports:\n' ) self.stdout.write( ' * Max unicode: U+%X\n' % (sys.maxunicode,) ) self.stdout.write( ' * Unicode version: %s\n' % (unicodedata.unidata_version,) ) self.stdout.write( ' * Floating-point significant digits: %d\n' % (float_sigdigits,) ) self.stdout.write( ' * Floating-point max 10^exponent: %d\n' % (float_maxexp,) ) if str(0.0)==str(-0.0): szero = 'No' else: szero = 'Yes' self.stdout.write( ' * Floating-point has signed-zeros: %s\n' % (szero,) ) if decimal: has_dec = 'Yes' else: has_dec = 'No' self.stdout.write( ' * Decimal (bigfloat) support: %s\n' % (has_dec,) ) return 0 elif opt == '--copyright': self.stdout.write( "%s is distributed as part of the \"demjson\" python package.\n" \ % (self.program_name,) ) self.stdout.write( "See %s\n\n\n" % (__homepage__,) ) self.stdout.write( __credits__ ) return 0 elif opt in ('-v', '--verbose'): verbose = True elif opt in ('-q', '--quiet'): verbose = False elif opt in ('-s', '--strict'): kwoptions['strict'] = STRICTNESS_STRICT kwoptions['keep_format'] = False elif opt in ('-S', '--nonstrict'): kwoptions['strict'] = STRICTNESS_TOLERANT elif opt in ('-W', '--tolerant'): kwoptions['strict'] = STRICTNESS_WARN elif opt in ('-f', '--format'): reformat = True kwoptions['encode_compactly'] = False elif opt in ('-F', '--format-compactly'): kwoptions['encode_compactly'] = True reformat = 'compactly' elif opt in ('--stats',): show_stats=True elif opt in ('-o', '--output'): output_filename = val elif opt in ('-e','--encoding'): input_encoding = val output_encoding = val escape_unicode = False elif opt in ('--output-encoding'): output_encoding = val escape_unicode = False elif opt in ('--input-encoding'): input_encoding = val elif opt in ('--html-safe','--xml-safe'): kwoptions['html_safe'] = True elif opt in ('--allow','--warn','--forbid'): action = opt[2:] if action in kwoptions: kwoptions[action] += "," + val else: kwoptions[action] = val elif opt in ('--keep-format',): kwoptions['keep_format']=True elif opt in ('--no-keep-format',): kwoptions['keep_format']=False elif opt == '--leading-zero-radix': kwoptions['leading_zero_radix'] = val elif opt in ('--indent', '--indent-amount'): if val in ('tab','tabs'): kwoptions['indent_amount'] = 8 kwoptions['indent_tab_width'] = 8 else: try: kwoptions['indent_amount'] = int(val) except ValueError: self.stderr.write("Indentation amount must be a number\n") return 1 elif opt == 'indent-tab-width': try: kwoptions['indent_tab_width'] = int(val) except ValueError: self.stderr.write("Indentation tab width must be a number\n") return 1 elif opt == '--max-items-per-line': try: kwoptions['max_items_per_line'] = int(val) except ValueError: self.stderr.write("Max items per line must be a number\n") return 1 elif opt == '--sort': val = val.lower() if val == 'alpha': kwoptions['sort_keys'] = SORT_ALPHA elif val == 'alpha_ci': kwoptions['sort_keys'] = SORT_ALPHA_CI elif val == 'preserve': kwoptions['sort_keys'] = SORT_PRESERVE else: kwoptions['sort_keys'] = SORT_SMART elif opt == '--recursion-limit': try: recursion_limit = int(val) except ValueError: self.stderr.write("Recursion limit must be a number: %r\n" % val) return 1 else: max_limit = 100000 old_limit = sys.getrecursionlimit() if recursion_limit > max_limit: self.stderr.write("Recursion limit must be a number between %d and %d\n" % (old_limit,max_limit)) return 1 elif recursion_limit > old_limit: sys.setrecursionlimit( recursion_limit ) else: self.stderr.write('Unknown option %r\n' % opt) return 1 # Make the JSON options kwoptions['decimal_context'] = 100 jsonopts = json_options( **kwoptions ) # Now decode each file... if not args: args = [None] for fn in args: try: rc = self._lintcheck( fn, output_filename=output_filename, verbose=verbose, reformat=reformat, show_stats=show_stats, input_encoding=input_encoding, output_encoding=output_encoding, jsonopts=jsonopts ) if rc != self.SUCCESS_OK: # Warnings or errors should result in failure. If # checking multiple files, do not change a # previous error back to ok. success = False except KeyboardInterrupt, err: sys.stderr.write("\njsonlint interrupted!\n") sys.exit(1) if not success: return 1 return 0 # end file