import sys
from unittest import TestCase

import simplejson as json
import simplejson.decoder
from simplejson.compat import b, PY3

class TestScanString(TestCase):
    # The bytes type is intentionally not used in most of these tests
    # under Python 3 because the decoder immediately coerces to str before
    # calling scanstring. In Python 2 we are testing the code paths
    # for both unicode and str.
    #
    # This is done because Python 3 would require entirely different
    # code paths for parsing bytes and str.
    #
    def test_py_scanstring(self):
        # Run the shared scanstring cases against the pure-Python scanner.
        pure_python_scanner = simplejson.decoder.py_scanstring
        self._test_scanstring(pure_python_scanner)

    def test_c_scanstring(self):
        # Run the shared scanstring cases against the C scanner.
        #
        # When simplejson.decoder.c_scanstring is falsy the test is reported
        # as skipped, so a run that never exercised the C scanner is not
        # counted as a pass.
        c_scanstring = simplejson.decoder.c_scanstring
        if not c_scanstring:
            self.skipTest('simplejson.decoder.c_scanstring is not available')
        self._test_scanstring(c_scanstring)

        # Scanning a native str must yield a native str.
        self.assertIsInstance(c_scanstring('""', 0)[0], str)

    def _test_scanstring(self, scanstring):
        # Shared checks run against one scanstring implementation, which is
        # always called here as scanstring(s, end, encoding, strict). A
        # successful scan is expected to return the tuple
        # (decoded text, index just past the closing quote).

        # When sys.maxunicode == 65535 the astral character occupies two
        # code units, which moves the closing quote (and therefore the
        # returned index) one position further along.
        if sys.maxunicode == 65535:
            astral_end = 6
        else:
            astral_end = 5
        self.assertEqual(
            scanstring(u'"z\U0001d120x"', 1, None, True),
            (u'z\U0001d120x', astral_end))

        # Each case: (document, index just after the opening quote,
        # expected text, expected index just after the closing quote).
        # Only the quoted string is scanned, so the otherwise invalid JSON
        # surrounding it does not affect the result.
        quoted_cases = [
            ('"\\u007b"', 1, u'{', 8),
            ('"A JSON payload should be an object or array, not a string."',
             1,
             u'A JSON payload should be an object or array, not a string.',
             60),
            ('["Unclosed array"', 2, u'Unclosed array', 17),
            ('["extra comma",]', 2, u'extra comma', 14),
            ('["double extra comma",,]', 2, u'double extra comma', 21),
            ('["Comma after the close"],', 2, u'Comma after the close', 24),
            ('["Extra close"]]', 2, u'Extra close', 14),
            ('{"Extra comma": true,}', 2, u'Extra comma', 14),
            ('{"Extra value after close": true} "misplaced quoted value"',
             2, u'Extra value after close', 26),
            ('{"Illegal expression": 1 + 2}', 2, u'Illegal expression', 21),
            ('{"Illegal invocation": alert()}', 2,
             u'Illegal invocation', 21),
            ('{"Numbers cannot have leading zeroes": 013}', 2,
             u'Numbers cannot have leading zeroes', 37),
            ('{"Numbers cannot be hex": 0x14}', 2,
             u'Numbers cannot be hex', 24),
            ('[[[[[[[[[[[[[[[[[[[["Too deep"]]]]]]]]]]]]]]]]]]]]', 21,
             u'Too deep', 30),
            ('{"Missing colon" null}', 2, u'Missing colon', 16),
            ('{"Double colon":: null}', 2, u'Double colon', 15),
            ('{"Comma instead of colon", null}', 2,
             u'Comma instead of colon', 25),
            ('["Colon instead of comma": false]', 2,
             u'Colon instead of comma', 25),
            ('["Bad value", truth]', 2, u'Bad value', 12),
        ]
        for document, start, text, stop in quoted_cases:
            self.assertEqual(
                scanstring(document, start, None, True),
                (text, stop))

        # Every control character U+0000..U+001F must be accepted verbatim
        # when strict is false and rejected when strict is true. range()
        # excludes its stop value, so the stop has to be 0x20 for U+001F to
        # be covered.
        for c in map(chr, range(0x00, 0x20)):
            self.assertEqual(
                scanstring(c + '"', 0, None, False),
                (c, 2))
            self.assertRaises(
                ValueError,
                scanstring, c + '"', 0, None, True)

        # Input that ends before a closing quote, including each truncation
        # of a \uXXXX escape, must be rejected.
        for truncated in ('', 'a', '\\', '\\u', '\\u0', '\\u01', '\\u012',
                          '\\u0123'):
            self.assertRaises(ValueError, scanstring, truncated, 0, None, True)

        if sys.maxunicode > 65535:
            # A high-surrogate escape followed by a malformed second escape.
            self.assertRaises(ValueError,
                              scanstring, '\\ud834\\u"', 0, None, True)
            self.assertRaises(ValueError,
                              scanstring, '\\ud834\\x0123"', 0, None, True)

        # \u followed by four characters that are not all hex digits.
        self.assertRaises(json.JSONDecodeError, scanstring, '\\u-123"', 0, None, True)
        # SJ-PT-23-01: Invalid Handling of Broken Unicode Escape Sequences
        self.assertRaises(json.JSONDecodeError, scanstring, '\\u EDD"', 0, None, True)

    def test_issue3623(self):
        # Two calls that must fail with a specific exception type.
        with self.assertRaises(ValueError):
            json.decoder.scanstring("xxx", 1, "xxx")

        payload = b("xx\xff")
        with self.assertRaises(UnicodeDecodeError):
            json.encoder.encode_basestring_ascii(payload)

    def test_overflow(self):
        # An end index one past the interpreter's largest native integer
        # must raise OverflowError.
        #
        # sys.maxsize is missing on Python 2.5 and sys.maxint is missing on
        # Python 3, so prefer maxsize and fall back to maxint.
        largest = getattr(sys, 'maxint', None)
        largest = getattr(sys, 'maxsize', largest)
        assert largest is not None
        too_big = largest + 1
        self.assertRaises(
            OverflowError, json.decoder.scanstring, "xxx", too_big)

    def test_end_out_of_bounds_is_jsondecodeerror(self):
        # Regression: for an out-of-range end index the C scanstring used
        # to raise a plain ValueError while py_scanstring raised
        # JSONDecodeError, so user code written as
        # `except JSONDecodeError:` missed the C path. Both backends are
        # now expected to raise JSONDecodeError carrying the
        # "Unterminated string starting at" message with pos = end - 1.
        out_of_range = [
            (u'"abc"', 100),
            (u'abc', 100),
            (u'', 100),
            (u'abc', -1),
            (u'', -1),
        ]
        for text, end in out_of_range:
            expected_pos = end - 1
            with self.assertRaises(json.JSONDecodeError) as caught:
                json.decoder.scanstring(text, end, None, True)
            err = caught.exception
            self.assertEqual(
                err.pos, expected_pos,
                'scanstring(%r, %r) pos=%r, expected %r' %
                (text, end, err.pos, expected_pos))
            self.assertIn('Unterminated string', str(err))

    def test_surrogates(self):
        # Surrogate handling in json.decoder.scanstring: escaped and literal
        # surrogates, alone and in pairs.
        scan = json.decoder.scanstring

        def assertScan(given, expect, test_utf8=True):
            # Scan `given` from just after its opening quote and require
            # that the whole input is consumed and decodes to `expect`.
            # When not running on Python 3, the UTF-8 encoded form of
            # `given` is checked as well unless test_utf8 is false.
            inputs = [given]
            if test_utf8 and not PY3:
                inputs.append(given.encode('utf8'))
            for candidate in inputs:
                decoded, stop = scan(candidate, 1, None, True)
                self.assertEqual(len(candidate), stop)
                self.assertEqual(decoded, expect)

        # (input, expected decoded text)
        pairs = [
            (u'"z\\ud834\\u0079x"', u'z\ud834yx'),
            (u'"z\\ud834\\udd20x"', u'z\U0001d120x'),
            (u'"z\\ud834\\ud834\\udd20x"', u'z\ud834\U0001d120x'),
            (u'"z\\ud834x"', u'z\ud834x'),
            (u'"z\\udd20x"', u'z\udd20x'),
            (u'"z\ud834x"', u'z\ud834x'),
            # The expected values below are assembled with join() rather
            # than written as a single literal; see
            # https://gist.github.com/etrepum/5538443 for the interpreter
            # quirk this works around.
            (u'"z\\ud834\udd20x12345"',
             u''.join([u'z\ud834', u'\udd20x12345'])),
            (u'"z\ud834\\udd20x"',
             u''.join([u'z\ud834', u'\udd20x'])),
        ]
        for source, wanted in pairs:
            assertScan(source, wanted)

        # This input behaves differently when given as UTF-8, because the
        # surrogate pair may be joined (in maxunicode > 65535 builds), so
        # the encoded form is not checked.
        assertScan(
            u''.join([u'"z\ud834', u'\udd20x"']),
            u''.join([u'z\ud834', u'\udd20x']),
            test_utf8=False)

        # Escapes with a non-hex character in the last position are errors.
        for malformed in (u'"z\\ud83x"', u'"z\\ud834\\udd2x"'):
            self.assertRaises(ValueError, scan, malformed, 1, None, True)

    def test_escape_error_parity(self):
        # Regression: the C scanstring bounds check was `end >= len` / the
        # surrogate-pair bounds check was `end + 6 < len`. Both were
        # off-by-one, causing C to raise "Invalid \\uXXXX escape sequence"
        # where pure-Python correctly raised "Unterminated string starting
        # at" when a \\uXXXX escape used the last bytes of the buffer. The
        # error-position offset also differed: C reported the position of
        # the 'u' while Python reported the position of the leading '\'.
        # This test asserts exact parity (exception class, position, and
        # message prefix) across a matrix of edge cases.
        #
        # Parity cannot be checked without the C scanner, so report a skip
        # (rather than a silent pass) when it is unavailable.
        c_scanstring = simplejson.decoder.c_scanstring
        if c_scanstring is None:
            self.skipTest('simplejson.decoder.c_scanstring is not available')
        py_scanstring = simplejson.decoder.py_scanstring

        def get_exc(scanstring, s):
            # Scan s from index 0 in strict mode. Return
            # (pos, message text before the first ':') for the
            # JSONDecodeError raised, or None when the scan succeeds.
            try:
                scanstring(s, 0, None, True)
            except json.JSONDecodeError as e:
                return (e.pos, str(e).split(':')[0])
            return None

        # Each case: (input, (expected_pos, expected_message_prefix)).
        # UNTERMINATED is the 'Unterminated string starting at' error,
        # expected at pos -1 because every scan here starts at end=0;
        # INVALID(pos) is the positional 'Invalid \\uXXXX escape sequence'
        # error.
        UNTERMINATED = (-1, 'Unterminated string starting at')
        def INVALID(pos):
            return (pos, 'Invalid \\uXXXX escape sequence')

        cases = [
            # Not enough room for 4 hex digits after \u.
            (u'\\u', INVALID(0)),
            (u'\\u0', INVALID(0)),
            (u'\\u01', INVALID(0)),
            (u'\\u012', INVALID(0)),
            # 4 non-hex chars after \u: C used to raise at the 'u'.
            (u'\\uXXXX', INVALID(0)),
            # Exactly 4 hex digits at buffer end: C used to mis-report
            # 'Invalid \\uXXXX escape' instead of 'Unterminated string'.
            (u'\\u0123', UNTERMINATED),
            # Lone high surrogate with no room for a second escape.
            (u'\\ud834', UNTERMINATED),
            # High surrogate followed by a truncated second escape.
            (u'\\ud834\\u', INVALID(6)),
            (u'\\ud834\\ux', INVALID(6)),
            (u'\\ud834\\udd2', INVALID(6)),
            (u'\\ud834\\udd2x', INVALID(6)),
            # High surrogate followed by a valid low surrogate that ends
            # exactly at the buffer edge: must combine before the outer
            # loop reports an unterminated string.
            (u'\\ud834\\udd1e', UNTERMINATED),
            (u'prefix\\ud834\\udd1e', UNTERMINATED),
        ]
        for s, expected in cases:
            py = get_exc(py_scanstring, s)
            c = get_exc(c_scanstring, s)
            self.assertEqual(py, expected,
                             'py_scanstring(%r) expected %r, got %r' %
                             (s, expected, py))
            self.assertEqual(c, expected,
                             'c_scanstring(%r) expected %r, got %r' %
                             (s, expected, c))

        # Success paths: valid escape or surrogate pair ending at the
        # closing quote must still parse correctly.
        for scanstring in (py_scanstring, c_scanstring):
            self.assertEqual(
                scanstring(u'\\u0123"', 0, None, True),
                (u'\u0123', 7))
            self.assertEqual(
                scanstring(u'\\ud834\\udd1e"', 0, None, True),
                (u'\U0001d11e', 13))