From: Serhiy Storchaka <storchaka@gmail.com>
Date: Thu, 7 Feb 2013 14:25:25 +0000 (+0200)
Subject: Issue #17043: The unicode-internal decoder no longer reads past the end of
X-Git-Tag: v3.3.1rc1~213
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=03ee12ed7251b6b251d55d708a22616ed2538b19;p=python

Issue #17043: The unicode-internal decoder no longer reads past the end of
the input buffer.
---

03ee12ed7251b6b251d55d708a22616ed2538b19
diff --cc Misc/NEWS
index b63511c4ef,c715170df2..9491614912
--- a/Misc/NEWS
+++ b/Misc/NEWS
@@@ -12,9 -10,9 +12,12 @@@ What's New in Python 3.3.1
  Core and Builtins
  -----------------
  
+ - Issue #17043: The unicode-internal decoder no longer read past the end of
+   input buffer.
+ 
 +- Issue #17098: All modules now have __loader__ set even if they pre-exist the
 +  bootstrapping of importlib.
 +
  - Issue #16979: Fix error handling bugs in the unicode-escape-decode decoder.
  
  - Issue #10156: In the interpreter's initialization phase, unicode globals
diff --cc Objects/unicodeobject.c
index e8459138a8,cd4e9e9295..abe793dfd4
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@@ -6101,65 -4392,38 +6101,61 @@@ _PyUnicode_DecodeUnicodeInternal(const 
      end = s + size;
  
      while (s < end) {
 -        if (end-s < Py_UNICODE_SIZE) {
 +        Py_UNICODE uch;
 +        Py_UCS4 ch;
++        if (end - s < Py_UNICODE_SIZE) {
+             endinpos = end-starts;
+             reason = "truncated input";
+             goto error;
+         }
 -        memcpy(p, s, sizeof(Py_UNICODE));
 +        /* We copy the raw representation one byte at a time because the
 +           pointer may be unaligned (see test_codeccallbacks). */
 +        ((char *) &uch)[0] = s[0];
 +        ((char *) &uch)[1] = s[1];
 +#ifdef Py_UNICODE_WIDE
 +        ((char *) &uch)[2] = s[2];
 +        ((char *) &uch)[3] = s[3];
 +#endif
 +        ch = uch;
- 
+ #ifdef Py_UNICODE_WIDE
          /* We have to sanity check the raw data, otherwise doom looms for
             some malformed UCS-4 data. */
-         if (
- #ifdef Py_UNICODE_WIDE
-             ch > 0x10ffff ||
- #endif
-             end-s < Py_UNICODE_SIZE
-             )
-         {
-             startinpos = s - starts;
-             if (end-s < Py_UNICODE_SIZE) {
-                 endinpos = end-starts;
-                 reason = "truncated input";
-             }
-             else {
-                 endinpos = s - starts + Py_UNICODE_SIZE;
-                 reason = "illegal code point (> 0x10FFFF)";
-             }
-             if (unicode_decode_call_errorhandler(
-                     errors, &errorHandler,
-                     "unicode_internal", reason,
-                     &starts, &end, &startinpos, &endinpos, &exc, &s,
-                     &v, &outpos))
-                 goto onError;
-             continue;
 -        if (*p > unimax || *p < 0) {
++        if (ch > 0x10ffff) {
+             endinpos = s - starts + Py_UNICODE_SIZE;
+             reason = "illegal code point (> 0x10FFFF)";
+             goto error;
          }
- 
+ #endif
 -        p++;
          s += Py_UNICODE_SIZE;
 +#ifndef Py_UNICODE_WIDE
-         if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && s < end)
++        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
 +        {
 +            Py_UNICODE uch2;
 +            ((char *) &uch2)[0] = s[0];
 +            ((char *) &uch2)[1] = s[1];
 +            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
 +            {
 +                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
 +                s += Py_UNICODE_SIZE;
 +            }
 +        }
 +#endif
 +
 +        if (unicode_putchar(&v, &outpos, ch) < 0)
 +            goto onError;
+         continue;
+ 
+   error:
+         startinpos = s - starts;
 -        outpos = p - PyUnicode_AS_UNICODE(v);
+         if (unicode_decode_call_errorhandler(
+                 errors, &errorHandler,
+                 "unicode_internal", reason,
+                 &starts, &end, &startinpos, &endinpos, &exc, &s,
 -                &v, &outpos, &p)) {
++                &v, &outpos))
+             goto onError;
 -        }
      }
  
 -    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
 +    if (unicode_resize(&v, outpos) < 0)
          goto onError;
      Py_XDECREF(errorHandler);
      Py_XDECREF(exc);