SF patch #438013 Remove 2-byte Py_UCS2 assumptions

author Tim Peters <tim.peters@gmail.com>

Thu, 9 Aug 2001 22:21:55 +0000 (22:21 +0000)

committer Tim Peters <tim.peters@gmail.com>

Thu, 9 Aug 2001 22:21:55 +0000 (22:21 +0000)
author Tim Peters <tim.peters@gmail.com>
Thu, 9 Aug 2001 22:21:55 +0000 (22:21 +0000)
committer Tim Peters <tim.peters@gmail.com>
Thu, 9 Aug 2001 22:21:55 +0000 (22:21 +0000)
diff --git a/Include/unicodeobject.h b/Include/unicodeobject.h

index 04c5b3e2d366c1442763318a93b3c95cc2376991..a7e50c3ce86a3aa8130ffcd29da2bdce1303e232 100644 (file)
--- a/Include/unicodeobject.h
+++ b/Include/unicodeobject.h
@@ -121,12 +121,6 @@ typedef unsigned int Py_UCS4;
  typedef unsigned long Py_UCS4; 
  #endif
  
-#if SIZEOF_SHORT == 2
-typedef unsigned short Py_UCS2;
-#else
-#error Cannot find a two-byte type
-#endif 
-
  typedef PY_UNICODE_TYPE Py_UNICODE;
  
  /* --- UCS-2/UCS-4 Name Mangling ------------------------------------------ */
diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c

index 1319c7c52ad3ab13dc6659de5ee605e34f1ff36a..8bd1287124c9b99b9cada71f02548f0c2b338245 100644 (file)
--- a/Objects/unicodeobject.c
+++ b/Objects/unicodeobject.c
@@ -944,8 +944,7 @@ PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
  /* --- UTF-16 Codec ------------------------------------------------------- */
  
  static
-int utf16_decoding_error(const Py_UCS2 **source,
-                        Py_UNICODE **dest,
+int utf16_decoding_error(Py_UNICODE **dest,
                          const char *errors,
                          const char *details) 
  {
@@ -975,23 +974,29 @@ int utf16_decoding_error(const Py_UCS2 **source,
      }
  }
  
-PyObject *PyUnicode_DecodeUTF16(const char *s,
-                               int size,
-                               const char *errors,
-                               int *byteorder)
+PyObject *
+PyUnicode_DecodeUTF16(const char *s,
+                     int size,
+                     const char *errors,
+                     int *byteorder)
  {
      PyUnicodeObject *unicode;
      Py_UNICODE *p;
-    const Py_UCS2 *q, *e;
-    int bo = 0;
+    const unsigned char *q, *e;
+    int bo = 0;       /* assume native ordering by default */
      const char *errmsg = "";
+    /* Offsets from q for retrieving byte pairs in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int ihi = 1, ilo = 0;
+#else
+    int ihi = 0, ilo = 1;
+#endif
  
      /* size should be an even number */
-    if (size % sizeof(Py_UCS2) != 0) {
-       if (utf16_decoding_error(NULL, NULL, errors, "truncated data"))
-           return NULL;
-       /* The remaining input chars are ignored if we fall through
-           here... */
+    if (size & 1) {
+        if (utf16_decoding_error(NULL, errors, "truncated data"))
+            return NULL;
+        --size;  /* else ignore the oddball byte */
      }
  
      /* Note: size will always be longer than the resulting Unicode
@@ -1004,48 +1009,54 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
  
      /* Unpack UTF-16 encoded data */
      p = unicode->str;
-    q = (Py_UCS2 *)s;
-    e = q + (size / sizeof(Py_UCS2));
+    q = (unsigned char *)s;
+    e = q + size;
  
      if (byteorder)
-       bo = *byteorder;
+        bo = *byteorder;
  
      /* Check for BOM marks (U+FEFF) in the input and adjust current
         byte order setting accordingly. In native mode, the leading BOM
         mark is skipped, in all other modes, it is copied to the output
         stream as-is (giving a ZWNBSP character). */
      if (bo == 0) {
+        const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
  #ifdef BYTEORDER_IS_LITTLE_ENDIAN
-       if (*q == 0xFEFF) {
-           q++;
+       if (bom == 0xFEFF) {
+           q += 2;
             bo = -1;
-       } else if (*q == 0xFFFE) {
-           q++;
+       }
+        else if (bom == 0xFFFE) {
+           q += 2;
             bo = 1;
         }
  #else    
-       if (*q == 0xFEFF) {
-           q++;
+       if (bom == 0xFEFF) {
+           q += 2;
             bo = 1;
-       } else if (*q == 0xFFFE) {
-           q++;
+       }
+        else if (bom == 0xFFFE) {
+           q += 2;
             bo = -1;
         }
  #endif
      }
-    
+
+    if (bo == -1) {
+        /* force LE */
+        ihi = 1;
+        ilo = 0;
+    }
+    else if (bo == 1) {
+        /* force BE */
+        ihi = 0;
+        ilo = 1;
+    }
+
      while (q < e) {
-       register Py_UCS2 ch = *q++;
+       Py_UNICODE ch = (q[ihi] << 8) | q[ilo];
+       q += 2;
  
-       /* Swap input bytes if needed. (This assumes
-          sizeof(Py_UNICODE) == 2 !) */
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
-       if (bo == 1)
-           ch = (ch >> 8) | (ch << 8);
-#else    
-       if (bo == -1)
-           ch = (ch >> 8) | (ch << 8);
-#endif
         if (ch < 0xD800 || ch > 0xDFFF) {
             *p++ = ch;
             continue;
@@ -1057,14 +1068,8 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
             goto utf16Error;
         }
         if (0xD800 <= ch && ch <= 0xDBFF) {
-           Py_UCS2 ch2 = *q++;
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN
-           if (bo == 1)
-                   ch2 = (ch2 >> 8) | (ch2 << 8);
-#else    
-           if (bo == -1)
-                   ch2 = (ch2 >> 8) | (ch2 << 8);
-#endif
+           Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
+           q += 2;
             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
  #ifndef Py_UNICODE_WIDE
                 *p++ = ch;
@@ -1084,7 +1089,7 @@ PyObject *PyUnicode_DecodeUTF16(const char *s,
         /* Fall through to report the error */
  
      utf16Error:
-       if (utf16_decoding_error(&q, &p, errors, errmsg))
+       if (utf16_decoding_error(&p, errors, errmsg))
             goto onError;
      }
  
@@ -1102,58 +1107,67 @@ onError:
      return NULL;
  }
  
-#undef UTF16_ERROR
-
-PyObject *PyUnicode_EncodeUTF16(const Py_UNICODE *s,
-                               int size,
-                               const char *errors,
-                               int byteorder)
+PyObject *
+PyUnicode_EncodeUTF16(const Py_UNICODE *s,
+                     int size,
+                     const char *errors,
+                     int byteorder)
  {
      PyObject *v;
-    Py_UCS2 *p;
-    char *q;
-    int i, pairs, doswap = 1;
+    unsigned char *p;
+    int i, pairs;
+    /* Offsets from p for storing byte pairs in the right order. */
+#ifdef BYTEORDER_IS_LITTLE_ENDIAN
+    int ihi = 1, ilo = 0;
+#else
+    int ihi = 0, ilo = 1;
+#endif
+
+#define STORECHAR(CH)                   \
+    do {                                \
+        p[ihi] = ((CH) >> 8) & 0xff;    \
+        p[ilo] = (CH) & 0xff;           \
+        p += 2;                         \
+    } while(0)
  
      for (i = pairs = 0; i < size; i++)
         if (s[i] >= 0x10000)
             pairs++;
      v = PyString_FromStringAndSize(NULL, 
-                 sizeof(Py_UCS2) * (size + pairs + (byteorder == 0)));
+                 2 * (size + pairs + (byteorder == 0)));
      if (v == NULL)
          return NULL;
  
-    q = PyString_AS_STRING(v);
-    p = (Py_UCS2 *)q;
+    p = (unsigned char *)PyString_AS_STRING(v);
      if (byteorder == 0)
-       *p++ = 0xFEFF;
+       STORECHAR(0xFEFF);
      if (size == 0)
          return v;
-    if (byteorder == 0 ||
-#ifdef BYTEORDER_IS_LITTLE_ENDIAN      
-       byteorder == -1
-#else
-       byteorder == 1
-#endif
-       )
-       doswap = 0;
+
+    if (byteorder == -1) {
+        /* force LE */
+        ihi = 1;
+        ilo = 0;
+    }
+    else if (byteorder == 1) {
+        /* force BE */
+        ihi = 0;
+        ilo = 1;
+    }
+
      while (size-- > 0) {
         Py_UNICODE ch = *s++;
         Py_UNICODE ch2 = 0;
         if (ch >= 0x10000) {
-           ch2 = 0xDC00|((ch-0x10000) & 0x3FF);
-           ch  = 0xD800|((ch-0x10000)>>10);
-       }
-       if (doswap){
-           *p++ = (ch >> 8) | (ch << 8);
-           if (ch2)
-               *p++ = (ch2 >> 8) | (ch2 << 8);
-       }else{
-           *p++ = ch;
-           if(ch2)
-               *p++ = ch2;
+           ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
+           ch  = 0xD800 | ((ch-0x10000) >> 10);
         }
+        STORECHAR(ch);
+        if (ch2)
+            STORECHAR(ch2);
      }
      return v;
+#undef STORECHAR
  }
  
  PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
author	Tim Peters <tim.peters@gmail.com>
	Thu, 9 Aug 2001 22:21:55 +0000 (22:21 +0000)
committer	Tim Peters <tim.peters@gmail.com>
	Thu, 9 Aug 2001 22:21:55 +0000 (22:21 +0000)
Include/unicodeobject.h		patch | blob | history
Objects/unicodeobject.c		patch | blob | history