]> granicus.if.org Git - python/commitdiff
Marc-Andre Lemburg:
authorGuido van Rossum <guido@python.org>
Fri, 24 Mar 2000 22:14:19 +0000 (22:14 +0000)
committerGuido van Rossum <guido@python.org>
Fri, 24 Mar 2000 22:14:19 +0000 (22:14 +0000)
Attached you find the latest update of the Unicode implementation.
The patch is against the current CVS version.

It includes the fix I posted yesterday for the core dump problem
in codecs.c (was introduced by my previous patch set -- sorry),
adds more tests for the codecs and two new parser markers
"es" and "es#".

Lib/codecs.py
Lib/test/output/test_unicode
Lib/test/test_unicode.py
Misc/unicode.txt
Python/getargs.c

index 7f478d7191d2c113c653ece304b57821ec6761d1..c09f804f1b6dfc416d8ac68ea8b8c72b5de8264c 100644 (file)
@@ -46,7 +46,7 @@ class Codec:
         handling schemes by providing the errors argument. These
         string values are defined:
 
-         'strict' - raise an error (or a subclass)
+         'strict' - raise a ValueError (or a subclass)
          'ignore' - ignore the character and continue with the next
          'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
index 382a631fd3d2fbec796ce2ac4e8d3ebffc6e2267..1ec9031045efed28dc1bb8008c5ba6abe3b0ecf4 100644 (file)
@@ -1,5 +1,4 @@
 test_unicode
 Testing Unicode comparisons... done.
-Testing Unicode contains method... done.
 Testing Unicode formatting strings... done.
 Testing unicodedata module... done.
index 69d4273ace8fa2fa60cdb64a14cebdf507442d2a..3d15f22a4efefd936be6c67504fccf01d5fb22a1 100644 (file)
@@ -293,3 +293,33 @@ else:
     assert unicodedata.combining(u'\u20e1') == 230
     
     print 'done.'
+
+# Test builtin codecs
+print 'Testing builtin codecs...',
+
+assert unicode('hello','ascii') == u'hello'
+assert unicode('hello','utf-8') == u'hello'
+assert unicode('hello','utf8') == u'hello'
+assert unicode('hello','latin-1') == u'hello'
+
+assert u'hello'.encode('ascii') == 'hello'
+assert u'hello'.encode('utf-8') == 'hello'
+assert u'hello'.encode('utf8') == 'hello'
+assert u'hello'.encode('utf-16-le') == 'h\000e\000l\000l\000o\000'
+assert u'hello'.encode('utf-16-be') == '\000h\000e\000l\000l\000o'
+assert u'hello'.encode('latin-1') == 'hello'
+
+u = u''.join(map(unichr, range(1024)))
+for encoding in ('utf-8', 'utf-16', 'utf-16-le', 'utf-16-be',
+                 'raw_unicode_escape', 'unicode_escape', 'unicode_internal'):
+    assert unicode(u.encode(encoding),encoding) == u
+
+u = u''.join(map(unichr, range(256)))
+for encoding in ('latin-1',):
+    assert unicode(u.encode(encoding),encoding) == u
+
+u = u''.join(map(unichr, range(128)))
+for encoding in ('ascii',):
+    assert unicode(u.encode(encoding),encoding) == u
+
+print 'done.'
index 9a4832afce8ddba28421bc9686f700d0fff702e7..fc1f2c5a24939e5d08a44164bbaabc2e17a4a587 100644 (file)
@@ -715,21 +715,126 @@ Internal Argument Parsing:
 
 These markers are used by the PyArg_ParseTuple() APIs:
 
-  'U':  Check for Unicode object and return a pointer to it
+  "U":  Check for Unicode object and return a pointer to it
 
-  's':  For Unicode objects: auto convert them to the <default encoding>
+  "s":  For Unicode objects: auto convert them to the <default encoding>
         and return a pointer to the object's <defencstr> buffer.
 
-  's#': Access to the Unicode object via the bf_getreadbuf buffer interface 
+  "s#": Access to the Unicode object via the bf_getreadbuf buffer interface 
         (see Buffer Interface); note that the length relates to the buffer
         length, not the Unicode string length (this may be different
         depending on the Internal Format).
 
-  't#': Access to the Unicode object via the bf_getcharbuf buffer interface
+  "t#": Access to the Unicode object via the bf_getcharbuf buffer interface
         (see Buffer Interface); note that the length relates to the buffer
         length, not necessarily to the Unicode string length (this may
         be different depending on the <default encoding>).
 
+  "es": 
+       Takes two parameters: encoding (const char *) and
+       buffer (char **). 
+
+       The input object is first coerced to Unicode in the usual way
+       and then encoded into a string using the given encoding.
+
+       On output, a buffer of the needed size is allocated and
+       returned through *buffer as a NULL-terminated string.
+       The encoded string may not contain embedded NULL characters.
+       The caller is responsible for free()ing the allocated *buffer
+       after usage.
+
+  "es#":
+       Takes three parameters: encoding (const char *),
+       buffer (char **) and buffer_len (int *).
+       
+       The input object is first coerced to Unicode in the usual way
+       and then encoded into a string using the given encoding.
+
+       If *buffer is non-NULL, *buffer_len must be set to the buffer's size
+       (incl. room for the trailing NULL-byte). Output is copied to *buffer.
+
+       If *buffer is NULL, a buffer of the needed size is
+       allocated and output copied into it. *buffer is then
+       updated to point to the allocated memory area. The caller
+       is responsible for free()ing *buffer after usage.
+
+       In both cases *buffer_len is updated to the number of
+       characters written (excluding the trailing NULL-byte).
+       The output buffer is assured to be NULL-terminated.
+
+Examples:
+
+Using "es#" with auto-allocation:
+
+    static PyObject *
+    test_parser(PyObject *self,
+               PyObject *args)
+    {
+       PyObject *str;
+       const char *encoding = "latin-1";
+       char *buffer = NULL;
+       int buffer_len = 0;
+
+       if (!PyArg_ParseTuple(args, "es#:test_parser",
+                             encoding, &buffer, &buffer_len))
+           return NULL;
+       if (!buffer) {
+           PyErr_SetString(PyExc_SystemError,
+                           "buffer is NULL");
+           return NULL;
+       }
+       str = PyString_FromStringAndSize(buffer, buffer_len);
+       free(buffer);
+       return str;
+    }
+
+Using "es" with auto-allocation returning a NULL-terminated string:    
+    
+    static PyObject *
+    test_parser(PyObject *self,
+               PyObject *args)
+    {
+       PyObject *str;
+       const char *encoding = "latin-1";
+       char *buffer = NULL;
+
+       if (!PyArg_ParseTuple(args, "es:test_parser",
+                             encoding, &buffer))
+           return NULL;
+       if (!buffer) {
+           PyErr_SetString(PyExc_SystemError,
+                           "buffer is NULL");
+           return NULL;
+       }
+       str = PyString_FromString(buffer);
+       free(buffer);
+       return str;
+    }
+
+Using "es#" with a pre-allocated buffer:
+    
+    static PyObject *
+    test_parser(PyObject *self,
+               PyObject *args)
+    {
+       PyObject *str;
+       const char *encoding = "latin-1";
+       char _buffer[10];
+       char *buffer = _buffer;
+       int buffer_len = sizeof(_buffer);
+
+       if (!PyArg_ParseTuple(args, "es#:test_parser",
+                             encoding, &buffer, &buffer_len))
+           return NULL;
+       if (!buffer) {
+           PyErr_SetString(PyExc_SystemError,
+                           "buffer is NULL");
+           return NULL;
+       }
+       str = PyString_FromStringAndSize(buffer, buffer_len);
+       return str;
+    }
+
 
 File/Stream Output:
 -------------------
@@ -837,6 +942,7 @@ Encodings:
 
 History of this Proposal:
 -------------------------
+1.3: Added new "es" and "es#" parser markers
 1.2: Removed POD about codecs.open()
 1.1: Added note about comparisons and hash values. Added note about
      case mapping algorithms. Changed stream codecs .read() and
index 4617d0515e4fcae5381989e0ce721357b3933445..a4b0fe4c8d72758f05f486f43dd0626a14291a5d 100644 (file)
@@ -178,6 +178,8 @@ vgetargs1(args, format, p_va, compat)
                }
                else if (level != 0)
                        ; /* Pass */
+               else if (c == 'e')
+                       ; /* Pass */
                else if (isalpha(c))
                        max++;
                else if (c == '|')
@@ -654,6 +656,122 @@ convertsimple1(arg, p_format, p_va)
                        break;
                }
        
+       case 'e': /* encoded string */
+               {
+                       char **buffer;
+                       const char *encoding;
+                       PyObject *u, *s;
+                       int size;
+
+                       /* Get 'e' parameter: the encoding name */
+                       encoding = (const char *)va_arg(*p_va, const char *);
+                       if (encoding == NULL)
+                               return "(encoding is NULL)";
+                       
+                       /* Get 's' parameter: the output buffer to use */
+                       if (*format != 's')
+                               return "(unkown parser marker combination)";
+                       buffer = (char **)va_arg(*p_va, char **);
+                       format++;
+                       if (buffer == NULL)
+                               return "(buffer is NULL)";
+                       
+                       /* Convert object to Unicode */
+                       u = PyUnicode_FromObject(arg);
+                       if (u == NULL)
+                               return "string, unicode or text buffer";
+                       
+                       /* Encode object; use default error handling */
+                       s = PyUnicode_AsEncodedString(u,
+                                                     encoding,
+                                                     NULL);
+                       Py_DECREF(u);
+                       if (s == NULL)
+                               return "(encoding failed)";
+                       if (!PyString_Check(s)) {
+                               Py_DECREF(s);
+                               return "(encoder failed to return a string)";
+                       }
+                       size = PyString_GET_SIZE(s);
+
+                       /* Write output; output is guaranteed to be
+                          0-terminated */
+                       if (*format == '#') { 
+                               /* Using buffer length parameter '#':
+
+                                  - if *buffer is NULL, a new buffer
+                                  of the needed size is allocated and
+                                  the data copied into it; *buffer is
+                                  updated to point to the new buffer;
+                                  the caller is responsible for
+                                  free()ing it after usage
+
+                                  - if *buffer is not NULL, the data
+                                  is copied to *buffer; *buffer_len
+                                  has to be set to the size of the
+                                  buffer on input; buffer overflow is
+                                  signalled with an error; buffer has
+                                  to provide enough room for the
+                                  encoded string plus the trailing
+                                  0-byte
+
+                                  - in both cases, *buffer_len is
+                                  updated to the size of the buffer
+                                  /excluding/ the trailing 0-byte
+
+                               */
+                               int *buffer_len = va_arg(*p_va, int *);
+
+                               format++;
+                               if (buffer_len == NULL)
+                                       return "(buffer_len is NULL)";
+                               if (*buffer == NULL) {
+                                       *buffer = PyMem_NEW(char, size + 1);
+                                       if (*buffer == NULL) {
+                                               Py_DECREF(s);
+                                               return "(memory error)";
+                                       }
+                               } else {
+                                       if (size + 1 > *buffer_len) {
+                                               Py_DECREF(s);
+                                               return "(buffer overflow)";
+                                       }
+                               }
+                               memcpy(*buffer,
+                                      PyString_AS_STRING(s),
+                                      size + 1);
+                               *buffer_len = size;
+                       } else {
+                               /* Using a 0-terminated buffer:
+
+                                  - the encoded string must not contain
+                                  embedded 0-bytes for this variant to
+                                  work; if it does, an error is raised
+
+                                  - a new buffer of the needed size
+                                  is allocated and the data copied
+                                  into it; *buffer is updated to
+                                  point to the new buffer; the caller
+                                  is responsible for free()ing it
+                                  after usage
+
+                                */
+                               if (strlen(PyString_AS_STRING(s)) != size)
+                                       return "(encoded string without "\
+                                              "NULL bytes)";
+                               *buffer = PyMem_NEW(char, size + 1);
+                               if (*buffer == NULL) {
+                                       Py_DECREF(s);
+                                       return "(memory error)";
+                               }
+                               memcpy(*buffer,
+                                      PyString_AS_STRING(s),
+                                      size + 1);
+                       }
+                       Py_DECREF(s);
+                       break;
+               }
+
        case 'S': /* string object */
                {
                        PyObject **p = va_arg(*p_va, PyObject **);