static PyUnicodeObject *unicode_freelist = NULL;
static int unicode_freelist_size = 0;
+/* Default encoding to use and assume when NULL is passed as encoding
+ parameter; it is initialized by _PyUnicode_Init().
+
+ Always use the PyUnicode_SetDefaultEncoding() and
+ PyUnicode_GetDefaultEncoding() APIs to access this global.
+
+*/
+
+static char unicode_default_encoding[100];
+
/* --- Unicode Object ----------------------------------------------------- */
static
Py_INCREF(unicode_empty);
return (PyObject *)unicode_empty;
}
- return PyUnicode_DecodeUTF8(s, len, "strict");
+ return PyUnicode_Decode(s, len, NULL, "strict");
}
PyObject *PyUnicode_Decode(const char *s,
{
PyObject *buffer = NULL, *unicode;
- /* Shortcut for the default encoding UTF-8 */
- if (encoding == NULL ||
- (strcmp(encoding, "utf-8") == 0))
+ if (encoding == NULL)
+ encoding = PyUnicode_GetDefaultEncoding();
+
+ /* Shortcuts for common default encodings */
+ if (strcmp(encoding, "utf-8") == 0)
return PyUnicode_DecodeUTF8(s, size, errors);
+ else if (strcmp(encoding, "latin-1") == 0)
+ return PyUnicode_DecodeLatin1(s, size, errors);
+ else if (strcmp(encoding, "ascii") == 0)
+ return PyUnicode_DecodeASCII(s, size, errors);
/* Decode via the codec registry */
buffer = PyBuffer_FromMemory((void *)s, size);
PyErr_BadArgument();
goto onError;
}
- /* Shortcut for the default encoding UTF-8 */
- if ((encoding == NULL ||
- (strcmp(encoding, "utf-8") == 0)) &&
- errors == NULL)
+
+ if (encoding == NULL)
+ encoding = PyUnicode_GetDefaultEncoding();
+
+ /* Shortcuts for common default encodings */
+ if (errors == NULL) {
+ if (strcmp(encoding, "utf-8") == 0)
return PyUnicode_AsUTF8String(unicode);
+ else if (strcmp(encoding, "latin-1") == 0)
+ return PyUnicode_AsLatin1String(unicode);
+ else if (strcmp(encoding, "ascii") == 0)
+ return PyUnicode_AsASCIIString(unicode);
+ }
/* Encode via the codec registry */
v = PyCodec_Encode(unicode, encoding, errors);
return -1;
}
+const char *PyUnicode_GetDefaultEncoding()
+{
+ return unicode_default_encoding;
+}
+
+int PyUnicode_SetDefaultEncoding(const char *encoding)
+{
+ PyObject *v;
+
+ /* Make sure the encoding is valid. As side effect, this also
+ loads the encoding into the codec registry cache. */
+ v = _PyCodec_Lookup(encoding);
+ if (v == NULL)
+ goto onError;
+ Py_DECREF(v);
+ strncpy(unicode_default_encoding,
+ encoding,
+ sizeof(unicode_default_encoding));
+ return 0;
+
+ onError:
+ return -1;
+}
+
/* --- UTF-8 Codec -------------------------------------------------------- */
static
}
else {
PyErr_Format(PyExc_ValueError,
- "UTF-16 decoding error; unknown error handling code: %.400s",
+ "UTF-16 decoding error; "
+ "unknown error handling code: %.400s",
errors);
return -1;
}
static char encode__doc__[] =
"S.encode([encoding[,errors]]) -> string\n\
\n\
-Return an encoded string version of S. Default encoding is 'UTF-8'.\n\
-errors may be given to set a different error handling scheme. Default\n\
-is 'strict' meaning that encoding errors raise a ValueError. Other\n\
-possible values are 'ignore' and 'replace'.";
+Return an encoded string version of S. Default encoding is the current\n\
+default string encoding. errors may be given to set a different error\n\
+handling scheme. Default is 'strict' meaning that encoding errors raise\n\
+a ValueError. Other possible values are 'ignore' and 'replace'.";
static PyObject *
unicode_encode(PyUnicodeObject *self, PyObject *args)
static
PyObject *unicode_str(PyUnicodeObject *self)
{
- return PyUnicode_AsUTF8String((PyObject *)self);
+ return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
}
static char strip__doc__[] =
return NULL;
}
uformat = PyUnicode_FromObject(format);
+ if (uformat == NULL)
+ return NULL;
fmt = PyUnicode_AS_UNICODE(uformat);
fmtcnt = PyUnicode_GET_SIZE(uformat);
"incomplete format key");
goto onError;
}
- /* keys are converted to strings (using UTF-8) and
+ /* keys are converted to strings using UTF-8 and
then looked up since Python uses strings to hold
variables names etc. in its namespaces and we
- wouldn't want to break common idioms. The
- alternative would be using Unicode objects for the
- lookup but u"abc" and "abc" have different hash
- values (on purpose). */
+ wouldn't want to break common idioms. */
key = PyUnicode_EncodeUTF8(keystart,
keylen,
NULL);
"%s argument has non-string str()");
goto onError;
}
- unicode = PyUnicode_DecodeUTF8(PyString_AS_STRING(temp),
+ unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
PyString_GET_SIZE(temp),
+ NULL,
"strict");
Py_DECREF(temp);
temp = unicode;
Py_FatalError("Unicode configuration error: "
"sizeof(Py_UNICODE) != 2 bytes");
+ /* Init the implementation */
unicode_empty = _PyUnicode_New(0);
+ strcpy(unicode_default_encoding, "utf-8");
}
/* Finalize the Unicode implementation */