From c39211f51e377919952b139c46e295800cbc2a8d Mon Sep 17 00:00:00 2001 From: Victor Stinner Date: Wed, 29 Sep 2010 16:35:47 +0000 Subject: [PATCH] Issue #9630: Redecode filenames when setting the filesystem encoding Redecode the filenames of: - all modules: __file__ and __path__ attributes - all code objects: co_filename attribute - sys.path - sys.meta_path - sys.executable - sys.path_importer_cache (keys) Keep weak references to all code objects until initfsencoding() is called, to be able to redecode co_filename attribute of all code objects. --- Include/code.h | 7 ++ Objects/codeobject.c | 17 +++ Objects/object.c | 4 + Objects/unicodeobject.c | 8 +- Python/pythonrun.c | 258 ++++++++++++++++++++++++++++++++++++++++ 5 files changed, 293 insertions(+), 1 deletion(-) diff --git a/Include/code.h b/Include/code.h index f0f88cd8f6..bdbfaba28c 100644 --- a/Include/code.h +++ b/Include/code.h @@ -99,6 +99,13 @@ PyAPI_FUNC(int) _PyCode_CheckLineNumber(PyCodeObject* co, PyAPI_FUNC(PyObject*) PyCode_Optimize(PyObject *code, PyObject* consts, PyObject *names, PyObject *lineno_obj); +/* List of weak references to all code objects. The list is used by + initfsencoding() to redecode code filenames at startup if the filesystem + encoding changes. At initfsencoding() exit, the list is set to NULL and it + is no more used. */ + +extern PyObject *_Py_code_object_list; + #ifdef __cplusplus } #endif diff --git a/Objects/codeobject.c b/Objects/codeobject.c index da5c09ac8d..470bf56150 100644 --- a/Objects/codeobject.c +++ b/Objects/codeobject.c @@ -5,6 +5,8 @@ #define NAME_CHARS \ "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz" +PyObject *_Py_code_object_list = NULL; + /* all_name_chars(s): true iff all chars in s are valid NAME_CHARS */ static int @@ -109,8 +111,23 @@ PyCode_New(int argcount, int kwonlyargcount, co->co_lnotab = lnotab; co->co_zombieframe = NULL; co->co_weakreflist = NULL; + + if (_Py_code_object_list != NULL) { + int err; + PyObject *ref = PyWeakref_NewRef((PyObject*)co, NULL); + if (ref == NULL) + goto error; + err = PyList_Append(_Py_code_object_list, ref); + Py_DECREF(ref); + if (err) + goto error; + } } return co; + +error: + Py_DECREF(co); + return NULL; } PyCodeObject * diff --git a/Objects/object.c b/Objects/object.c index ff3363f432..e322e53103 100644 --- a/Objects/object.c +++ b/Objects/object.c @@ -1604,6 +1604,10 @@ _Py_ReadyTypes(void) if (PyType_Ready(&PyCode_Type) < 0) Py_FatalError("Can't initialize code type"); + _Py_code_object_list = PyList_New(0); + if (_Py_code_object_list == NULL) + Py_FatalError("Can't initialize code type"); + if (PyType_Ready(&PyFrame_Type) < 0) Py_FatalError("Can't initialize frame type"); diff --git a/Objects/unicodeobject.c b/Objects/unicodeobject.c index 527e219896..8ceb3f32b8 100644 --- a/Objects/unicodeobject.c +++ b/Objects/unicodeobject.c @@ -1510,10 +1510,14 @@ PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode) return PyUnicode_AsEncodedString(unicode, Py_FileSystemDefaultEncoding, "surrogateescape"); - } else + } + else { + /* if you change the default encoding, update also + PyUnicode_DecodeFSDefaultAndSize() and redecode_filenames() */ return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode), "surrogateescape"); + } } PyObject *PyUnicode_AsEncodedString(PyObject *unicode, @@ -1680,6 +1684,8 @@ PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size) "surrogateescape"); } else { + /* if you change the default encoding, update also + PyUnicode_EncodeFSDefault() and redecode_filenames() */ return PyUnicode_DecodeUTF8(s, size, "surrogateescape"); } } diff --git a/Python/pythonrun.c b/Python/pythonrun.c index a3f5d2b667..a888a19f20 100644 --- a/Python/pythonrun.c +++ b/Python/pythonrun.c @@ -719,6 +719,259 @@ initmain(void) } } +/* Redecode a filename from the default filesystem encoding (utf-8) to + 'new_encoding' encoding with 'errors' error handler */ +static PyObject* +redecode_filename(PyObject *file, const char *new_encoding, + const char *errors) +{ + PyObject *file_bytes = NULL, *new_file = NULL; + + file_bytes = PyUnicode_EncodeFSDefault(file); + if (file_bytes == NULL) + return NULL; + new_file = PyUnicode_Decode( + PyBytes_AsString(file_bytes), + PyBytes_GET_SIZE(file_bytes), + new_encoding, + errors); + Py_DECREF(file_bytes); + return new_file; +} + +/* Redecode a path list */ +static int +redecode_path_list(PyObject *paths, + const char *new_encoding, const char *errors) +{ + PyObject *filename, *new_filename; + Py_ssize_t i, size; + + size = PyList_Size(paths); + for (i=0; i < size; i++) { + filename = PyList_GetItem(paths, i); + if (filename == NULL) + return -1; + + new_filename = redecode_filename(filename, new_encoding, errors); + if (new_filename == NULL) + return -1; + if (PyList_SetItem(paths, i, new_filename)) { + Py_DECREF(new_filename); + return -1; + } + } + return 0; +} + +/* Redecode __file__ and __path__ attributes of sys.modules */ +static int +redecode_sys_modules(const char *new_encoding, const char *errors) +{ + PyInterpreterState *interp; + PyObject *modules, *values, *file, *new_file, *paths; + PyObject *iter = NULL, *module = NULL; + + interp = PyThreadState_GET()->interp; + modules = interp->modules; + + values = PyObject_CallMethod(modules, "values", ""); + if (values == NULL) + goto error; + + iter = PyObject_GetIter(values); + Py_DECREF(values); + if (iter == NULL) + goto error; + + while (1) + { + module = PyIter_Next(iter); + if (module == NULL) { + if (PyErr_Occurred()) + goto error; + else + break; + } + + file = PyModule_GetFilenameObject(module); + if (file != NULL) { + new_file = redecode_filename(file, new_encoding, errors); + Py_DECREF(file); + if (new_file == NULL) + goto error; + if (PyObject_SetAttrString(module, "__file__", new_file)) { + Py_DECREF(new_file); + goto error; + } + Py_DECREF(new_file); + } + else + PyErr_Clear(); + + paths = PyObject_GetAttrString(module, "__path__"); + if (paths != NULL) { + if (redecode_path_list(paths, new_encoding, errors)) + goto error; + } + else + PyErr_Clear(); + + Py_CLEAR(module); + } + Py_CLEAR(iter); + return 0; + +error: + Py_XDECREF(iter); + Py_XDECREF(module); + return -1; +} + +/* Redecode sys.path_importer_cache keys */ +static int +redecode_sys_path_importer_cache(const char *new_encoding, const char *errors) +{ + PyObject *path_importer_cache, *items, *item, *path, *importer, *new_path; + PyObject *new_cache = NULL, *iter = NULL; + + path_importer_cache = PySys_GetObject("path_importer_cache"); + if (path_importer_cache == NULL) + goto error; + + items = PyObject_CallMethod(path_importer_cache, "items", ""); + if (items == NULL) + goto error; + + iter = PyObject_GetIter(items); + Py_DECREF(items); + if (iter == NULL) + goto error; + + new_cache = PyDict_New(); + if (new_cache == NULL) + goto error; + + while (1) + { + item = PyIter_Next(iter); + if (item == NULL) { + if (PyErr_Occurred()) + goto error; + else + break; + } + path = PyTuple_GET_ITEM(item, 0); + importer = PyTuple_GET_ITEM(item, 1); + + new_path = redecode_filename(path, new_encoding, errors); + if (new_path == NULL) + goto error; + if (PyDict_SetItem(new_cache, new_path, importer)) { + Py_DECREF(new_path); + goto error; + } + Py_DECREF(new_path); + } + Py_CLEAR(iter); + if (PySys_SetObject("path_importer_cache", new_cache)) + goto error; + Py_CLEAR(new_cache); + return 0; + +error: + Py_XDECREF(iter); + Py_XDECREF(new_cache); + return -1; +} + +/* Redecode co_filename attribute of all code objects */ +static int +redecode_code_objects(const char *new_encoding, const char *errors) +{ + Py_ssize_t i, len; + PyCodeObject *co; + PyObject *ref, *new_file; + + len = Py_SIZE(_Py_code_object_list); + for (i=0; i < len; i++) { + ref = PyList_GET_ITEM(_Py_code_object_list, i); + co = (PyCodeObject *)PyWeakref_GetObject(ref); + if ((PyObject*)co == Py_None) + continue; + if (co == NULL) + return -1; + + new_file = redecode_filename(co->co_filename, new_encoding, errors); + if (new_file == NULL) + return -1; + Py_DECREF(co->co_filename); + co->co_filename = new_file; + } + Py_CLEAR(_Py_code_object_list); + return 0; +} + +/* Redecode the filenames of all modules (__file__ and __path__ attributes), + all code objects (co_filename attribute), sys.path, sys.meta_path, + sys.executable and sys.path_importer_cache (keys) when the filesystem + encoding changes from the default encoding (utf-8) to new_encoding */ +static int +redecode_filenames(const char *new_encoding) +{ + char *errors; + PyObject *paths, *executable, *new_executable; + + /* PyUnicode_DecodeFSDefault() and PyUnicode_EncodeFSDefault() do already + use utf-8 if Py_FileSystemDefaultEncoding is NULL */ + if (strcmp(new_encoding, "utf-8") == 0) + return 0; + + if (strcmp(new_encoding, "mbcs") != 0) + errors = "surrogateescape"; + else + errors = NULL; + + /* sys.modules */ + if (redecode_sys_modules(new_encoding, errors)) + return -1; + + /* sys.path and sys.meta_path */ + paths = PySys_GetObject("path"); + if (paths != NULL) { + if (redecode_path_list(paths, new_encoding, errors)) + return -1; + } + paths = PySys_GetObject("meta_path"); + if (paths != NULL) { + if (redecode_path_list(paths, new_encoding, errors)) + return -1; + } + + /* sys.executable */ + executable = PySys_GetObject("executable"); + if (executable == NULL) + return -1; + new_executable = redecode_filename(executable, new_encoding, errors); + if (new_executable == NULL) + return -1; + if (PySys_SetObject("executable", new_executable)) { + Py_DECREF(new_executable); + return -1; + } + Py_DECREF(new_executable); + + /* sys.path_importer_cache */ + if (redecode_sys_path_importer_cache(new_encoding, errors)) + return -1; + + /* code objects */ + if (redecode_code_objects(new_encoding, errors)) + return -1; + + return 0; +} + static void initfsencoding(void) { @@ -744,8 +997,11 @@ initfsencoding(void) codeset = get_codeset(); } if (codeset != NULL) { + if (redecode_filenames(codeset)) + Py_FatalError("Py_Initialize: can't redecode filenames"); Py_FileSystemDefaultEncoding = codeset; Py_HasFileSystemDefaultEncoding = 0; + Py_CLEAR(_Py_code_object_list); return; } else { fprintf(stderr, "Unable to get the locale encoding:\n"); @@ -758,6 +1014,8 @@ initfsencoding(void) } #endif + Py_CLEAR(_Py_code_object_list); + /* the encoding is mbcs, utf-8 or ascii */ codec = _PyCodec_Lookup(Py_FileSystemDefaultEncoding); if (!codec) { -- 2.40.0