From 5f430e94ae31c4192be9c28d8f01d01591998537 Mon Sep 17 00:00:00 2001 From: Reuben Thomas Date: Wed, 17 Jan 2018 22:43:10 +0000 Subject: [PATCH] Try to diagnose untranslatable input when using iconv MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit See Debian bug #348909. The problem starts with the fact that iconv returns EILSEQ (invalid input) when in fact the input is merely untranslatable. It is possible to diagnose this situation by running another conversion with the output encoding the same as the input (so that it will always succeed on valid input) at the same point. This is what we now do. Unfortunately, there’s no way I can see to work out how much input to skip (i.e. the length of the untranslatable character in the source encoding). Hence, we still just skip one byte. The typical result is that invalid input is diagnosed on the next step, resulting in the same problem as at present. Two possible workarounds are to not use iconv, or to set abort_level to RECODE_UNTRANSLATABLE (this is what test_2 in t80_error.py does). --- doc/recode.texi | 7 +++++++ src/iconv.c | 36 ++++++++++++++++++++++++++++++++++-- tests/t80_error.py | 16 +++++++++++++--- 3 files changed, 54 insertions(+), 5 deletions(-) diff --git a/doc/recode.texi b/doc/recode.texi index 1f0367f..69038f0 100644 --- a/doc/recode.texi +++ b/doc/recode.texi @@ -2568,6 +2568,13 @@ mode prohibits such attribution of reversible translations: so strict mode might often trigger such an error. Most @code{UCS-2} codes used to represent Asian characters cannot be expressed in various Latin charsets. +Since iconv does not distinguish untranslatable from invalid input, +Recode has to use a workaround to detect when input is untranslatable. +Unfortunately, it cannot currently tell how much input is untranslatable, +so it cannot reliably skip such input: typically the input is then diagnosed +as invalid. Two possible workarounds are to set the @code{abort_level} +to @code{RECODE_UNTRANSLATABLE}, or not to use iconv. + @item RECODE_INVALID_INPUT @vindex RECODE_INVALID_INPUT @cindex invalid input, error message diff --git a/src/iconv.c b/src/iconv.c index d7fb979..c1b7a06 100644 --- a/src/iconv.c +++ b/src/iconv.c @@ -102,8 +102,40 @@ wrapped_transform (iconv_t conversion, RECODE_SUBTASK subtask) { if (saved_errno == EILSEQ) { - /* Invalid input. Skip one byte. */ - RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask); + /* Check whether the input was really just untranslatable. */ + enum recode_error recode_error = RECODE_INVALID_INPUT; + RECODE_CONST_STEP step = subtask->step; + iconv_t check_conversion = iconv_open (step->before->iconv_name, + step->before->iconv_name); + + /* On error, give up and assume input is invalid. */ + if (input_left > 0 && check_conversion != (iconv_t) -1) + { + /* Assume iconv does not modify its input. */ + char *check_input = input; + size_t check_input_left = input_left; + size_t check_output_left = input_left; + char *check_output_buffer, *check_output; + RECODE_OUTER outer = subtask->task->request->outer; + + if ((check_output = ALLOC (check_output_buffer, input_left, char)) != NULL) + { + size_t check_converted = iconv (check_conversion, + &check_input, &check_input_left, + &check_output, &check_output_left); + + if (check_converted != (size_t) -1) + recode_error = RECODE_UNTRANSLATABLE; + + free (check_output_buffer); + } + } + + /* Invalid or untranslatable input. Skip one byte. */ + /* FIXME: We cannot tell how many bytes to skip for + untranslatable input. The likely result is that we'll + get an "invalid input" error on the next step. */ + RETURN_IF_NOGO (recode_error, subtask); assert (input_left > 0); input++; input_left--; diff --git a/tests/t80_error.py b/tests/t80_error.py index 44c53e2..aa0b284 100644 --- a/tests/t80_error.py +++ b/tests/t80_error.py @@ -1,13 +1,23 @@ # -*- coding: utf-8 -*- import common -from common import setup_module, teardown_module, Recode, outer +from common import setup_module, teardown_module, Recode, outer, outer_iconv class Test: - def test_1(self): # Ensure correct error code returned for invalid input - request = Recode.Request(outer) # FIXME: Does not work with iconv (outer_iconv): Debian bug #348909 + def test_1(self): # Ensure correct error code returned for untranslatable input + request = Recode.Request(outer) request.scan('utf-8..latin1') task = Recode.Task(request) task.set_input("\303\241 \303\247 \316\261 \316\266") task.perform() assert(task.get_error() == Recode.UNTRANSLATABLE) + + # FIXME: Does not work with iconv for abort_level > UNTRANSLATABLE: Debian bug #348909 + def test_2(self): # Ensure correct error code returned for untranslatable input (with iconv) + request = Recode.Request(outer_iconv) + request.scan('utf-8..latin1') + task = Recode.Task(request) + task.set_input("\303\241 \303\247 \316\261 \316\266") + task.set_abort_level(Recode.UNTRANSLATABLE) + task.perform() + assert(task.get_error() == Recode.UNTRANSLATABLE) -- 2.40.0