Try to diagnose untranslatable input when using iconv

author Reuben Thomas <rrt@sc3d.org>

Wed, 17 Jan 2018 22:43:10 +0000 (22:43 +0000)

committer Reuben Thomas <rrt@sc3d.org>

Tue, 23 Jan 2018 07:02:42 +0000 (07:02 +0000)
author Reuben Thomas <rrt@sc3d.org>
Wed, 17 Jan 2018 22:43:10 +0000 (22:43 +0000)
committer Reuben Thomas <rrt@sc3d.org>
Tue, 23 Jan 2018 07:02:42 +0000 (07:02 +0000)
diff --git a/doc/recode.texi b/doc/recode.texi

index 1f0367fc29fa84f2c366741dba27ee318a456167..69038f0544e6f59020a75846893a6e48239ee453 100644 (file)
--- a/doc/recode.texi
+++ b/doc/recode.texi
@@ -2568,6 +2568,13 @@ mode prohibits such attribution of reversible translations: so strict
  mode might often trigger such an error.  Most @code{UCS-2} codes used to
  represent Asian characters cannot be expressed in various Latin charsets.
  
+Since iconv does not distinguish untranslatable from invalid input,
+Recode has to use a workaround to detect when input is untranslatable.
+Unfortunately, it cannot currently tell how much input is untranslatable,
+so it cannot reliably skip such input: typically the input is then diagnosed
+as invalid.  Two possible workarounds are to set the @code{abort_level}
+to @code{RECODE_UNTRANSLATABLE}, or not to use iconv.
+
  @item RECODE_INVALID_INPUT
  @vindex RECODE_INVALID_INPUT
  @cindex invalid input, error message
diff --git a/src/iconv.c b/src/iconv.c

index d7fb9793be14417fd800a89ea6d09c96405ca601..c1b7a06c5036196c0da24d7d32dc14d98a528be3 100644 (file)
--- a/src/iconv.c
+++ b/src/iconv.c
@@ -102,8 +102,40 @@ wrapped_transform (iconv_t conversion, RECODE_SUBTASK subtask)
         {
           if (saved_errno == EILSEQ)
             {
-             /* Invalid input.  Skip one byte.  */
-             RETURN_IF_NOGO (RECODE_INVALID_INPUT, subtask);
+             /* Check whether the input was really just untranslatable.  */
+              enum recode_error recode_error = RECODE_INVALID_INPUT;
+             RECODE_CONST_STEP step = subtask->step;
+             iconv_t check_conversion = iconv_open (step->before->iconv_name,
+                                                    step->before->iconv_name);
+
+             /* On error, give up and assume input is invalid.  */
+             if (input_left > 0 && check_conversion != (iconv_t) -1)
+               {
+                  /* Assume iconv does not modify its input.  */
+                 char *check_input = input;
+                 size_t check_input_left = input_left;
+                  size_t check_output_left = input_left;
+                 char *check_output_buffer, *check_output;
+                  RECODE_OUTER outer = subtask->task->request->outer;
+
+                  if ((check_output = ALLOC (check_output_buffer, input_left, char)) != NULL)
+                    {
+                      size_t check_converted = iconv (check_conversion,
+                                                      &check_input, &check_input_left,
+                                                      &check_output, &check_output_left);
+
+                      if (check_converted != (size_t) -1)
+                        recode_error = RECODE_UNTRANSLATABLE;
+
+                      free (check_output_buffer);
+                    }
+               }
+
+             /* Invalid or untranslatable input.  Skip one byte.  */
+              /* FIXME: We cannot tell how many bytes to skip for
+                 untranslatable input.  The likely result is that we'll
+                 get an "invalid input" error on the next step. */
+             RETURN_IF_NOGO (recode_error, subtask);
               assert (input_left > 0);
               input++;
               input_left--;
diff --git a/tests/t80_error.py b/tests/t80_error.py

index 44c53e262ab7a002682185653d36d893e084dce6..aa0b2849d9667b31b973fefacccacb9c33b7542d 100644 (file)
--- a/tests/t80_error.py
+++ b/tests/t80_error.py
@@ -1,13 +1,23 @@
  # -*- coding: utf-8 -*-
  import common
-from common import setup_module, teardown_module, Recode, outer
+from common import setup_module, teardown_module, Recode, outer, outer_iconv
  
  class Test:
  
-    def test_1(self): # Ensure correct error code returned for invalid input
-        request = Recode.Request(outer) # FIXME: Does not work with iconv (outer_iconv): Debian bug #348909
+    def test_1(self): # Ensure correct error code returned for untranslatable input
+        request = Recode.Request(outer)
          request.scan('utf-8..latin1')
          task = Recode.Task(request)
          task.set_input("\303\241 \303\247  \316\261 \316\266")
          task.perform()
          assert(task.get_error() == Recode.UNTRANSLATABLE)
+
+    # FIXME: Does not work with iconv for abort_level > UNTRANSLATABLE: Debian bug #348909
+    def test_2(self): # Ensure correct error code returned for untranslatable input (with iconv)
+        request = Recode.Request(outer_iconv)
+        request.scan('utf-8..latin1')
+        task = Recode.Task(request)
+        task.set_input("\303\241 \303\247  \316\261 \316\266")
+        task.set_abort_level(Recode.UNTRANSLATABLE)
+        task.perform()
+        assert(task.get_error() == Recode.UNTRANSLATABLE)
author	Reuben Thomas <rrt@sc3d.org>
	Wed, 17 Jan 2018 22:43:10 +0000 (22:43 +0000)
committer	Reuben Thomas <rrt@sc3d.org>
	Tue, 23 Jan 2018 07:02:42 +0000 (07:02 +0000)
doc/recode.texi		patch | blob | history
src/iconv.c		patch | blob | history
tests/t80_error.py		patch | blob | history