From: Kevin McCarthy <kevin@8t8.us>
Date: Mon, 3 Dec 2018 05:39:55 +0000 (-0800)
Subject: Improve rfc2047_decode to deal with improperly split words.
X-Git-Tag: mutt-1-12-rel~199
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=26bba6f9f6d7d754b6d4731feb8b306c3db7adf1;p=mutt

Improve rfc2047_decode to deal with improperly split words.

Some non-compliant MUAs split rfc2047 words inside a multibyte
character.

Deal with this by accumulating decoded words sharing the same
character set, and then perform character conversion all at once.

It is not clear to me that the $ignore_linear_white_space option is
functional or properly coded, but I've gone through the effort of
trying to preserve its calls.
---

diff --git a/rfc2047.c b/rfc2047.c
index 5c3574aa..ff5dc977 100644
--- a/rfc2047.c
+++ b/rfc2047.c
@@ -623,13 +623,12 @@ void rfc2047_encode_adrlist (ADDRESS *addr, const char *tag)
   }
 }
 
-static int rfc2047_decode_word (BUFFER *d, const char *s)
+static int rfc2047_decode_word (BUFFER *d, const char *s, char **charset)
 {
   const char *pp, *pp1;
   char *pd, *d0;
   const char *t, *t1;
   int enc = 0, count = 0;
-  char *charset = NULL;
   int rv = -1;
 
   pd = d0 = safe_malloc (strlen (s));
@@ -654,7 +653,7 @@ static int rfc2047_decode_word (BUFFER *d, const char *s)
 	t = pp1;
         if ((t1 = memchr (pp, '*', t - pp)))
 	  t = t1;
-	charset = mutt_substrdup (pp, t);
+	*charset = mutt_substrdup (pp, t);
 	break;
       case 3:
 	if (toupper ((unsigned char) *pp) == 'Q')
@@ -711,13 +710,9 @@ static int rfc2047_decode_word (BUFFER *d, const char *s)
     }
   }
   
-  if (charset)
-    mutt_convert_string (&d0, charset, Charset, MUTT_ICONV_HOOK_FROM);
-  mutt_filter_unprintable (&d0);
   mutt_buffer_addstr (d, d0);
   rv = 0;
 error_out_0:
-  FREE (&charset);
   FREE (&d0);
   return rv;
 }
@@ -814,6 +809,26 @@ static void convert_and_add_text (BUFFER *d, const char *text, size_t len)
     mutt_buffer_addstr_n (d, text, len);
 }
 
+static void convert_and_add_word (BUFFER *d, BUFFER *word, char **charset)
+{
+  char *t;
+
+  t = safe_strdup (mutt_b2s (word));
+  if (!t)
+    goto out;
+
+  if (*charset)
+    mutt_convert_string (&t, *charset, Charset, MUTT_ICONV_HOOK_FROM);
+
+  mutt_filter_unprintable (&t);
+  mutt_buffer_addstr (d, t);
+  FREE (&t);
+
+out:
+  mutt_buffer_clear (word);
+  FREE (charset);  /* __FREE_CHECKED__ */
+}
+
 /* try to decode anything that looks like a valid RFC2047 encoded
  * header field, ignoring RFC822 parsing rules
  */
@@ -821,16 +836,17 @@ void rfc2047_decode (char **pd)
 {
   const char *s = *pd;
   const char *word_begin, *word_end;
+  char *word_charset = NULL, *accumulated_charset = NULL;
   size_t m, n;
-  int found_encoded = 0;
-  BUFFER *d;
+  int found_encoded = 0, rc;
+  BUFFER *d, *word, *accumulated_word;
 
   if (!s || !*s)
     return;
 
-  dprint (1, (debugfile, "rfcdecode on *%s*\n", s));
-
   d = mutt_buffer_pool_get ();
+  word = mutt_buffer_pool_get ();
+  accumulated_word = mutt_buffer_pool_get ();
 
   while ((word_begin = find_encoded_word (s, &word_end)) != NULL)
   {
@@ -839,41 +855,61 @@ void rfc2047_decode (char **pd)
     {
       n = (size_t) (word_begin - s);
 
-      if (option (OPTIGNORELWS))
+      if (!found_encoded || ((strspn (s, " \t\r\n") != n)))
       {
-        if (found_encoded && (m = lwslen (s, n)) != 0)
-        {
-          if (m != n)
-            mutt_buffer_addch (d, ' ');
-          n -= m, s += m;
-        }
+        convert_and_add_word (d, accumulated_word, &accumulated_charset);
 
-        if ((m = n - lwsrlen (s, n)) != 0)
+        if (option (OPTIGNORELWS))
         {
-          convert_and_add_text (d, s, m);
-          if (m != n)
-            mutt_buffer_addch (d, ' ');
+          if (found_encoded && (m = lwslen (s, n)) != 0)
+          {
+            if (m != n)
+              mutt_buffer_addch (d, ' ');
+            n -= m, s += m;
+          }
+
+          if ((m = n - lwsrlen (s, n)) != 0)
+          {
+            convert_and_add_text (d, s, m);
+            if (m != n)
+              mutt_buffer_addch (d, ' ');
+          }
         }
+        else
+          convert_and_add_text (d, s, n);
       }
-      /* If we haven't encountered an encoded word yet copy it all
-       * over.
-       *
-       * If we just finished an encoded word and the text is all
-       * spaces, we skip the spaces.
-       */
-      else if (!found_encoded || strspn (s, " \t\r\n") != n)
-        convert_and_add_text (d, s, n);
     }
 
-    if (rfc2047_decode_word (d, word_begin) == -1)
+    rc = rfc2047_decode_word (word, word_begin, &word_charset);
+
+    /* If the decode failed, or it's a different charset, write out
+     * the accumulated part. */
+    if ((rc != 0) ||
+        (ascii_strcasecmp (accumulated_charset, word_charset) != 0))
+    {
+      convert_and_add_word (d, accumulated_word, &accumulated_charset);
+    }
+
+    /* If the decode failed, write out the raw string. */
+    if (rc != 0)
+    {
+      mutt_buffer_addstr_n (d, word_begin, word_end - word_begin);
+    }
+    /* Otherwise save it to be compared to the next word's charset */
+    else
     {
-      /* could not decode word, fall back to displaying the raw string */
-      mutt_buffer_addstr (d, word_begin);
+      mutt_buffer_addstr (accumulated_word, mutt_b2s (word));
+      mutt_str_replace (&accumulated_charset, word_charset);
     }
+
+    mutt_buffer_clear (word);
+    FREE (&word_charset);
     found_encoded = 1;
     s = word_end;
   }
 
+  convert_and_add_word (d, accumulated_word, &accumulated_charset);
+
   if (*s)
   {
     if (found_encoded && option (OPTIGNORELWS))
@@ -890,7 +926,10 @@ void rfc2047_decode (char **pd)
   }
 
   mutt_str_replace (pd, mutt_b2s (d));
+
   mutt_buffer_pool_release (&d);
+  mutt_buffer_pool_release (&word);
+  mutt_buffer_pool_release (&accumulated_word);
 }
 
 void rfc2047_decode_adrlist (ADDRESS *a)