2 * Copyright (C) 1996-2000,2010 Michael R. Elkins <me@mutt.org>
3 * Copyright (C) 2000-2002 Edmund Grimley Evans <edmundo@rano.org>
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
35 /* If you are debugging this file, comment out the following line. */
/* Upper limit on the length of one encoded word; try_block only accepts
 * an encoding whose computed length does not exceed it. */
44 #define ENCWORD_LEN_MAX 75
/* Length of the shortest encoded word: the delimiters plus one character
 * each for charset, encoding and text. */
45 #define ENCWORD_LEN_MIN 9 /* strlen ("=?.?.?.?=") */
/* True for a NUL byte, a space or a tab. */
47 #define HSPACE(x) ((x) == '\0' || (x) == ' ' || (x) == '\t')
/* True when the two most significant bits of c are 10 (the form of a
 * UTF-8 continuation byte). */
49 #define CONTINUATION_BYTE(c) (((c) & 0xc0) == 0x80)
/* Defined elsewhere; handed to rfc2047_encode as its specials set when
 * _rfc2047_encode_string is called with encode_specials set. */
51 extern char RFC822Specials[];
/* Type of the encoder callbacks (b_encoder, q_encoder) that try_block
 * selects and encode_block invokes. */
53 typedef size_t (*encoder_t) (char *, ICONV_CONST char *, size_t,
/*
 * Convert the flen bytes at f from charset `from` to charset `to` with
 * iconv, collecting the output in a heap buffer.
 * Callers in this file test the result against (size_t)(-1) (or, in
 * rfc2047_encode, against zero) to detect failure.
 * NOTE(review): the converted data is presumably handed back through
 * *t and *tlen; confirm against the full function body.
 */
56 static size_t convert_string (ICONV_CONST char *f, size_t flen,
57 const char *from, const char *to,
58 char **t, size_t *tlen)
/* Open a conversion descriptor; (iconv_t)(-1) is the error value. */
65 cd = mutt_iconv_open (to, from, 0);
66 if (cd == (iconv_t)(-1))
69 ob = buf = safe_malloc (obl);
70 n = iconv (cd, &f, &flen, &ob, &obl);
/* The second iconv call, made with null input, lets iconv emit any
 * pending reset sequence; an error from either call is handled alike. */
71 if (n == (size_t)(-1) || iconv (cd, 0, 0, &ob, &obl) == (size_t)(-1))
/* Trim the buffer to the bytes actually written plus one extra byte. */
83 safe_realloc (&buf, ob - buf + 1);
/*
 * Try to convert *ps to Charset, assuming in turn each charset named in
 * the colon-separated AssumedCharset list.
 */
90 int convert_nonmime_string (char **ps)
/* c walks the list; c1 is the next ':' or NULL on the last entry, which
 * ends the loop. */
94 for (c = AssumedCharset; c; c = c1 ? c1 + 1 : 0)
100 size_t ulen = mutt_strlen (*ps);
106 c1 = strchr (c, ':');
107 n = c1 ? c1 - c : mutt_strlen (c);
/* Copy the current name (n bytes) into its own buffer of n + 1 bytes. */
110 fromcode = safe_malloc (n + 1);
111 strfcpy (fromcode, c, n + 1);
112 m = convert_string (u, ulen, fromcode, Charset, &s, &slen);
/* On success the old string is released. */
114 if (m != (size_t)(-1))
116 FREE (ps); /* __FREE_CHECKED__ */
/* NOTE(review): presumably the fallback when no assumed charset worked;
 * converts from the default charset, passing MUTT_ICONV_HOOK_FROM. */
121 mutt_convert_string (ps,
122 (const char *)mutt_get_default_charset (),
123 Charset, MUTT_ICONV_HOOK_FROM);
/*
 * Pick a target charset for the ulen bytes at u (encoded in fromcode)
 * from the list in charsets, by trial conversion to each candidate.
 * The chosen name is passed through mutt_canonical_charset before use.
 * NOTE(review): d and dlen presumably receive the converted data when
 * non-null (rfc2047_encode passes 0 for both); confirm in the full body.
 */
127 char *mutt_choose_charset (const char *fromcode, const char *charsets,
128 char *u, size_t ulen, char **d, size_t *dlen)
130 char canonical_buff[LONG_STRING];
131 char *e = 0, *tocode = 0;
132 size_t elen = 0, bestn = 0;
/* p is the start of the current entry; q, when non-null, is where it
 * ends, and the next entry starts one past it. */
135 for (p = charsets; p; p = q ? q + 1 : 0)
142 n = q ? q - p : strlen (p);
146 t = safe_malloc (n + 1);
/* Trial conversion into candidate t; (size_t)(-1) means it failed. */
150 n = convert_string (u, ulen, fromcode, t, &s, &slen);
151 if (n == (size_t)(-1))
/* Keep this candidate if none has been chosen yet or if its
 * convert_string result is smaller than the best so far. */
157 if (!tocode || n < bestn)
/* Replace the chosen name by its canonical spelling. */
186 mutt_canonical_charset (canonical_buff, sizeof (canonical_buff), tocode);
187 mutt_str_replace (&tocode, canonical_buff);
/*
 * Write a B-encoded (base64) word for the dlen bytes at d into s, tagged
 * with charset tocode.
 */
192 static size_t b_encoder (char *s, ICONV_CONST char *d, size_t dlen,
/* Header: "=?", the charset name, then "?B?". */
197 memcpy (s, "=?", 2), s += 2;
198 memcpy (s, tocode, strlen (tocode)), s += strlen (tocode);
199 memcpy (s, "?B?", 3), s += 3;
/* Symbols built from d[0] alone (tail of a single byte). */
206 *s++ = B64Chars[(*d >> 2) & 0x3f];
207 *s++ = B64Chars[(*d & 0x03) << 4];
/* Symbols built from d[0] and d[1] (tail of two bytes). */
214 *s++ = B64Chars[(*d >> 2) & 0x3f];
215 *s++ = B64Chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
216 *s++ = B64Chars[(d[1] & 0x0f) << 2];
/* A full group: three input bytes become four symbols. */
222 *s++ = B64Chars[(*d >> 2) & 0x3f];
223 *s++ = B64Chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
224 *s++ = B64Chars[((d[1] & 0x0f) << 2) | ((d[2] >> 6) & 0x03)];
225 *s++ = B64Chars[d[2] & 0x3f];
/* Trailer closing the encoded word. */
229 memcpy (s, "?=", 2), s += 2;
/*
 * Write a Q-encoded word for the dlen bytes at d into s, tagged with
 * charset tocode.
 */
233 static size_t q_encoder (char *s, ICONV_CONST char *d, size_t dlen,
/* Digits used for the hexadecimal form of escaped bytes. */
236 static const char hex[] = "0123456789ABCDEF";
/* Header: "=?", the charset name, then "?Q?". */
239 memcpy (s, "=?", 2), s += 2;
240 memcpy (s, tocode, strlen (tocode)), s += strlen (tocode);
241 memcpy (s, "?Q?", 3), s += 3;
244 unsigned char c = *d++;
/* Bytes outside 0x20..0x7e, '_' and anything in MimeSpecials get the
 * two upper-case hex digits below, high nibble first. */
247 else if (c >= 0x7f || c < 0x20 || c == '_' || strchr (MimeSpecials, c))
250 *s++ = hex[(c & 0xf0) >> 4];
251 *s++ = hex[c & 0x0f];
/* Trailer closing the encoded word. */
256 memcpy (s, "?=", 2), s += 2;
261 * Return 0 and set *encoder and *wlen if the data (d, dlen) could
262 * be converted to an encoded word of length *wlen using *encoder.
263 * Otherwise return an upper bound on the maximum length of the data
264 * which could be converted.
265 * The data is converted from fromcode (which must be stateless) to
266 * tocode, unless fromcode is 0, in which case the data is assumed to
267 * be already in tocode, which should be 8-bit and stateless.
/*
 * Decide whether (d, dlen) fits in one encoded word and, if it does,
 * which of b_encoder and q_encoder to use for it.
 */
269 static size_t try_block (ICONV_CONST char *d, size_t dlen,
270 const char *fromcode, const char *tocode,
271 encoder_t *encoder, size_t *wlen)
/* Scratch buffer of ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1 bytes; the
 * space offered to iconv below is further reduced by strlen (tocode). */
273 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
275 ICONV_CONST char *ib;
278 int count, len, len_b, len_q;
/* The descriptor is expected to open; this is asserted, not handled. */
282 cd = mutt_iconv_open (tocode, fromcode, 0);
283 assert (cd != (iconv_t)(-1));
284 ib = d, ibl = dlen, ob = buf1, obl = sizeof (buf1) - strlen (tocode);
285 if (iconv (cd, &ib, &ibl, &ob, &obl) == (size_t)(-1) ||
286 iconv (cd, 0, 0, &ob, &obl) == (size_t)(-1))
/* The only failure expected here is running out of output space. */
288 assert (errno == E2BIG);
/* Report how far the input got: dlen if all of it was consumed,
 * otherwise one more than the number of bytes consumed. */
291 return (ib - d == dlen) ? dlen : ib - d + 1;
/* NOTE(review): presumably the branch taken when fromcode is 0 and the
 * data is used unconverted.  It must fit the same budget; otherwise a
 * bound one byte above the budget is returned. */
297 if (dlen > sizeof (buf1) - strlen (tocode))
298 return sizeof (buf1) - strlen (tocode) + 1;
299 memcpy (buf1, d, dlen);
/* Look at every byte in buf1 for the condition under which Q encoding
 * needs an escape; count (used below) presumably tallies them. */
304 for (p = buf1; p < ob; p++)
306 unsigned char c = *p;
/* The test below relies on '?' being listed in MimeSpecials. */
307 assert (strchr (MimeSpecials, '?'));
308 if (c >= 0x7f || c < 0x20 || *p == '_' ||
309 (c != ' ' && strchr (MimeSpecials, *p)))
/* len is the overhead common to both encodings.
 * B: four characters for each group of up to three bytes.
 * Q: one character per byte plus two more for every counted byte. */
313 len = ENCWORD_LEN_MIN - 2 + strlen (tocode);
314 len_b = len + (((ob - buf1) + 2) / 3) * 4;
315 len_q = len + (ob - buf1) + 2 * count;
317 /* Apparently RFC 1468 says to use B encoding for iso-2022-jp. */
318 if (!ascii_strcasecmp (tocode, "ISO-2022-JP"))
/* Pushing len_q past the maximum rules Q encoding out. */
319 len_q = ENCWORD_LEN_MAX + 1;
/* B is chosen only when strictly shorter than Q and within the limit;
 * otherwise Q is chosen if it is within the limit. */
321 if (len_b < len_q && len_b <= ENCWORD_LEN_MAX)
323 *encoder = b_encoder;
327 else if (len_q <= ENCWORD_LEN_MAX)
329 *encoder = q_encoder;
338 * Encode the data (d, dlen) into s using the encoder.
339 * Return the length of the encoded word.
/*
 * Produce one encoded word in s from (d, dlen), using the given encoder
 * and the same conversion set-up as try_block.
 */
341 static size_t encode_block (char *s, char *d, size_t dlen,
342 const char *fromcode, const char *tocode,
/* Same scratch size and output budget as in try_block. */
345 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
347 ICONV_CONST char *ib;
349 size_t ibl, obl, n1, n2;
353 cd = mutt_iconv_open (tocode, fromcode, 0);
354 assert (cd != (iconv_t)(-1));
355 ib = d, ibl = dlen, ob = buf1, obl = sizeof (buf1) - strlen (tocode);
356 n1 = iconv (cd, &ib, &ibl, &ob, &obl);
357 n2 = iconv (cd, 0, 0, &ob, &obl);
/* Conversion is expected to succeed here; both results are asserted. */
358 assert (n1 != (size_t)(-1) && n2 != (size_t)(-1));
/* Encode the converted bytes held in buf1. */
360 return (*encoder) (s, buf1, ob - buf1, tocode);
/* NOTE(review): presumably the path for fromcode == 0, where the data
 * is encoded without conversion. */
363 return (*encoder) (s, d, dlen, tocode);
367 * Discover how much of the data (d, dlen) can be converted into
368 * a single encoded word. Return how much data can be converted,
369 * and set the length *wlen of the encoded word and *encoder.
370 * We start in column col, which limits the length of the word.
/*
 * Find a prefix length n of (d, dlen) that try_block accepts as a single
 * encoded word that also fits on the line starting at column col.
 */
372 static size_t choose_block (char *d, size_t dlen, int col,
373 const char *fromcode, const char *tocode,
374 encoder_t *encoder, size_t *wlen)
/* Set when the source charset is UTF-8 (compared case-insensitively). */
377 int utf8 = fromcode && !ascii_strcasecmp (fromcode, "utf-8");
383 nn = try_block (d, n, fromcode, tocode, encoder, wlen);
/* Accept n when try_block succeeded and the word fits after column col,
 * or when n is already down to one byte or less. */
384 if (!nn && (col + *wlen <= ENCWORD_LEN_MAX + 1 || n <= 1))
/* Otherwise shrink: one below try_block's bound if it gave one,
 * else one below the current n. */
386 n = (nn ? nn : n) - 1;
/* Test for d[n] being a continuation byte, so that a cut is not placed
 * inside a multi-byte character. */
389 while (n > 1 && CONTINUATION_BYTE(d[n]))
396 * Place the result of RFC-2047-encoding (d, dlen) into the dynamically
397 * allocated buffer (e, elen). The input data is in charset fromcode
398 * and is converted into a charset chosen from charsets.
399 * Return 1 if the conversion to UTF-8 failed, 2 if conversion from UTF-8
400 * failed, otherwise 0. If conversion failed, fromcode is assumed to be
401 * compatible with us-ascii and the original data is used.
402 * The input data is assumed to be a single line starting at column col;
403 * if col is non-zero, the preceding character was a space.
/*
 * RFC 2047 encoding of one header line: (d, dlen) in charset fromcode is
 * turned into the allocated result (e, elen), using a charset picked
 * from charsets; specials, when non-null, lists extra characters to
 * take into account when delimiting the region to encode.
 */
405 static int rfc2047_encode (ICONV_CONST char *d, size_t dlen, int col,
406 const char *fromcode, const char *charsets,
407 char **e, size_t *elen, char *specials)
411 size_t bufpos, buflen;
412 char *u = NULL, *t0, *t1, *t;
414 size_t ulen, r, n, wlen;
/* Intermediate charset in which the scanning below is done. */
418 char *icode = "utf-8";
420 /* Try to convert to UTF-8. */
421 if (convert_string (d, dlen, fromcode, icode, &u, &ulen))
/* Conversion failed: u is resized to dlen + 1 bytes and ulen set to
 * dlen. */
425 safe_realloc (&u, (ulen = dlen) + 1);
430 /* Find earliest and latest things we must encode. */
431 s0 = s1 = t0 = t1 = 0;
432 for (t = u; t < u + ulen; t++)
/* Part of the must-encode test: the two characters "=?" at the start of
 * the data or right after horizontal space. */
435 (*t == '=' && t[1] == '?' && (t == u || HSPACE(*(t-1)))))
/* Characters listed in specials are looked for separately;
 * NOTE(review): presumably their positions are kept in s0 / s1. */
440 else if (specials && *t && strchr (specials, *t))
447 /* If we have something to encode, include RFC822 specials */
448 if (t0 && s0 && s0 < t0)
450 if (t1 && s1 && s1 > t1)
455 /* No encoding is required. */
461 /* Choose target charset. */
/* The last two arguments (d, dlen) are passed as 0 here. */
465 if ((tocode1 = mutt_choose_charset (icode, charsets, u, ulen, 0, 0)))
471 /* Hack to avoid labelling 8-bit data as us-ascii. */
472 if (!icode && mutt_is_us_ascii (tocode))
473 tocode = "unknown-8bit";
475 /* Adjust t0 for maximum length of line. */
476 t = u + (ENCWORD_LEN_MAX + 1) - col - ENCWORD_LEN_MIN;
481 /* Adjust t0 until we can encode a character after a space. */
484 if (!HSPACE(*(t0-1)))
488 while (t < u + ulen && CONTINUATION_BYTE(*t))
490 if (!try_block (t0, t - t0, icode, tocode, &encoder, &wlen) &&
491 col + (t0 - u) + wlen <= ENCWORD_LEN_MAX + 1)
495 /* Adjust t1 until we can encode a character before a space. */
496 for (; t1 < u + ulen; t1++)
502 while (CONTINUATION_BYTE(*t))
504 if (!try_block (t, t1 - t, icode, tocode, &encoder, &wlen) &&
505 1 + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
509 /* We shall encode the region [t0,t1). */
511 /* Initialise the output buffer with the us-ascii prefix. */
513 buf = safe_malloc (buflen);
/* Everything before t0 is copied through unchanged. */
515 memcpy (buf, u, t0 - u);
522 /* Find how much we can encode. */
523 n = choose_block (t, t1 - t, col, icode, tocode, &encoder, &wlen);
526 /* See if we can fit the us-ascii suffix, too. */
527 if (col + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
531 while (CONTINUATION_BYTE(t[n]))
536 /* This should only happen in the really stupid case where the
537 only word that needs encoding is one character long, but
538 there is too much us-ascii stuff after it to use a single
539 encoded word. We add the next word to the encoded region
541 assert (t1 < u + ulen);
542 for (t1++; t1 < u + ulen && !HSPACE(*t1); t1++)
546 n = choose_block (t, n, col, icode, tocode, &encoder, &wlen);
549 /* Add to output buffer. */
/* Separator written after an encoded word that is followed by another:
 * a newline and a tab. */
550 #define LINEBREAK "\n\t"
/* Grow the buffer when this word plus the separator would not fit. */
551 if (bufpos + wlen + strlen (LINEBREAK) > buflen)
553 buflen = bufpos + wlen + strlen (LINEBREAK);
554 safe_realloc (&buf, buflen);
556 r = encode_block (buf + bufpos, t, n, icode, tocode, encoder);
559 memcpy (buf + bufpos, LINEBREAK, strlen (LINEBREAK));
560 bufpos += strlen (LINEBREAK);
568 /* Add last encoded word and us-ascii suffix to buffer. */
/* Final size: what is already written, the last word and the untouched
 * tail after t1; one byte more than that is allocated. */
569 buflen = bufpos + wlen + (u + ulen - t1);
570 safe_realloc (&buf, buflen + 1);
571 r = encode_block (buf + bufpos, t, t1 - t, icode, tocode, encoder);
574 memcpy (buf + bufpos, t1, u + ulen - t1);
/*
 * RFC 2047-encode the string *pd, treated as text in Charset that starts
 * at column col.  With encode_specials set, RFC822Specials is passed to
 * rfc2047_encode as the specials set; otherwise NULL is passed.
 */
586 void _rfc2047_encode_string (char **pd, int encode_specials, int col)
/* Guard on Charset and *pd both being set. */
592 if (!Charset || !*pd)
/* Candidate target charsets are taken from SendCharset. */
595 charsets = SendCharset;
599 rfc2047_encode (*pd, strlen (*pd), col,
600 Charset, charsets, &e, &elen,
601 encode_specials ? RFC822Specials : NULL);
/* The old string is released; NOTE(review): presumably *pd is then
 * pointed at the encoded result e. */
603 FREE (pd); /* __FREE_CHECKED__ */
/*
 * Encode the textual parts of an address list with
 * _rfc2047_encode_string, with special-character handling enabled.
 * tag is the header name, used only to work out the starting column.
 */
607 void rfc2047_encode_adrlist (ADDRESS *addr, const char *tag)
/* Starting column: length of the tag plus two, or 32 when there is no
 * tag. */
610 int col = tag ? strlen (tag) + 2 : 32;
615 _rfc2047_encode_string (&ptr->personal, 1, col);
/* For an entry with both group and mailbox set, the mailbox is encoded
 * instead. */
616 else if (ptr->group && ptr->mailbox)
617 _rfc2047_encode_string (&ptr->mailbox, 1, col);
620 _rfc2047_encode_string (&ptr->val, 1, col);
/*
 * Encode every address list of envelope e, each with its header name as
 * the tag, then the x_label and subject strings.
 */
626 void rfc2047_encode_envelope (ENVELOPE *e)
628 rfc2047_encode_adrlist (e->from, "From");
629 rfc2047_encode_adrlist (e->to, "To");
630 rfc2047_encode_adrlist (e->cc, "Cc");
631 rfc2047_encode_adrlist (e->bcc, "Bcc");
632 rfc2047_encode_adrlist (e->reply_to, "Reply-To");
633 rfc2047_encode_adrlist (e->mail_followup_to, "Mail-Followup-To");
634 rfc2047_encode_adrlist (e->sender, "Sender");
/* Plain strings go through rfc2047_encode_string (declared elsewhere). */
635 rfc2047_encode_string (&e->x_label);
636 rfc2047_encode_string (&e->subject);
/*
 * Decode the encoded word starting at s: the decoded bytes are appended
 * to buffer d and a copy of the charset name is stored in *charset.
 */
639 static int rfc2047_decode_word (BUFFER *d, const char *s, char **charset)
641 const char *pp, *pp1;
644 int enc = 0, count = 0;
/* Work buffer of strlen (s) bytes for the decoded text. */
647 pd = d0 = safe_malloc (strlen (s));
/* Walk the word one '?'-delimited field at a time: pp is the start of
 * the field and pp1 the '?' that ends it. */
649 for (pp = s; (pp1 = strchr (pp, '?')); pp = pp1 + 1)
653 /* hack for non-compliant MUAs that allow unquoted question marks in encoded-text */
/* Keep moving pp1 to the next '?' until one is followed by '=' or no
 * '?' is left. */
656 while (pp1 && *(pp1 + 1) != '=')
657 pp1 = strchr(pp1 + 1, '?');
665 /* ignore language specification a la RFC 2231 */
/* Look for a '*' inside the charset field. */
667 if ((t1 = memchr (pp, '*', t - pp)))
669 *charset = mutt_substrdup (pp, t);
/* The encoding letter is compared case-insensitively. */
672 if (toupper ((unsigned char) *pp) == 'Q')
673 enc = ENCQUOTEDPRINTABLE;
674 else if (toupper ((unsigned char) *pp) == 'B')
680 if (enc == ENCQUOTEDPRINTABLE)
682 for (; pp < pp1; pp++)
/* '=' followed by two characters that are both 7-bit and valid hex
 * digits yields the byte they spell. */
686 else if (*pp == '=' &&
687 (!(pp[1] & ~127) && hexval(pp[1]) != -1) &&
688 (!(pp[2] & ~127) && hexval(pp[2]) != -1))
690 *pd++ = (hexval(pp[1]) << 4) | hexval(pp[2]);
698 else if (enc == ENCBASE64)
702 for (; pp < pp1; pp++)
/* Detect characters that are not 7-bit or for which base64val
 * returns -1. */
706 if ((*pp & ~127) || (c = base64val(*pp)) == -1)
/* Output byte: bits carried in b combined with the high bits of c. */
711 *pd++ = b | (c >> k);
/* Hand the decoded text to the caller's buffer. */
726 mutt_buffer_addstr (d, d0);
734 * Find the start and end of the first encoded word in the string.
735 * We use the grammar in section 2 of RFC 2047, but the "encoding"
736 * must be B or Q. Also, we don't require the encoded word to be
737 * separated by linear-white-space (section 5(1)).
/*
 * Search s for the first encoded word.  rfc2047_decode uses the return
 * value as the start of the word and *x as its end.
 */
739 static const char *find_encoded_word (const char *s, const char **x)
/* Every occurrence of "=?" is a candidate start. */
744 while ((p = strstr (q, "=?")))
/* Charset part: printable, non-space ASCII other than the listed
 * delimiter characters. */
747 0x20 < *q && *q < 0x7f && !strchr ("()<>@,;:\"/[]?.=", *q);
/* Next must come '?', an encoding letter out of BbQq, and another '?'. */
750 if (q[0] != '?' || q[1] == '\0' || !strchr ("BbQq", q[1]) || q[2] != '?')
752 /* non-strict check since many MUAs will not encode spaces and question marks */
/* Encoded text: bytes in 0x20..0x7e, stopping at the first '?' that is
 * followed by '='. */
753 for (q = q + 3; 0x20 <= *q && *q < 0x7f && (*q != '?' || q[1] != '='); q++)
/* Check that the scan stopped on the closing '?' '=' pair. */
755 if (q[0] != '?' || q[1] != '=')
768 /* return length of linear-white-space */
/* Measures the whitespace at the start of the n bytes at s. */
769 static size_t lwslen (const char *s, size_t n)
/* Look for the first byte that is not a space, tab, CR or LF. */
777 for (; p < s + n; p++)
778 if (!strchr (" \t\r\n", *p))
/* Number of bytes in front of p. */
780 len = (size_t)(p - s);
783 if (strchr ("\r\n", *(p-1))) /* LWS doesn't end with CRLF */
788 /* return length of linear-white-space : reverse */
/* Measures the whitespace at the end of the n bytes at s. */
789 static size_t lwsrlen (const char *s, size_t n)
/* p starts on the last of the n bytes. */
791 const char *p = s + n - 1;
797 if (strchr ("\r\n", *p)) /* LWS doesn't end with CRLF */
/* Look for a byte that is not a space, tab, CR or LF. */
801 if (!strchr (" \t\r\n", *p))
/* Number of bytes that follow p within the n bytes. */
803 len = (size_t)(s + n - 1 - p);
/*
 * Append len bytes of un-encoded text to d, either after running them
 * through convert_nonmime_string or unchanged.
 * NOTE(review): the condition choosing between the two paths is not
 * documented here; confirm in the full body.
 */
809 static void convert_and_add_text (BUFFER *d, const char *text, size_t len)
/* Conversion path: work on a private copy of len + 1 bytes. */
815 t = safe_malloc (len + 1);
816 strfcpy (t, text, len + 1);
817 convert_nonmime_string (&t);
818 mutt_buffer_addstr (d, t);
/* Raw path: the bytes are appended as they are. */
822 mutt_buffer_addstr_n (d, text, len);
/*
 * Append the decoded bytes held in word to d, converted from *charset to
 * Charset, then reset word and free *charset.
 */
825 static void convert_and_add_word (BUFFER *d, BUFFER *word, char **charset)
/* Work on a private copy of the buffer contents. */
829 t = safe_strdup (mutt_b2s (word));
834 mutt_convert_string (&t, *charset, Charset, MUTT_ICONV_HOOK_FROM);
/* The converted text is filtered before being appended. */
836 mutt_filter_unprintable (&t);
837 mutt_buffer_addstr (d, t);
/* Leave word empty and the charset freed for the next run of words. */
841 mutt_buffer_clear (word);
842 FREE (charset); /* __FREE_CHECKED__ */
/*
 * rfc2047_decode replaces *pd with its decoded form.  Encoded words
 * located by find_encoded_word are decoded with rfc2047_decode_word,
 * gathered in accumulated_word and flushed through convert_and_add_word;
 * the text between them goes through convert_and_add_text.
 */
845 /* try to decode anything that looks like a valid RFC2047 encoded
846 * header field, ignoring RFC822 parsing rules
848 void rfc2047_decode (char **pd)
851 const char *word_begin, *word_end;
852 char *word_charset = NULL, *accumulated_charset = NULL;
854 int found_encoded = 0, rc;
855 BUFFER *d, *word, *accumulated_word;
860 d = mutt_buffer_pool_get ();
861 word = mutt_buffer_pool_get ();
862 accumulated_word = mutt_buffer_pool_get ();
864 while ((word_begin = find_encoded_word (s, &word_end)) != NULL)
866 /* If there is text before the encoded word */
869 n = (size_t) (word_begin - s);
/* The text is skipped only when an encoded word has already been seen
 * and all n bytes are whitespace. */
871 if (!found_encoded || ((strspn (s, " \t\r\n") != n)))
/* Flush pending decoded words before emitting plain text. */
873 convert_and_add_word (d, accumulated_word, &accumulated_charset);
/* With OPTIGNORELWS set, whitespace at the start of the text (after an
 * encoded word) is replaced by one space, and the text is added without
 * its trailing whitespace, followed by a space. */
875 if (option (OPTIGNORELWS))
877 if (found_encoded && (m = lwslen (s, n)) != 0)
880 mutt_buffer_addch (d, ' ');
884 if ((m = n - lwsrlen (s, n)) != 0)
886 convert_and_add_text (d, s, m);
888 mutt_buffer_addch (d, ' ');
/* Otherwise all n bytes are added. */
892 convert_and_add_text (d, s, n);
896 rc = rfc2047_decode_word (word, word_begin, &word_charset);
898 /* If the decode failed, or it's a different charset, write out
899 * the accumulated part. */
901 (ascii_strcasecmp (accumulated_charset, word_charset) != 0))
903 convert_and_add_word (d, accumulated_word, &accumulated_charset);
906 /* If the decode failed, write out the raw string. */
909 mutt_buffer_addstr_n (d, word_begin, word_end - word_begin);
911 /* Otherwise save it to be compared to the next word's charset */
914 mutt_buffer_addstr (accumulated_word, mutt_b2s (word));
915 mutt_str_replace (&accumulated_charset, word_charset);
/* Reset the per-word state. */
918 mutt_buffer_clear (word);
919 FREE (&word_charset);
/* Flush whatever decoded words are still pending. */
924 convert_and_add_word (d, accumulated_word, &accumulated_charset);
/* Text remaining after the last encoded word gets the same
 * OPTIGNORELWS treatment for whitespace at its start. */
928 if (found_encoded && option (OPTIGNORELWS))
931 if ((m = lwslen (s, n)) != 0)
934 mutt_buffer_addch (d, ' ');
938 convert_and_add_text (d, s, mutt_strlen (s));
/* Install the result and hand the three buffers back to the pool. */
941 mutt_str_replace (pd, mutt_b2s (d));
943 mutt_buffer_pool_release (&d);
944 mutt_buffer_pool_release (&word);
945 mutt_buffer_pool_release (&accumulated_word);
/*
 * Run rfc2047_decode over the textual fields of an address, skipping
 * fields that do not need it.
 */
948 void rfc2047_decode_adrlist (ADDRESS *a)
/* personal is decoded when it contains "=?", or when the alternative
 * that follows the || holds. */
952 if (a->personal && ((strstr (a->personal, "=?") != NULL) ||
954 rfc2047_decode (&a->personal);
/* The mailbox of a group entry is decoded only when it contains "=?". */
955 else if (a->group && a->mailbox && (strstr (a->mailbox, "=?") != NULL))
956 rfc2047_decode (&a->mailbox);
/* Likewise for val, when set. */
958 if (a->val && strstr (a->val, "=?") != NULL)
959 rfc2047_decode (&a->val);
/*
 * Decode every address list of envelope e, then its x_label and subject
 * strings.
 */
965 void rfc2047_decode_envelope (ENVELOPE *e)
967 rfc2047_decode_adrlist (e->from);
968 rfc2047_decode_adrlist (e->to);
969 rfc2047_decode_adrlist (e->cc);
970 rfc2047_decode_adrlist (e->bcc);
971 rfc2047_decode_adrlist (e->reply_to);
972 rfc2047_decode_adrlist (e->mail_followup_to);
/* return_path is decoded here although rfc2047_encode_envelope has no
 * matching call for it. */
973 rfc2047_decode_adrlist (e->return_path);
974 rfc2047_decode_adrlist (e->sender);
975 rfc2047_decode (&e->x_label);
976 rfc2047_decode (&e->subject);