From 10abd888afd65f128f3b4731ad17eb23700017f9 Mon Sep 17 00:00:00 2001 From: Thomas Roessler Date: Mon, 29 May 2000 22:34:12 +0000 Subject: [PATCH] Edmund's latest RFC2047 encoding update. --- rfc2047.c | 549 ++++++++++++++++++++++++++++++++++++------------------ rfc2047.h | 2 +- send.c | 4 +- sendlib.c | 23 ++- 4 files changed, 385 insertions(+), 193 deletions(-) diff --git a/rfc2047.c b/rfc2047.c index e44700b9..9e6880f2 100644 --- a/rfc2047.c +++ b/rfc2047.c @@ -1,5 +1,6 @@ /* * Copyright (C) 1996-2000 Michael R. Elkins + * Copyright (C) 2000 Edmund Grimley Evans * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,247 +23,437 @@ #include "rfc2047.h" #include +#include +#include +#include +#include #include -typedef void encode_t (char *, size_t, const unsigned char *, const char *); +/* If you are debugging this file, comment out the following line. */ +/*#define NDEBUG*/ +#ifdef NDEBUG +#define assert(x) +#else +#include +#endif -static void q_encode_string (char *d, - size_t dlen, - const unsigned char *s, - const char *send_charset) -{ - char charset[SHORT_STRING]; - size_t cslen, wordlen; - char *wptr = d; - const unsigned char *t; - int hibit = 0; +#define ENCWORD_LEN_MAX 75 +#define ENCWORD_OVERHEAD 7 /* strlen ("=??X??=") */ - for(t = s; *t && !hibit; t++) - if(*t & 0x80) hibit = 1; +#define HSPACE(x) ((x) == ' ' || (x) == '\t') - if(hibit || !mutt_strcasecmp (send_charset, "iso-2022-jp")) +typedef size_t (*encoder_t) (char *, const char *, size_t, + const char *); + +static size_t convert_string (const char *f, size_t flen, + const char *from, const char *to, + char **t, size_t *tlen) +{ + iconv_t cd; + char *buf, *ob, *x; + size_t obl, n; + int e; + + cd = iconv_open (to, from); + if (cd == (iconv_t)-1) + return -1; + obl = MB_LEN_MAX * flen; + ob = buf = safe_malloc (obl); + n = iconv (cd, &f, &flen, &ob, &obl); + if (n == -1 || iconv (cd, 0, 0, &ob, &obl) == -1) { - snprintf (charset, sizeof (charset), "=?%s?Q?", - mutt_strcasecmp ("us-ascii", send_charset) == 0 ? "unknown-8bit" : NONULL(send_charset)); + e = errno; + free (buf); + iconv_close (cd); + errno = e; + return -1; } - else - strfcpy(charset, "=?us-ascii?Q?", sizeof(charset)); + x = realloc (buf, ob - buf); + *t = x ? x : buf; + *tlen = ob - buf; + iconv_close (cd); + return n; +} + +static char *choose_charset (const char *charsets, char *u, size_t ulen) +{ + char *tocode = 0; + size_t bestn = 0; + const char *p, *q; - cslen = mutt_strlen (charset); + for (p = charsets; p; p = q ? q + 1 : 0) + { + char *s, *t; + size_t slen, n; - strcpy (wptr, charset); - wptr += cslen; - wordlen = cslen; - dlen -= cslen; + q = strchr (p, ':'); - dlen -= 3; /* save room for the word terminator */ + n = q ? q - p : strlen (p); - while (*s && dlen > 0) - { - if (wordlen >= 72) + if (!n || + n > (ENCWORD_LEN_MAX - ENCWORD_OVERHEAD - ((MB_LEN_MAX + 2) / 3) * 4)) + continue; + + t = safe_malloc (n + 1); + memcpy (t, p, n), t[n] = '\0'; + n = convert_string (u, ulen, "UTF-8", t, &s, &slen); + if (n == (size_t)(-1)) + continue; + free (s); + if (!tocode || n < bestn) { - if (dlen < 4 + cslen) + free (tocode), tocode = t, bestn = n; + if (!bestn) break; - - strcpy (wptr, "?=\n "); - wptr += 4; - dlen -= 4; - strcpy (wptr, charset); - wptr += cslen; - wordlen = cslen; - dlen -= cslen; } + else + free (t); + } + return tocode; +} - if (*s == ' ') +static size_t b_encoder (char *s, const char *d, size_t dlen, + const char *tocode) +{ + char *s0 = s; + + memcpy (s, "=?", 2), s += 2; + memcpy (s, tocode, strlen (tocode)), s += strlen (tocode); + memcpy (s, "?B?", 3), s += 3; + for (;;) + { + if (!dlen) + break; + else if (dlen == 1) { - *wptr++ = '_'; - wordlen++; - dlen--; + *s++ = B64Chars[(*d >> 2) & 0x3f]; + *s++ = B64Chars[(*d & 0x03) << 4]; + *s++ = '='; + *s++ = '='; + break; } - else if ((*s & 0x80) || *s == '\t' || *s == '_' || strchr (MimeSpecials, *s)) + else if (dlen == 2) { - if (wordlen >= 70) - { - if (dlen < 4 + cslen) - break; - - strcpy (wptr, "?=\n "); - wptr += 4; - dlen -= 4; - - strcpy (wptr, charset); - wptr += cslen; - wordlen = cslen; - dlen -= cslen; - } - - if (dlen < 3) - break; - sprintf (wptr, "=%02X", *s); - wptr += 3; - wordlen += 3; - dlen -= 3; + *s++ = B64Chars[(*d >> 2) & 0x3f]; + *s++ = B64Chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)]; + *s++ = B64Chars[(d[1] & 0x0f) << 2]; + *s++ = '='; + break; } else { - *wptr++ = *s; - wordlen++; - dlen--; + *s++ = B64Chars[(*d >> 2) & 0x3f]; + *s++ = B64Chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)]; + *s++ = B64Chars[((d[1] & 0x0f) << 2) | ((d[2] >> 6) & 0x03)]; + *s++ = B64Chars[d[2] & 0x3f]; + d += 3, dlen -= 3; } - s++; } + memcpy (s, "?=", 2), s += 2; + return s - s0; +} - strcpy (wptr, "?="); +static size_t q_encoder (char *s, const char *d, size_t dlen, + const char *tocode) +{ + char hex[] = "0123456789ABCDEF"; + char *s0 = s; + + memcpy (s, "=?", 2), s += 2; + memcpy (s, tocode, strlen (tocode)), s += strlen (tocode); + memcpy (s, "?Q?", 3), s += 3; + while (dlen--) + { + unsigned char c = *d++; + if (c >= 0x7f || c < 0x20 || c == '_' || strchr (MimeSpecials, c)) + { + *s++ = '='; + *s++ = hex[(c & 0xf0) >> 4]; + *s++ = hex[c & 0x0f]; + } + else if (c == ' ') + *s++ = '_'; + else + *s++ = c; + } + memcpy (s, "?=", 2), s += 2; + return s - s0; } -static void b_encode_string (char *d, size_t dlen, - const unsigned char *s, - const char *send_charset) +/* + * Return 0 if and set *encoder and *wlen if the data (d, dlen) could + * be converted to an encoded word of length *wlen using *encoder. + * Otherwise return an upper bound on the maximum length of the data + * which could be converted. + */ +static size_t try_block (const char *d, size_t dlen, const char *tocode, + encoder_t *encoder, size_t *wlen) { - char charset[SHORT_STRING]; - char *wptr = d; - int cslen; - int wordlen; + char buf1[ENCWORD_LEN_MAX - ENCWORD_OVERHEAD]; + iconv_t cd; + const char *ib; + char *ob, *p; + size_t ibl, obl; + int count, len, len_b, len_q; + + cd = iconv_open (tocode, "UTF-8"); + assert (cd != (iconv_t)(-1)); + ib = d, ibl = dlen, ob = buf1, obl = sizeof (buf1) - strlen (tocode); + if (iconv (cd, &ib, &ibl, &ob, &obl) == (size_t)(-1) || + iconv (cd, 0, 0, &ob, &obl) == (size_t)(-1)) + { + assert (errno == E2BIG); + iconv_close (cd); + assert (ib > d); + return (ib - d == dlen) ? dlen : ib - d + 1; + } + iconv_close (cd); - snprintf (charset, sizeof (charset), "=?%s?B?", NONULL(send_charset)); - cslen = mutt_strlen (charset); - strcpy (wptr, charset); - wptr += cslen; - wordlen = cslen; - dlen -= cslen; + count = 0; + for (p = buf1; p < ob; p++) + { + unsigned char c = *p; + assert (strchr (MimeSpecials, '?')); + if (c >= 0x7f || c < 0x20 || *p == '_' || strchr (MimeSpecials, *p)) + ++count; + } - dlen -= 3; /* save room for the word terminator */ + len = strlen (tocode) + ENCWORD_OVERHEAD; + len_b = len + (((ob - buf1) + 2) / 3) * 4; + len_q = len + (ob - buf1) + 2 * count; - while (*s && dlen >= 4) + /* Apparently RFC 1468 says to use B encoding for iso-2022-jp. */ + if (!strcasecmp (tocode, "ISO-2022-JP")) + len_q = ENCWORD_LEN_MAX + 1; + + if (len_b < len_q && len_b <= ENCWORD_LEN_MAX) { - if (wordlen >= 71) - { - if (dlen < 4 + cslen) - break; + *encoder = b_encoder; + *wlen = len_b; + return 0; + } + else if (len_q <= ENCWORD_LEN_MAX) + { + *encoder = q_encoder; + *wlen = len_q; + return 0; + } + else + return dlen; +} + +/* + * Encode the data (d, dlen) into s using the encoder. + * Return the length of the encoded word. + */ +static size_t encode_block (char *s, char *d, size_t dlen, + const char *tocode, encoder_t encoder) +{ + char buf1[ENCWORD_LEN_MAX - ENCWORD_OVERHEAD]; + iconv_t cd; + const char *ib; + char *ob; + size_t ibl, obl, n1, n2; + + cd = iconv_open (tocode, "UTF-8"); + assert (cd != (iconv_t)(-1)); + ib = d, ibl = dlen, ob = buf1, obl = sizeof (buf1) - strlen (tocode); + n1 = iconv (cd, &ib, &ibl, &ob, &obl); + n2 = iconv (cd, 0, 0, &ob, &obl); + assert (n1 != (size_t)(-1) && n2 != (size_t)(-1)); + iconv_close (cd); + return (*encoder) (s, buf1, ob - buf1, tocode); +} - strcpy (wptr, "?=\n "); - wptr += 4; - dlen -= 4; +/* + * Discover how much of the data (d, dlen) can be converted into + * a single encoded word. Return how much data can be converted, + * and set the length *wlen of the encoded word and *encoder. + */ +static size_t choose_block (char *d, size_t dlen, const char *tocode, + size_t *wlen, encoder_t *encoder) +{ + size_t n, nn; - strcpy (wptr, charset); - wptr += cslen; - wordlen = cslen; - dlen -= cslen; - } + n = dlen; + for (;;) + { + assert (n); + nn = try_block (d, n, tocode, encoder, wlen); + if (!nn) + break; + for (n = nn - 1; (d[n] & 0xc0) == 0x80; n--) + assert (n); + } + return n; +} - *wptr++ = B64Chars[ (*s >> 2) & 0x3f ]; - *wptr++ = B64Chars[ ((*s & 0x3) << 4) | ((*(s+1) >> 4) & 0xf) ]; - s++; - if (*s) - { - *wptr++ = B64Chars[ ((*s & 0xf) << 2) | ((*(s+1) >> 6) & 0x3) ]; - s++; - if (*s) - { - *wptr++ = B64Chars[ *s & 0x3f ]; - s++; - } - else - *wptr++ = '='; - } +/* + * Place the result of RFC-2048-encoding (d, dlen) into the dynamically + * allocated buffer (e, elen). The input data is in charset fromcode + * and is converted into a charset chosen from charsets. + * Return 1 if the input data is invalid, 2 if no conversion is possible, + * otherwise 0 on success. + */ +static int rfc2047_encode (const char *d, size_t dlen, + const char *fromcode, const char *charsets, + char **e, size_t *elen) +{ + char *buf = 0; + size_t bufpos = 0, buflen = 0; + char *u0, *u; + size_t ulen; + char *tocode; + int prev; + + /* Convert to UTF-8. */ + if (convert_string (d, dlen, fromcode, "UTF-8", &u, &ulen)) + return 1; + u0 = u; + + /* Choose target charset. */ + tocode = choose_charset (charsets, u, ulen); + if (!tocode) + { + free (u); + return 2; + } + + for (prev = 0; ulen; prev = 1) { + char *t; + size_t n, wlen, r; + encoder_t encoder; + + /* Decide where to start encoding. */ + if (prev && ulen && !HSPACE (*u)) + t = u; else { - *wptr++ = '='; - *wptr++ = '='; + /* Look for a non-us-ascii chararcter or "=?". */ + for (t = u; t < u + ulen - 1; t++) + if ((*t & 0x80) || (*t == '=' && t[1] == '?')) + break; + if (t == u + ulen - 1 && !(*t & 0x80)) + break; + + /* Find start of that word. */ + while (t > u && !HSPACE(*(t-1))) + --t; + if (prev) { + /* Include preceding characters if they are all spaces. */ + const char *x; + for (x = u; x < t && HSPACE(*x); x++) + ; + if (x >= t) + t = u; + } } - wordlen += 4; - dlen -= 4; + /* Convert some data and add encoded word to buffer. */ + n = choose_block (t, (u + ulen) - t, tocode, &wlen, &encoder); + buflen = bufpos + (t == u && prev) + (t - u) + wlen; + safe_realloc ((void **) &buf, buflen); + if (t == u && prev) + buf[bufpos++] = ' '; + memcpy (buf + bufpos, u, t - u); + bufpos += t - u; + r = encode_block (buf + bufpos, t, n, tocode, encoder); + assert (r == wlen); + bufpos += wlen; + n += t - u; + u += n; + ulen -= n; } - strcpy (wptr, "?="); + /* Add remaining us-ascii characters to buffer. */ + buflen = bufpos + ulen; + safe_realloc ((void **) &buf, buflen); + memcpy (buf + bufpos, u, ulen); + + free (tocode); + free (u0); + *e = buf; + *elen = buflen; + return 0; } -void rfc2047_encode_string (char *d, size_t dlen, const unsigned char *s) +#define MAX (ENCWORD_LEN_MAX + 1) + +static char *rfc2047_fold_line (char *e, size_t elen) { - int count = 0; - int len; - const unsigned char *p = s; - encode_t *encoder; - char send_charset[SHORT_STRING]; - char *scratch; - - mutt_get_send_charset(send_charset, sizeof(send_charset), NULL, 0); - - /* First check to see if there are any 8-bit characters */ - for (; *p; p++) + char *line, *p, *f; + int col = MAX; + + p = line = safe_malloc (elen * 2); /* more than enough */ + + while (elen) { - if (*p & 0x80) - count++; - else if (*p == '=' && *(p+1) == '?') + if (elen > 2 && e[1] == '=' && e[2] == '?' && HSPACE(*e)) { - count += 2; - p++; + again: + if (col + elen > MAX) + { + if (col >= MAX) + f = e; + else + for (f = e + MAX - col; !HSPACE(*f); f--) + ; + if (e == f) + { + if (col) + { + *p++ = '\n', col = 0; + goto again; + } + for (f = e + MAX; f < e + elen && !HSPACE(*f); f++) + ; + } + memcpy (p, e, f - e), p += f - e; + elen -= f - e, e = f; + if (elen) + *p++ = '\n', col = 0; + continue; + } } + *p++ = *e++, elen--, col++; } - if (!count) - { - strfcpy (d, (const char *)s, dlen); - return; - } + *p++ = '\0'; + safe_realloc ((void **) &line, p - line); + return line; +} - if (mutt_strcasecmp("us-ascii", send_charset) == 0 || - mutt_strncasecmp("iso-8859", send_charset, 8) == 0) - encoder = q_encode_string; - else - { - /* figure out which encoding generates the most compact representation */ - len = mutt_strlen ((char *) s); - if ((count * 2) + len <= (4 * len) / 3) - encoder = q_encode_string; - else - encoder = b_encode_string; - } +void rfc2047_encode_string (char **pd) +{ + char *e; + size_t elen; + char *charsets; - /* Hack to pull the Re: and Fwd: out of the encoded word for better - handling by agents which do not support RFC2047. */ - if (!mutt_strncasecmp ("re: ", (char *) s, 4)) - { - strncpy (d, (char *) s, 4); - d += 4; - dlen -= 4; - s += 4; - } - else if (!mutt_strncasecmp ("fwd: ", (char *) s, 5)) + charsets = SendCharset; + if (!charsets || !*charsets) + charsets = Charset; + if (!charsets || !*charsets) + charsets = "UTF-8"; + + if (!rfc2047_encode (*pd, strlen (*pd), Charset, charsets, &e, &elen)) { - strncpy (d, (char *) s, 5); - d += 5; - dlen -= 5; - s += 5; + free (*pd); + *pd = rfc2047_fold_line (e, elen); + free (e); } - - scratch = safe_strdup ((const char *) s); - if (*send_charset && mutt_strcasecmp("us-ascii", send_charset)) - mutt_convert_string (&scratch, Charset, send_charset); - - (*encoder) (d, dlen, (unsigned char *) scratch, send_charset); - safe_free ((void **) &scratch); } void rfc2047_encode_adrlist (ADDRESS *addr) { ADDRESS *ptr = addr; - char buffer[STRING]; while (ptr) { if (ptr->personal) - { - rfc2047_encode_string (buffer, sizeof (buffer), (const unsigned char *)ptr->personal); - mutt_str_replace (&ptr->personal, buffer); - } + rfc2047_encode_string (&ptr->personal); #ifdef EXACT_ADDRESS if (ptr->val) - { - rfc2047_encode_string (buffer, sizeof (buffer), (const unsigned char *)ptr->val); - mutt_str_replace (&ptr->val, buffer); - } + rfc2047_encode_string (&ptr->val); #endif ptr = ptr->next; } @@ -272,7 +463,7 @@ static int rfc2047_decode_word (char *d, const char *s, size_t len) { const char *pp = s, *pp1; char *pd, *d0; - char *t, *t1; + const char *t, *t1; int enc = 0, count = 0, c1, c2, c3, c4; char *charset = NULL; diff --git a/rfc2047.h b/rfc2047.h index 291d6566..ee0bc5ea 100644 --- a/rfc2047.h +++ b/rfc2047.h @@ -16,7 +16,7 @@ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. */ -void rfc2047_encode_string (char *, size_t, const unsigned char *); +void rfc2047_encode_string (char **); void rfc2047_encode_adrlist (ADDRESS *); void rfc2047_decode (char **); diff --git a/send.c b/send.c index 6f77afb1..d5f43a70 100644 --- a/send.c +++ b/send.c @@ -951,14 +951,12 @@ static int send_message (HEADER *msg) static void encode_descriptions (BODY *b, short recurse) { BODY *t; - char tmp[LONG_STRING]; for (t = b; t; t = t->next) { if (t->description) { - rfc2047_encode_string (tmp, sizeof (tmp), (unsigned char *) t->description); - mutt_str_replace (&t->description, tmp); + rfc2047_encode_string (&t->description); } if (recurse && t->parts) encode_descriptions (t->parts, recurse); diff --git a/sendlib.c b/sendlib.c index 4e93ab84..315ae3d6 100644 --- a/sendlib.c +++ b/sendlib.c @@ -920,7 +920,10 @@ char *mutt_get_send_charset (char *d, size_t dlen, BODY *b, short f) if (!p || (f && (!mutt_strcasecmp (p, "us-ascii") || !mutt_strcasecmp (p, "unknown-8bit")))) { if (SendCharset && *SendCharset) - p = SendCharset; + { + p = strrchr (SendCharset, ':'); + p = p ? p + 1 : SendCharset; + } else if (Charset) p = Charset; } @@ -1366,9 +1369,8 @@ int mutt_write_rfc822_header (FILE *fp, ENVELOPE *env, BODY *attach, static void encode_headers (LIST *h) { - char tmp[LONG_STRING]; + char *tmp; char *p; - size_t len; for (; h; h = h->next) { @@ -1376,10 +1378,13 @@ static void encode_headers (LIST *h) { *p++ = 0; SKIPWS (p); - snprintf (tmp, sizeof (tmp), "%s: ", h->data); - len = mutt_strlen (tmp); - rfc2047_encode_string (tmp + len, sizeof (tmp) - len, (unsigned char *) p); - mutt_str_replace (&h->data, tmp); + tmp = strdup (p); + rfc2047_encode_string (&tmp); + safe_realloc ((void **) &h->data, + strlen (h->data) + 2 + strlen (tmp) + 1); + strcat (h->data, ": "); + strcat (h->data, tmp); + free (tmp); } } } @@ -1813,9 +1818,7 @@ void mutt_prepare_envelope (ENVELOPE *env, int final) if (env->subject) { - rfc2047_encode_string (buffer, sizeof (buffer) - 1, - (unsigned char *) env->subject); - mutt_str_replace (&env->subject, buffer); + rfc2047_encode_string (&env->subject); } encode_headers (env->userhdrs); } -- 2.40.0