From: Unknown <> Date: Fri, 12 Oct 2007 21:14:15 +0000 (+0000) Subject: add files for 2007-10-12T21:14:15Z X-Git-Tag: imap-2007a1~54 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=4b7de2454e27c930504363fbe374f6fd20f9adec;p=uw-imap add files for 2007-10-12T21:14:15Z --- diff --git a/src/c-client/utf8aux.c b/src/c-client/utf8aux.c new file mode 100644 index 0000000..5138987 --- /dev/null +++ b/src/c-client/utf8aux.c @@ -0,0 +1,449 @@ +/* ======================================================================== + * Copyright 1988-2007 University of Washington + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * + * ======================================================================== + */ + +/* + * Program: UTF-8 auxillary routines (c-client and MIME2 support) + * + * Author: Mark Crispin + * Networks and Distributed Computing + * Computing & Communications + * University of Washington + * Administration Building, AG-44 + * Seattle, WA 98195 + * Internet: MRC@CAC.Washington.EDU + * + * Date: 11 June 1997 + * Last Edited: 12 October 2007 + */ + + +#include +#include +#include "c-client.h" + +/* Convert charset labelled stringlist to UTF-8 in place + * Accepts: string list + * charset + */ + +static void utf8_stringlist (STRINGLIST *st,char *charset) +{ + SIZEDTEXT txt; + /* convert entire stringstruct */ + if (st) do if (utf8_text (&st->text,charset,&txt,U8T_CANONICAL)) { + fs_give ((void **) &st->text.data); + st->text.data = txt.data; /* transfer this text */ + st->text.size = txt.size; + } while (st = st->next); +} + + +/* Convert charset labelled searchpgm to UTF-8 in place + * Accepts: search program + * charset + */ + +void utf8_searchpgm (SEARCHPGM *pgm,char *charset) +{ + SIZEDTEXT txt; + SEARCHHEADER *hl; + SEARCHOR *ol; + SEARCHPGMLIST *pl; + if (pgm) { /* must have a search program */ + utf8_stringlist (pgm->bcc,charset); + utf8_stringlist (pgm->cc,charset); + utf8_stringlist (pgm->from,charset); + utf8_stringlist (pgm->to,charset); + utf8_stringlist (pgm->subject,charset); + for (hl = pgm->header; hl; hl = hl->next) { + if (utf8_text (&hl->line,charset,&txt,U8T_CANONICAL)) { + fs_give ((void **) &hl->line.data); + hl->line.data = txt.data; + hl->line.size = txt.size; + } + if (utf8_text (&hl->text,charset,&txt,U8T_CANONICAL)) { + fs_give ((void **) &hl->text.data); + hl->text.data = txt.data; + hl->text.size = txt.size; + } + } + utf8_stringlist (pgm->body,charset); + utf8_stringlist (pgm->text,charset); + for (ol = pgm->or; ol; ol = ol->next) { + utf8_searchpgm (ol->first,charset); + utf8_searchpgm (ol->second,charset); + } + for (pl = pgm->not; pl; pl = pl->next) utf8_searchpgm (pl->pgm,charset); + utf8_stringlist (pgm->return_path,charset); + utf8_stringlist (pgm->sender,charset); + utf8_stringlist (pgm->reply_to,charset); + utf8_stringlist (pgm->in_reply_to,charset); + utf8_stringlist (pgm->message_id,charset); + utf8_stringlist (pgm->newsgroups,charset); + utf8_stringlist (pgm->followup_to,charset); + utf8_stringlist (pgm->references,charset); + } +} + +/* Convert MIME-2 sized text to UTF-8 + * Accepts: source sized text + * charset + * flags (same as utf8_text()) + * Returns: T if successful, NIL if failure + */ + +#define MINENCWORD 9 +#define MAXENCWORD 75 + +/* This resizing algorithm is stupid, but hopefully it should never be triggered + * except for a pathological header. The main concern is that we don't get a + * buffer overflow. + */ + +#define DSIZE 65536 /* real headers should never be this big */ +#define FUZZ 10 /* paranoia fuzz */ + +long utf8_mime2text (SIZEDTEXT *src,SIZEDTEXT *dst,long flags) +{ + unsigned char *s,*se,*e,*ee,*t,*te; + char *cs,*ce,*ls; + SIZEDTEXT txt,rtxt; + unsigned long i; + size_t dsize = min (DSIZE,((src->size / 4) + 1) * 9); + /* always create buffer if canonicalizing */ + dst->data = (flags & U8T_CANONICAL) ? + (unsigned char *) fs_get ((size_t) dsize) : NIL; + dst->size = 0; /* nothing written yet */ + /* look for encoded words */ + for (s = src->data, se = src->data + src->size; s < se; s++) { + if (((se - s) > MINENCWORD) && (*s == '=') && (s[1] == '?') && + (cs = (char *) mime2_token (s+2,se,(unsigned char **) &ce)) && + (e = mime2_token ((unsigned char *) ce+1,se,&ee)) && + (te = mime2_text (t = e+2,se)) && (ee == e + 1) && + ((te - s) < MAXENCWORD)) { + if (mime2_decode (e,t,te,&txt)) { + *ce = '\0'; /* temporarily tie off charset */ + if (ls = strchr (cs,'*')) *ls = '\0'; + /* convert to UTF-8 as best we can */ + if (!utf8_text (&txt,cs,&rtxt,flags)) utf8_text (&txt,NIL,&rtxt,flags); + if (dst->data) { /* make sure existing buffer fits */ + while (dsize <= (dst->size + rtxt.size + FUZZ)) { + dsize += DSIZE; /* kick it up */ + fs_resize ((void **) &dst->data,dsize); + } + } + else { /* make a new buffer */ + while (dsize <= (dst->size + rtxt.size)) dsize += DSIZE; + memcpy (dst->data = (unsigned char *) fs_get (dsize),src->data, + dst->size = s - src->data); + } + for (i = 0; i < rtxt.size; i++) dst->data[dst->size++] = rtxt.data[i]; + + /* all done with converted text */ + if (rtxt.data != txt.data) fs_give ((void **) &rtxt.data); + if (ls) *ls = '*'; /* restore language tag delimiter */ + *ce = '?'; /* restore charset delimiter */ + /* all done with decoded text */ + fs_give ((void **) &txt.data); + s = te+1; /* continue scan after encoded word */ + /* skip leading whitespace */ + for (t = s + 1; (t < se) && ((*t == ' ') || (*t == '\t')); t++); + /* see if likely continuation encoded word */ + if (t < (se - MINENCWORD)) switch (*t) { + case '=': /* possible encoded word? */ + if (t[1] == '?') s = t - 1; + break; + case '\015': /* CR, eat a following LF */ + if (t[1] == '\012') t++; + case '\012': /* possible end of logical line */ + if ((t[1] == ' ') || (t[1] == '\t')) { + do t++; + while ((t < (se - MINENCWORD)) && ((t[1] == ' ')||(t[1] == '\t'))); + if ((t < (se - MINENCWORD)) && (t[1] == '=') && (t[2] == '?')) + s = t; /* definitely looks like continuation */ + } + } + } + else { /* restore original text */ + if (dst->data) fs_give ((void **) &dst->data); + dst->data = src->data; + dst->size = src->size; + return NIL; /* syntax error: MIME-2 decoding failure */ + } + } + else do if (dst->data) { /* stash ASCII characters until LWSP */ + if (dsize < (dst->size + FUZZ)) { + dsize += DSIZE; /* kick it up */ + fs_resize ((void **) &dst->data,dsize); + } + /* kludge: assumes ASCII doesn't decompose and titlecases to one byte */ + dst->data[dst->size++] = (flags & U8T_CASECANON) ? + (unsigned char) ucs4_titlecase (*s) : *s; + } + while ((*s != ' ') && (*s != '\t') && (*s != '\015') && (*s != '\012') && + (++s < se)); + } + if (dst->data) dst->data[dst->size] = '\0'; + else { /* nothing converted, return identity */ + dst->data = src->data; + dst->size = src->size; + } + return T; /* success */ +} + +/* Decode MIME-2 text + * Accepts: Encoding + * text + * text end + * destination sized text + * Returns: T if successful, else NIL + */ + +long mime2_decode (unsigned char *e,unsigned char *t,unsigned char *te, + SIZEDTEXT *txt) +{ + unsigned char *q; + txt->data = NIL; /* initially no returned data */ + switch (*e) { /* dispatch based upon encoding */ + case 'Q': case 'q': /* sort-of QUOTED-PRINTABLE */ + txt->data = (unsigned char *) fs_get ((size_t) (te - t) + 1); + for (q = t,txt->size = 0; q < te; q++) switch (*q) { + case '=': /* quoted character */ + /* both must be hex */ + if (!isxdigit (q[1]) || !isxdigit (q[2])) { + fs_give ((void **) &txt->data); + return NIL; /* syntax error: bad quoted character */ + } + /* assemble character */ + txt->data[txt->size++] = hex2byte (q[1],q[2]); + q += 2; /* advance past quoted character */ + break; + case '_': /* convert to space */ + txt->data[txt->size++] = ' '; + break; + default: /* ordinary character */ + txt->data[txt->size++] = *q; + break; + } + txt->data[txt->size] = '\0'; + break; + case 'B': case 'b': /* BASE64 */ + if (txt->data = (unsigned char *) rfc822_base64 (t,te - t,&txt->size)) + break; + default: /* any other encoding is unknown */ + return NIL; /* syntax error: unknown encoding */ + } + return T; +} + +/* Get MIME-2 token from encoded word + * Accepts: current text pointer + * text limit pointer + * pointer to returned end pointer + * Returns: current text pointer & end pointer if success, else NIL + */ + +unsigned char *mime2_token (unsigned char *s,unsigned char *se, + unsigned char **t) +{ + for (*t = s; **t != '?'; ++*t) { + if ((*t < se) && isgraph (**t)) switch (**t) { + case '(': case ')': case '<': case '>': case '@': case ',': case ';': + case ':': case '\\': case '"': case '/': case '[': case ']': case '.': + case '=': + return NIL; /* none of these are valid in tokens */ + } + else return NIL; /* out of text or CTL or space */ + } + return s; +} + + +/* Get MIME-2 text from encoded word + * Accepts: current text pointer + * text limit pointer + * pointer to returned end pointer + * Returns: end pointer if success, else NIL + */ + +unsigned char *mime2_text (unsigned char *s,unsigned char *se) +{ + unsigned char *t = se - 1; + /* search for closing ?, make sure valid */ + while ((s < t) && (*s != '?') && isgraph (*s++)); + return ((s < t) && (*s == '?') && (s[1] == '=') && + ((se == (s + 2)) || (s[2] == ' ') || (s[2] == '\t') || + (s[2] == '\015') || (s[2] == '\012'))) ? s : NIL; +} + +/* Convert UTF-16 string to Modified Base64 + * Accepts: destination pointer + * source string + * source length in octets + * Returns: updated destination pointer + */ + +static unsigned char *utf16_to_mbase64 (unsigned char *t,unsigned char *s, + size_t i) +{ + char *v = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; + *t++ = '&'; /* write shift-in */ + while (i >= 3) { /* process tuplets */ + *t++ = v[s[0] >> 2]; /* byte 1: high 6 bits (1) */ + /* byte 2: low 2 bits (1), high 4 bits (2) */ + *t++ = v[((s[0] << 4) + (s[1] >> 4)) & 0x3f]; + /* byte 3: low 4 bits (2), high 2 bits (3) */ + *t++ = v[((s[1] << 2) + (s[2] >> 6)) & 0x3f]; + *t++ = v[s[2] & 0x3f]; /* byte 4: low 6 bits (3) */ + s += 3; + i -= 3; + } + if (i) { + *t++ = v[s[0] >> 2]; /* byte 1: high 6 bits (1) */ + /* byte 2: low 2 bits (1), high 4 bits (2) */ + *t++ = v[((s[0] << 4) + (--i ? (s[1] >> 4) : 0)) & 0x3f]; + /* byte 3: low 4 bits (2) */ + if (i) *t++ = v[(s[1] << 2) & 0x3f]; + } + *t++ = '-'; /* write shift-out */ + return t; +} + + +/* Poot a UTF-16 value to a buffer + * Accepts: buffer pointer + * value + * Returns: updated pointer + */ + +static unsigned char *utf16_poot (unsigned char *s,unsigned long c) +{ + *s++ = (unsigned char) (c >> 8); + *s++ = (unsigned char) (c & 0xff); + return s; +} + +/* Convert UTF-8 to Modified UTF-7 + * Accepts: UTF-8 string + * Returns: Modified UTF-7 string on success, NIL if invalid UTF-8 + */ + +#define MAXUNIUTF8 4 /* maximum length of Unicode UTF-8 sequence */ + +unsigned char *utf8_to_mutf7 (unsigned char *src) +{ + unsigned char *u16buf,*utf16; + unsigned char *ret,*t; + unsigned long j,c; + unsigned char *s = src; + unsigned long i = 0; + int nonascii = 0; + while (*s) { /* pass one: count destination octets */ + if (*s & 0x80) { /* non-ASCII character? */ + j = MAXUNIUTF8; /* get single UCS-4 codepoint */ + if ((c = utf8_get (&s,&j)) & U8G_ERROR) return NIL; + /* tally number of UTF-16 octets */ + nonascii += (c & U8GM_NONBMP) ? 4 : 2; + } + else { /* ASCII character */ + if (nonascii) { /* add pending Modified BASE64 size + shifts */ + i += ((nonascii / 3) * 4) + ((j = nonascii % 3) ? j + 1 : 0) + 2; + nonascii = 0; /* back to ASCII */ + } + if (*s == '&') i += 2; /* two octets if the escape */ + else ++i; /* otherwise just count another octet */ + ++s; /* advance to next source octet */ + } + } + if (nonascii) /* add pending Modified BASE64 size + shifts */ + i += ((nonascii / 3) * 4) + ((j = nonascii % 3) ? j + 1 : 0) + 2; + + /* create return buffer */ + t = ret = (unsigned char *) fs_get (i + 1); + /* and scratch buffer */ + utf16 = u16buf = (unsigned char *) fs_get (i + 1); + for (s = src; *s;) { /* pass two: copy destination octets */ + if (*s & 0x80) { /* non-ASCII character? */ + j = MAXUNIUTF8; /* get single UCS-4 codepoint */ + if ((c = utf8_get (&s,&j)) & U8G_ERROR) return NIL; + if (c & U8GM_NONBMP) { /* non-BMP? */ + c -= UTF16_BASE; /* yes, convert to surrogate */ + utf16 = utf16_poot (utf16_poot (utf16,(c >> UTF16_SHIFT)+UTF16_SURRH), + (c & UTF16_MASK) + UTF16_SURRL); + } + else utf16 = utf16_poot (utf16,c); + } + else { /* ASCII character */ + if (utf16 != u16buf) { /* add pending Modified BASE64 size + shifts */ + t = utf16_to_mbase64 (t,u16buf,utf16 - u16buf); + utf16 = u16buf; /* reset buffer */ + } + *t++ = *s; /* copy the character */ + if (*s == '&') *t++ = '-';/* special sequence if the escape */ + ++s; /* advance to next source octet */ + } + } + /* add pending Modified BASE64 size + shifts */ + if (utf16 != u16buf) t = utf16_to_mbase64 (t,u16buf,utf16 - u16buf); + *t = '\0'; /* tie off destination */ + if (i != (t - ret)) fatal ("utf8_to_mutf7 botch"); + fs_give ((void **) &u16buf); + return ret; +} + +/* Convert Modified UTF-7 to UTF-8 + * Accepts: Modified UTF-7 string + * Returns: UTF-8 string on success, NIL if invalid Modified UTF-7 + */ + +unsigned char *utf8_from_mutf7 (unsigned char *src) +{ + SIZEDTEXT utf8,utf7; + unsigned char *s; + int mbase64 = 0; + /* disallow bogus strings */ + if (mail_utf7_valid (src)) return NIL; + /* initialize SIZEDTEXTs */ + memset (&utf7,0,sizeof (SIZEDTEXT)); + memset (&utf8,0,sizeof (SIZEDTEXT)); + /* make copy of source */ + for (s = cpytxt (&utf7,src,strlen (src)); *s; ++s) switch (*s) { + case '&': /* Modified UTF-7 uses & instead of + */ + *s = '+'; + mbase64 = T; /* note that we are in Modified BASE64 */ + break; + case '+': /* temporarily swap text + to & */ + if (!mbase64) *s = '&'; + break; + case '-': /* shift back to ASCII */ + mbase64 = NIL; + break; + case ',': /* Modified UTF-7 uses , instead of / ... */ + if (mbase64) *s = '/'; /* ...in Modified BASE64 */ + break; + } + /* do the conversion */ + utf8_text_utf7 (&utf7,&utf8,NIL,NIL); + /* no longer need copy of source */ + fs_give ((void **) &utf7.data); + /* post-process: switch & and + */ + for (s = utf8.data; *s; ++s) switch (*s) { + case '&': + *s = '+'; + break; + case '+': + *s = '&'; + break; + } + return utf8.data; +}