From: Edmund GRIMLEY EVANS Date: Mon, 28 Aug 2000 09:32:58 +0000 (+0000) Subject: This is the patch TAKIZAWA Takashi and I came up with in the end. X-Git-Tag: mutt-1-3-8-rel~13 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=a5948f717e4f5ca6aed2cc0376638d05f51b442b;p=mutt This is the patch TAKIZAWA Takashi and I came up with in the end. When the Charset is euc-jp or shift_jis, iconv is used for mbrtowc and wcrtomb. The worst part is mbrtowc_iconv(), where I attempted to make mbrtowc both restartable (it can process part of a multibyte character) and fast in the case where there is nothing left over from a previous character. Also I try to make no assumptions about how those character sets work, which is easy, because I know very little about them ... People who don't use one of those two stateless Japanese display charsets shouldn't be affected. People whose systems provide the wchar_t functions should be even less affected, because they don't even get this code in their binary. --- diff --git a/mbyte.c b/mbyte.c index 5f14606a..ceb22728 100644 --- a/mbyte.c +++ b/mbyte.c @@ -1,3 +1,24 @@ +/* + * Copyright (C) 2000 Edmund Grimley Evans + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA. + */ + +/* + * Japanese support by TAKIZAWA Takashi. 
+ */ #include "mutt.h" #include "mbyte.h" @@ -12,46 +33,185 @@ #endif int Charset_is_utf8 = 0; +#ifndef HAVE_WC_FUNCS +static int charset_is_ja = 0; +static iconv_t charset_to_utf8 = (iconv_t)(-1); +static iconv_t charset_from_utf8 = (iconv_t)(-1); +#endif void mutt_set_charset (char *charset) { - Charset_is_utf8 = mutt_is_utf8 (charset); + /* Must hold the longest name compared below: "shift_jis" needs 10 bytes + * including the terminating NUL, so an 8-byte buffer could never match it. */ + char buffer[32]; + + mutt_canonical_charset (buffer, sizeof (buffer), charset); + + Charset_is_utf8 = 0; +#ifndef HAVE_WC_FUNCS + charset_is_ja = 0; + if (charset_to_utf8 != (iconv_t)(-1)) + { + iconv_close (charset_to_utf8); + charset_to_utf8 = (iconv_t)(-1); + } + if (charset_from_utf8 != (iconv_t)(-1)) + { + iconv_close (charset_from_utf8); + charset_from_utf8 = (iconv_t)(-1); + } +#endif + + if (!strcmp(buffer, "utf-8")) + Charset_is_utf8 = 1; +#ifndef HAVE_WC_FUNCS + else if (!strcmp(buffer, "euc-jp") || !strcmp(buffer, "shift_jis")) + { + charset_is_ja = 1; + charset_to_utf8 = iconv_open ("UTF-8", charset); + charset_from_utf8 = iconv_open (charset, "UTF-8"); + } +#endif } #ifndef HAVE_WC_FUNCS -size_t wcrtomb (char *s, wchar_t wc, mbstate_t *ps) -{ - static mbstate_t mbstate; +/* + * For systems that don't have them, we provide here our own + * implementations of wcrtomb(), mbrtowc(), iswprint() and wcwidth(). + * Instead of using the locale, as these functions normally would, + * we use Mutt's Charset variable. We support 3 types of charset: + * (1) For 8-bit charsets, wchar_t uses the same encoding as char. + * (2) For UTF-8, wchar_t uses UCS. + * (3) For stateless Japanese encodings, we use UCS and convert + * via UTF-8 using iconv. + * Unfortunately, we can't handle non-stateless encodings. 
+ */ - if (!ps) - ps = &mbstate; +static size_t wcrtomb_iconv (char *s, wchar_t wc, iconv_t cd) +{ + char buf[MB_LEN_MAX]; + const char *ib; + char *ob; + size_t ibl, obl, r; - if (!s) + if (s) { - memset (ps, 0, sizeof (*ps)); - return 1; + ibl = mutt_wctoutf8 (buf, wc); + if (ibl == (size_t)(-1)) + return (size_t)(-1); + ib = buf; + ob = s; + obl = MB_LEN_MAX; + r = iconv (cd, &ib, &ibl, &ob, &obl); } - if (!wc) + else { - memset (ps, 0, sizeof (*ps)); - *s = 0; - return 1; + ib = ""; + ibl = 1; + ob = buf; + obl = sizeof (buf); + r = iconv (cd, &ib, &ibl, &ob, &obl); } + /* conversion failed; iconv has already set errno */ + if (r == (size_t)(-1)) + return (size_t)(-1); + /* with s == NULL the output went into buf, so measure from there */ + return ob - (s ? s : buf); +} + +size_t wcrtomb (char *s, wchar_t wc, mbstate_t *ps) +{ + /* We only handle stateless encodings, so we can ignore ps. */ + if (Charset_is_utf8) return mutt_wctoutf8 (s, wc); - else if (wc < 0x100) - { - *s = wc; - return 1; - } + else if (charset_from_utf8 != (iconv_t)(-1)) + return wcrtomb_iconv (s, wc, charset_from_utf8); else { + if (!s) + return 1; + if (wc < 0x100) + { + *s = wc; + return 1; + } errno = EILSEQ; return (size_t)(-1); } } +size_t mbrtowc_iconv (wchar_t *pwc, const char *s, size_t n, + mbstate_t *ps, iconv_t cd) +{ + static mbstate_t mbstate; + const char *ib, *ibmax; + char *ob, *t; + size_t ibl, obl, k, r; + char bufi[8], bufo[6]; + + if (!n) + return (size_t)(-2); + + t = memchr (ps, 0, sizeof (*ps)); + k = t ? (t - (char *)ps) : sizeof (*ps); + if (k > sizeof (bufi)) + k = 0; + if (k) + { + /* use the buffer for input */ + memcpy (bufi, ps, k); + ib = bufi; + ibmax = bufi + (k + n < sizeof (bufi) ? k + n : sizeof (bufi)); + memcpy (bufi + k, s, ibmax - bufi - k); + } + else + { + /* use the real input */ + ib = s; + ibmax = s + n; + } + + ob = bufo; + obl = sizeof (bufo); + ibl = 1; + + for (;;) + { + r = iconv (cd, &ib, &ibl, &ob, &obl); + if (ob > bufo && (!k || ib > bufi + k)) + { + /* we have a character */ + memset (ps, 0, sizeof (*ps)); + utf8rtowc (pwc, bufo, ob - bufo, &mbstate); + return *pwc ? (ib - (k ? 
bufi + k : s)) : 0; + } + else if (!r || (r == (size_t)(-1) && errno == EINVAL)) + { + if (ib + ibl < ibmax) + /* try using more input */ + ++ibl; + else if (k && ib > bufi + k && bufi + k + n > ibmax) + { + /* switch to using real input */ + ib = s + (ib - bufi - k); + ibmax = s + n; + k = 0; + ++ibl; + } + else + { + /* save the state and give up */ + memset (ps, 0, sizeof (*ps)); + if (ibl <= sizeof (mbstate_t)) /* need extra condition here! */ + memcpy (ps, ib, ibl); + return (size_t)(-2); + } + } + else + { + /* bad input */ + errno = EILSEQ; + return (size_t)(-1); + } + } +} + size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) { static mbstate_t mbstate; @@ -61,6 +221,8 @@ size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) if (Charset_is_utf8) return utf8rtowc (pwc, s, n, ps); + else if (charset_to_utf8 != (iconv_t)(-1)) + return mbrtowc_iconv (pwc, s, n, ps, charset_to_utf8); else { if (!s) @@ -78,15 +240,54 @@ size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) int iswprint (wint_t wc) { - if (Charset_is_utf8) + if (Charset_is_utf8 || charset_is_ja) return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc); else return (0 <= wc && wc < 256) ? isprint (wc) : 0; } -#endif /* !HAVE_WC_FUNCS */ +/* + * l10n for Japanese: + * Symbols, Greek and Cyrillic in JIS X 0208, Japanese Kanji + * Character Set, have a column width of 2. + */ +int wcwidth_ja (wchar_t ucs) +{ + /* hexadecimal, like the ranges below; "2e80" without 0x is a floating constant */ + if (ucs >= 0x2e80) + return -1; /* continue with the normal check */ + /* a rough range for quick check */ + if ((ucs >= 0x00a1 && ucs <= 0x00fe) || /* Latin-1 Supplement */ + (ucs >= 0x0391 && ucs <= 0x0451) || /* Greek and Cyrillic */ + (ucs >= 0x2010 && ucs <= 0x266f)) /* Symbols */ + return 2; + else + return -1; +} -#ifndef HAVE_MBYTE +int wcwidth_ucs(wchar_t ucs); + +int wcwidth (wchar_t wc) +{ + if (!Charset_is_utf8) + { + if (!charset_is_ja) + { + /* 8-bit case */ + if (0 <= wc && wc < 256) + return isprint (wc) ? 
1 : -1; + else + return -1; + } + else + { + /* Japanese */ + int k = wcwidth_ja (wc); + if (k != -1) + return k; + } + } + return wcwidth_ucs (wc); +} size_t utf8rtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *_ps) { @@ -175,7 +376,7 @@ size_t utf8rtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *_ps) return (size_t)-2; } -#endif /* !HAVE_MBYTE */ +#endif /* !HAVE_WC_FUNCS */ wchar_t replacement_char () { diff --git a/protos.h b/protos.h index b32e2859..9bb9122a 100644 --- a/protos.h +++ b/protos.h @@ -322,7 +322,6 @@ int mutt_from_base64 (char*, const char*); /* utf8.c */ int mutt_wctoutf8 (char *s, unsigned int c); -int mutt_utf8towc (unsigned int *pwc, const char *s, size_t n); #ifdef LOCALES_HACK #define IsPrint(c) (isprint((unsigned char)(c)) || \ diff --git a/utf8.c b/utf8.c index 6a17c39f..b65b2a9c 100644 --- a/utf8.c +++ b/utf8.c @@ -1,3 +1,10 @@ +#ifndef HAVE_WC_FUNCS + +#include + +#ifndef EILSEQ +#define EILSEQ EINVAL +#endif int mutt_wctoutf8 (char *s, unsigned int c) { @@ -62,5 +69,8 @@ int mutt_wctoutf8 (char *s, unsigned int c) } return 6; } - return 0; + errno = EILSEQ; + return -1; } + +#endif /* !HAVE_WC_FUNCS */ diff --git a/wcwidth.c b/wcwidth.c index dbaa9611..e0e1cb74 100644 --- a/wcwidth.c +++ b/wcwidth.c @@ -7,16 +7,15 @@ */ /* Adapted for Mutt by Edmund Grimley Evans. - * wcwidth() now refers to Charset_is_utf8. */ +#ifndef HAVE_WC_FUNCS + #include "mutt.h" #include "mbyte.h" #include -#ifndef HAVE_WC_FUNCS - /* These functions define the column width of an ISO 10646 character * as follows: * @@ -41,7 +40,7 @@ * in ISO 10646. */ -int wcwidth(wchar_t ucs) +int wcwidth_ucs(wchar_t ucs) { /* sorted list of non-overlapping intervals of non-spacing characters */ static const struct interval { @@ -87,14 +86,6 @@ int wcwidth(wchar_t ucs) if (ucs == 0) return 0; - /* non-UCS case */ - if (!Charset_is_utf8) { - if (0 <= ucs && ucs < 256) - return IsPrint(ucs) ? 
1 : -1; - else - return -1; - } - /* test for 8-bit control characters */ if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0)) return -1; @@ -131,7 +122,7 @@ int wcwidth(wchar_t ucs) (ucs >= 0xffe0 && ucs <= 0xffe6)); } -#endif /* HAVE_WCWIDTH */ +#endif /* !HAVE_WC_FUNCS */ #if 0 /* original */ int wcswidth(const wchar_t *pwcs, size_t n)