This is the patch TAKIZAWA Takashi and I came up with in the end.

author Edmund GRIMLEY EVANS <edmundo@rano.org>

Mon, 28 Aug 2000 09:32:58 +0000 (09:32 +0000)

committer Edmund GRIMLEY EVANS <edmundo@rano.org>

Mon, 28 Aug 2000 09:32:58 +0000 (09:32 +0000)
author Edmund GRIMLEY EVANS <edmundo@rano.org>
Mon, 28 Aug 2000 09:32:58 +0000 (09:32 +0000)
committer Edmund GRIMLEY EVANS <edmundo@rano.org>
Mon, 28 Aug 2000 09:32:58 +0000 (09:32 +0000)
diff --git a/mbyte.c b/mbyte.c

index 5f14606a697e106987510f83804d60e9427aca36..ceb227281415ef7248c3d028a0d68741a1fbcb3e 100644 (file)
--- a/mbyte.c
+++ b/mbyte.c
@@ -1,3 +1,24 @@
+/*
+ * Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org>
+ *
+ *     This program is free software; you can redistribute it and/or modify
+ *     it under the terms of the GNU General Public License as published by
+ *     the Free Software Foundation; either version 2 of the License, or
+ *     (at your option) any later version.
+ *
+ *     This program is distributed in the hope that it will be useful,
+ *     but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *     GNU General Public License for more details.
+ *
+ *     You should have received a copy of the GNU General Public License
+ *     along with this program; if not, write to the Free Software
+ *     Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+ */
+
+/*
+ * Japanese support by TAKIZAWA Takashi.
+ */
  
  #include "mutt.h"
  #include "mbyte.h"
@@ -12,46 +33,185 @@
  #endif
  
  int Charset_is_utf8 = 0;
+#ifndef HAVE_WC_FUNCS
+static int charset_is_ja = 0; /* set by mutt_set_charset() when the canonical name is "euc-jp" or "shift_jis" */
+static iconv_t charset_to_utf8 = (iconv_t)(-1); /* charset -> UTF-8 converter; (iconv_t)(-1) while not open */
+static iconv_t charset_from_utf8 = (iconv_t)(-1); /* UTF-8 -> charset converter; (iconv_t)(-1) while not open */
+#endif
  
  void mutt_set_charset (char *charset)
  {
-  Charset_is_utf8 = mutt_is_utf8 (charset);
+  char buffer[8];
+
+  mutt_canonical_charset (buffer, sizeof (buffer), charset);
+
+  Charset_is_utf8 = 0;
+#ifndef HAVE_WC_FUNCS
+  charset_is_ja = 0;
+  if (charset_to_utf8 != (iconv_t)(-1))
+  {
+    iconv_close (charset_to_utf8);
+    charset_to_utf8 = (iconv_t)(-1);
+  }
+  if (charset_from_utf8 != (iconv_t)(-1))
+  {
+    iconv_close (charset_from_utf8);
+    charset_from_utf8 = (iconv_t)(-1);
+  }
+#endif
+
+  if (!strcmp(buffer, "utf-8"))
+    Charset_is_utf8 = 1;
+#ifndef HAVE_WC_FUNCS
+  else if (!strcmp(buffer, "euc-jp") || !strcmp(buffer, "shift_jis"))
+  {
+    charset_is_ja = 1;
+    charset_to_utf8 = iconv_open ("UTF-8", charset);
+    charset_from_utf8 = iconv_open (charset, "UTF-8");
+  }
+#endif
  }
  
  #ifndef HAVE_WC_FUNCS
  
-size_t wcrtomb (char *s, wchar_t wc, mbstate_t *ps)
-{
-  static mbstate_t mbstate;
+/*
+ * For systems that don't have them, we provide here our own
+ * implementations of wcrtomb(), mbrtowc(), iswprint() and wcwidth().
+ * Instead of using the locale, as these functions normally would,
+ * we use Mutt's Charset variable. We support 3 types of charset:
+ * (1) For 8-bit charsets, wchar_t uses the same encoding as char.
+ * (2) For UTF-8, wchar_t uses UCS.
+ * (3) For stateless Japanese encodings, we use UCS and convert
+ *     via UTF-8 using iconv.
+ * Unfortunately, we can't handle non-stateless encodings.
+ */
  
-  if (!ps)
-    ps = &mbstate;
+static size_t wcrtomb_iconv (char *s, wchar_t wc, iconv_t cd) /* encode wc as UTF-8, then convert it with cd */
+{
+  char buf[MB_LEN_MAX]; /* UTF-8 staging buffer; also the dummy output when s is NULL */
+  const char *ib;
+  char *ob;
+  size_t ibl, obl, r; /* r: iconv() result, (size_t)(-1) on failure */
  
-  if (!s)
+  if (s)
    {
-    memset (ps, 0, sizeof (*ps));
-    return 1;
+    ibl = mutt_wctoutf8 (buf, wc);
+    if (ibl == (size_t)(-1))
+      return (size_t)(-1);
+    ib = buf;
+    ob = s;
+    obl = MB_LEN_MAX; /* the caller's buffer is assumed to have MB_LEN_MAX bytes of room */
+    r = iconv (cd, &ib, &ibl, &ob, &obl);
    }
-  if (!wc)
+  else
    {
-    memset (ps, 0, sizeof (*ps));
-    *s = 0;
-    return 1;
+    ib = ""; /* NULL s: convert a single NUL byte into the scratch buffer instead */
+    ibl = 1;
+    ob = buf;
+    obl = sizeof (buf);
+    r = iconv (cd, &ib, &ibl, &ob, &obl);
    }
+  return (r == (size_t)(-1)) ? r : (size_t)(ob - (s ? s : buf)); /* bytes written, measured from the buffer actually used */
+}
+
+size_t wcrtomb (char *s, wchar_t wc, mbstate_t *ps)
+{
+  /* We only handle stateless encodings, so we can ignore ps. */
+  /* NOTE(review): the UTF-8 branch hands s to mutt_wctoutf8() even when it is NULL -- confirm the helper tolerates that. */
    if (Charset_is_utf8)
      return mutt_wctoutf8 (s, wc);
-  else if (wc < 0x100)
-  {
-    *s = wc;
-    return 1;
-  }
+  else if (charset_from_utf8 != (iconv_t)(-1)) /* a UTF-8 -> charset converter is open */
+    return wcrtomb_iconv (s, wc, charset_from_utf8);
    else
    {
+    if (!s) /* NULL s: report one byte without writing anything */
+      return 1;
+    if (wc < 0x100) /* 8-bit charset: the wchar_t value is stored as the byte itself */
+    {
+      *s = wc;
+      return 1;
+    }
      errno = EILSEQ;
      return (size_t)(-1);
    }
  }
  
+size_t mbrtowc_iconv (wchar_t *pwc, const char *s, size_t n, /* returns bytes of s consumed, 0 for NUL, */
+                     mbstate_t *ps, iconv_t cd)              /* (size_t)(-2) if incomplete, (size_t)(-1) if invalid */
+{
+  static mbstate_t mbstate; /* scratch state handed to utf8rtowc() */
+  const char *ib, *ibmax;
+  char *ob, *t;
+  size_t ibl, obl, k, r;
+  char bufi[8], bufo[6];
+  wchar_t wc = 0; /* decoded here first so that pwc may be NULL */
+  if (!n)
+    return (size_t)(-2);
+  /* *ps (must not be NULL) holds the bytes saved from an incomplete character, ended by the first zero byte */
+  t = memchr (ps, 0, sizeof (*ps));
+  k = t ? (t - (char *)ps) : sizeof (*ps);
+  if (k > sizeof (bufi))
+    k = 0;
+  if (k)
+  {
+    /* use the buffer for input */
+    memcpy (bufi, ps, k);
+    ib = bufi;
+    ibmax = bufi + (k + n < sizeof (bufi) ? k + n : sizeof (bufi));
+    memcpy (bufi + k, s, ibmax - bufi - k);
+  }
+  else
+  {
+    /* use the real input */
+    ib = s;
+    ibmax = s + n;
+  }
+  /* offer iconv one input byte at first; ibl grows until a character comes out */
+  ob = bufo;
+  obl = sizeof (bufo);
+  ibl = 1;
+
+  for (;;)
+  {
+    r = iconv (cd, &ib, &ibl, &ob, &obl);
+    if (ob > bufo && (!k || ib > bufi + k))
+    {
+      memset (ps, 0, sizeof (*ps)); /* we have a character: drop any saved bytes */
+      utf8rtowc (&wc, bufo, ob - bufo, &mbstate);
+      if (pwc) *pwc = wc; /* pwc may be NULL, as with the standard mbrtowc() */
+      return wc ? (ib - (k ? bufi + k : s)) : 0;
+    }
+    else if (!r || (r == (size_t)(-1) && errno == EINVAL))
+    {
+      if (ib + ibl < ibmax)
+       /* try using more input */
+       ++ibl;
+      else if (k && ib > bufi + k && bufi + k + n > ibmax)
+      {
+       /* switch to using real input */
+       ib = s + (ib - bufi - k);
+       ibmax = s + n;
+       k = 0;
+       ++ibl;
+      }
+      else
+      {
+       /* save the state and give up */
+       memset (ps, 0, sizeof (*ps));
+       if (ibl <= sizeof (mbstate_t)) /* need extra condition here! */
+         memcpy (ps, ib, ibl);
+       return (size_t)(-2);
+      }
+    }
+    else
+    {
+      /* bad input */
+      errno = EILSEQ;
+      return (size_t)(-1);
+    }
+  }
+}
+
  size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
  {
    static mbstate_t mbstate;
@@ -61,6 +221,8 @@ size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
  
    if (Charset_is_utf8)
      return utf8rtowc (pwc, s, n, ps);
+  else if (charset_to_utf8 != (iconv_t)(-1))
+    return mbrtowc_iconv (pwc, s, n, ps, charset_to_utf8);
    else
    {
      if (!s)
@@ -78,15 +240,54 @@ size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
  
  int iswprint (wint_t wc)
  {
-  if (Charset_is_utf8)
+  if (Charset_is_utf8 || charset_is_ja) /* in both cases wchar_t holds UCS, so the UCS range test applies */
      return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc);
    else
      return (0 <= wc && wc < 256) ? isprint (wc) : 0;
  }
  
-#endif /* !HAVE_WC_FUNCS */
+/*
+ * l10n for Japanese:
+ *   Symbols, Greek and Cyrillic in JIS X 0208, Japanese Kanji
+ *   Character Set, have a column width of 2.
+ */
+int wcwidth_ja (wchar_t ucs) /* returns 2 for a Japanese-specific wide character, -1 for "no override" */
+{
+  if (ucs >= 0x2e80) /* hexadecimal bound; without the 0x prefix this is the double 2e80 and never true */
+    return -1; /* continue with the normal check */
+  /* a rough range for quick check */
+  if ((ucs >= 0x00a1 && ucs <= 0x00fe) || /* Latin-1 Supplement */
+      (ucs >= 0x0391 && ucs <= 0x0451) || /* Greek and Cyrillic */
+      (ucs >= 0x2010 && ucs <= 0x266f))   /* Symbols */
+    return 2;
+  else
+    return -1;
+}
  
-#ifndef HAVE_MBYTE
+int wcwidth_ucs(wchar_t ucs);
+
+int wcwidth (wchar_t wc)
+{
+  int ja_width;
+
+  /* UTF-8: wc is already a UCS code point. */
+  if (Charset_is_utf8)
+    return wcwidth_ucs (wc);
+
+  /* 8-bit charset: printable byte values take one column. */
+  if (!charset_is_ja)
+  {
+    if (wc < 0 || wc >= 256)
+      return -1;
+    return isprint (wc) ? 1 : -1;
+  }
+
+  /* Japanese: wcwidth_ja() overrides first; -1 from it means "use the UCS table". */
+  ja_width = wcwidth_ja (wc);
+  if (ja_width != -1)
+    return ja_width;
+  return wcwidth_ucs (wc);
+}
  
  size_t utf8rtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *_ps)
  {
@@ -175,7 +376,7 @@ size_t utf8rtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *_ps)
    return (size_t)-2;
  }
  
-#endif /* !HAVE_MBYTE */
+#endif /* !HAVE_WC_FUNCS */
  
  wchar_t replacement_char ()
  {
diff --git a/protos.h b/protos.h

index b32e2859f9293a4ee6c22a76646ca40ed5b93c09..9bb9122ab013cc40e3ed1501e57406a121c7630e 100644 (file)
--- a/protos.h
+++ b/protos.h
@@ -322,7 +322,6 @@ int mutt_from_base64 (char*, const char*);
  
  /* utf8.c */
  int mutt_wctoutf8 (char *s, unsigned int c);
-int mutt_utf8towc (unsigned int *pwc, const char *s, size_t n);
  
  #ifdef LOCALES_HACK
  #define IsPrint(c) (isprint((unsigned char)(c)) || \
diff --git a/utf8.c b/utf8.c

index 6a17c39f1c94356b12ea496cf7962b3d44021430..b65b2a9c706cf5840b8f352eb9e7d4a94a79d42f 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1,3 +1,10 @@
+#ifndef HAVE_WC_FUNCS
+
+#include <errno.h>
+
+#ifndef EILSEQ
+#define EILSEQ EINVAL
+#endif
  
  int mutt_wctoutf8 (char *s, unsigned int c)
  {
@@ -62,5 +69,8 @@ int mutt_wctoutf8 (char *s, unsigned int c)
      }
      return 6;
    }
-  return 0;
+  errno = EILSEQ;
+  return -1;
  }
+
+#endif /* !HAVE_WC_FUNCS */
diff --git a/wcwidth.c b/wcwidth.c

index dbaa9611c2f54a45dec7b5aec4b507576907876d..e0e1cb745b65e2659120abaf9062c8b5a7398f70 100644 (file)
--- a/wcwidth.c
+++ b/wcwidth.c
@@ -7,16 +7,15 @@
   */
  
  /* Adapted for Mutt by Edmund Grimley Evans.
- * wcwidth() now refers to Charset_is_utf8.
   */
  
+#ifndef HAVE_WC_FUNCS
+
  #include "mutt.h"
  #include "mbyte.h"
  
  #include <ctype.h>
  
-#ifndef HAVE_WC_FUNCS
-
  /* These functions define the column width of an ISO 10646 character
   * as follows:
   *
@@ -41,7 +40,7 @@
   * in ISO 10646.
   */
  
-int wcwidth(wchar_t ucs)
+int wcwidth_ucs(wchar_t ucs)
  {
    /* sorted list of non-overlapping intervals of non-spacing characters */
    static const struct interval {
@@ -87,14 +86,6 @@ int wcwidth(wchar_t ucs)
    if (ucs == 0)
      return 0;
  
-  /* non-UCS case */
-  if (!Charset_is_utf8) {
-    if (0 <= ucs && ucs < 256)
-      return IsPrint(ucs) ? 1 : -1;
-    else
-      return -1;
-  }
-
    /* test for 8-bit control characters */
    if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0))
      return -1;
@@ -131,7 +122,7 @@ int wcwidth(wchar_t ucs)
       (ucs >= 0xffe0 && ucs <= 0xffe6));
  }
  
-#endif /* HAVE_WCWIDTH */
+#endif /* !HAVE_WC_FUNCS */
  
  #if 0 /* original */
  int wcswidth(const wchar_t *pwcs, size_t n)
author	Edmund GRIMLEY EVANS <edmundo@rano.org>
	Mon, 28 Aug 2000 09:32:58 +0000 (09:32 +0000)
committer	Edmund GRIMLEY EVANS <edmundo@rano.org>
	Mon, 28 Aug 2000 09:32:58 +0000 (09:32 +0000)
mbyte.c		patch \| blob \| history
protos.h		patch \| blob \| history
utf8.c		patch \| blob \| history
wcwidth.c		patch \| blob \| history