From aad4940d4b7ddfbc07f50d2a8ca311d10db8ade7 Mon Sep 17 00:00:00 2001 From: Pietro Cerutti Date: Fri, 28 Jul 2017 13:43:52 +0000 Subject: [PATCH] Kill bundled wchar_t support, rely on the system's Issue #690 --- Makefile.am | 2 +- configure.ac | 30 ++-- mbyte.c | 475 ------------------------------------------------- mbyte.h | 34 ---- mutt.h | 7 - po/POTFILES.in | 2 - utf8.c | 94 ---------- version.c | 5 - wcwidth.c | 162 ----------------- 9 files changed, 13 insertions(+), 798 deletions(-) delete mode 100644 utf8.c delete mode 100644 wcwidth.c diff --git a/Makefile.am b/Makefile.am index d6cf7f7dc..081ceded8 100644 --- a/Makefile.am +++ b/Makefile.am @@ -80,7 +80,7 @@ AM_CPPFLAGS=-I. -I$(top_srcdir) $(GPGME_CFLAGS) EXTRA_mutt_SOURCES = browser.h mbyte.h mutt_idna.c mutt_idna.h \ mutt_lua.c mutt_sasl.c mutt_notmuch.c mutt_ssl.c mutt_ssl_gnutls.c \ - remailer.c remailer.h resize.c url.h utf8.c wcwidth.c + remailer.c remailer.h resize.c url.h EXTRA_DIST = account.h attach.h bcache.h browser.h buffy.h \ ChangeLog.md charset.h CODE_OF_CONDUCT.md compress.h copy.h \ diff --git a/configure.ac b/configure.ac index d302cfcc8..7b879c274 100644 --- a/configure.ac +++ b/configure.ac @@ -882,28 +882,22 @@ if test "x$with_idn" != "xno"; then fi dnl -- locales -- - -wc_funcs=maybe -AC_ARG_WITH(wc-funcs, AS_HELP_STRING([--without-wc-funcs],[Do not use the system's wchar_t functions]), - wc_funcs=$withval) - -if test "$wc_funcs" != yes && test "$wc_funcs" != no; then - AC_CACHE_CHECK([for wchar_t functions], mutt_cv_wc_funcs, - mutt_cv_wc_funcs=no - AC_LINK_IFELSE([AC_LANG_PROGRAM([[ +AC_CACHE_CHECK([for wchar_t functions], mutt_cv_wc_funcs, + mutt_cv_wc_funcs=no + AC_LINK_IFELSE([AC_LANG_PROGRAM([[ #define _XOPEN_SOURCE 600 #include #include #include -#include ]], [[mbrtowc(0, 0, 0, 0); wctomb(0, 0); wcwidth(0); - iswprint(0); iswspace(0); towlower(0); towupper(0); iswalnum(0)]])],[mutt_cv_wc_funcs=yes],[])) - wc_funcs=$mutt_cv_wc_funcs -fi - -if test $wc_funcs = yes; then - AC_DEFINE(HAVE_WC_FUNCS,1,[ Define if you are using the system's wchar_t functions. ]) -else - MUTT_LIB_OBJECTS="$MUTT_LIB_OBJECTS utf8.o wcwidth.o" +#include ]], +[[ +mbrtowc(0, 0, 0, 0);wctomb(0, 0); wcwidth(0); +iswprint(0); iswspace(0); towlower(0); +towupper(0); iswalnum(0)]] +)],[mutt_cv_wc_funcs=yes],[])) + +if test $mutt_cv_wc_funcs != yes; then + AC_MSG_ERROR([wchar_t functions not found]) fi # Only enable fmemopen if both fmemopen() and open_memstream() diff --git a/mbyte.c b/mbyte.c index 0a3e1d621..5b427ff14 100644 --- a/mbyte.c +++ b/mbyte.c @@ -41,11 +41,6 @@ #endif int Charset_is_utf8 = 0; -#ifndef HAVE_WC_FUNCS -static int charset_is_ja = 0; -static iconv_t charset_to_utf8 = (iconv_t)(-1); -static iconv_t charset_from_utf8 = (iconv_t)(-1); -#endif void mutt_set_charset(char *charset) { @@ -54,485 +49,15 @@ void mutt_set_charset(char *charset) mutt_canonical_charset(buffer, sizeof(buffer), charset); Charset_is_utf8 = 0; -#ifndef HAVE_WC_FUNCS - charset_is_ja = 0; - if (charset_to_utf8 != (iconv_t)(-1)) - { - iconv_close(charset_to_utf8); - charset_to_utf8 = (iconv_t)(-1); - } - if (charset_from_utf8 != (iconv_t)(-1)) - { - iconv_close(charset_from_utf8); - charset_from_utf8 = (iconv_t)(-1); - } -#endif if (mutt_is_utf8(buffer)) Charset_is_utf8 = 1; -#ifndef HAVE_WC_FUNCS - else if ((ascii_strcasecmp(buffer, "euc-jp") == 0) || - (ascii_strcasecmp(buffer, "shift_jis") == 0) || - (ascii_strcasecmp(buffer, "cp932") == 0) || - (ascii_strcasecmp(buffer, "eucJP-ms") == 0)) - { - charset_is_ja = 1; - - /* Note flags=0 to skip charset-hooks: User masters the $charset - * name, and we are sure of our "utf-8" constant. So there is no - * possibility of wrong name that we would want to try to correct - * with a charset-hook. Or rather: If $charset was wrong, we would - * want to try to correct... $charset directly. - */ - charset_to_utf8 = mutt_iconv_open("utf-8", charset, 0); - charset_from_utf8 = mutt_iconv_open(charset, "utf-8", 0); - } -#endif #if defined(HAVE_BIND_TEXTDOMAIN_CODESET) && defined(ENABLE_NLS) bind_textdomain_codeset(PACKAGE, buffer); #endif } -#ifndef HAVE_WC_FUNCS - -static size_t utf8rtowc(wchar_t *pwc, const char *s, size_t n, mbstate_t *_ps) -{ - static wchar_t mbstate; - wchar_t *ps = (wchar_t *) _ps; - size_t k = 1; - unsigned char c; - wchar_t wc; - int count; - - if (!ps) - ps = &mbstate; - - if (!s) - { - *ps = 0; - return 0; - } - if (!n) - return (size_t) -2; - - if (!*ps) - { - c = (unsigned char) *s; - if (c < 0x80) - { - if (pwc) - *pwc = c; - return (c != 0); - } - else if (c < 0xc2) - { - errno = EILSEQ; - return (size_t) -1; - } - else if (c < 0xe0) - wc = ((c & 0x1f) << 6) + (count = 0); - else if (c < 0xf0) - wc = ((c & 0x0f) << 12) + (count = 1); - else if (c < 0xf8) - wc = ((c & 0x07) << 18) + (count = 2); - else if (c < 0xfc) - wc = ((c & 0x03) << 24) + (count = 3); - else if (c < 0xfe) - wc = ((c & 0x01) << 30) + (count = 4); - else - { - errno = EILSEQ; - return (size_t) -1; - } - s++; - n--; - k++; - } - else - { - wc = *ps & 0x7fffffff; - count = wc & 7; /* if count > 4 it will be caught below */ - } - - for (; n; ++s, --n, ++k) - { - c = (unsigned char) *s; - if (0x80 <= c && c < 0xc0) - { - wc |= (c & 0x3f) << (6 * count); - if (!count) - { - if (pwc) - *pwc = wc; - *ps = 0; - return wc ? k : 0; - } - count--; - wc--; - if (!(wc >> (11 + count * 5))) - { - errno = count < 4 ? EILSEQ : EINVAL; - return (size_t) -1; - } - } - else - { - errno = EILSEQ; - return (size_t) -1; - } - } - *ps = wc; - return (size_t) -2; -} - -/** - * wcrtomb_iconv - Convert wide characters to multibyte characters - * - * For systems that don't have them, we provide here our own implementations of - * wcrtomb(), mbrtowc(), iswprint() and wcwidth(). Instead of using the - * locale, as these functions normally would, we use Mutt's Charset variable. - * We support 3 types of charset: - * 1. For 8-bit charsets, wchar_t uses the same encoding as char. - * 2. For UTF-8, wchar_t uses UCS. - * 3. For stateless Japanese encodings, we use UCS and convert via UTF-8 using - * iconv. - * Unfortunately, we can't handle non-stateless encodings. - */ -static size_t wcrtomb_iconv(char *s, wchar_t wc, iconv_t cd) -{ - char buf[MB_LEN_MAX + 1]; - ICONV_CONST char *ib = NULL; - char *ob = NULL; - size_t ibl, obl; - - if (s) - { - ibl = mutt_wctoutf8(buf, wc, sizeof(buf)); - if (ibl == (size_t)(-1)) - return (size_t)(-1); - ib = buf; - ob = s; - obl = MB_LEN_MAX; - iconv(cd, &ib, &ibl, &ob, &obl); - } - else - { - ib = ""; - ibl = 1; - ob = buf; - obl = sizeof(buf); - iconv(cd, &ib, &ibl, &ob, &obl); - } - return ob - s; -} - -size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps) -{ - /* We only handle stateless encodings, so we can ignore ps. */ - - if (Charset_is_utf8) - return mutt_wctoutf8(s, wc, MB_LEN_MAX); - else if (charset_from_utf8 != (iconv_t)(-1)) - return wcrtomb_iconv(s, wc, charset_from_utf8); - else - { - if (!s) - return 1; - if (wc < 0x100) - { - *s = wc; - return 1; - } - errno = EILSEQ; - return (size_t)(-1); - } -} - -static size_t utf8rtowc(wchar_t *pwc, const char *s, size_t n, mbstate_t *_ps); - -static size_t mbrtowc_iconv(wchar_t *pwc, const char *s, size_t n, mbstate_t *ps, iconv_t cd) -{ - static mbstate_t mbstate; - ICONV_CONST char *ib = NULL, *ibmax = NULL; - char *ob = NULL, *t = NULL; - size_t ibl, obl, k, r; - char bufi[8], bufo[6]; - - if (!n) - return (size_t)(-2); - - t = memchr(ps, 0, sizeof(*ps)); - k = t ? (t - (char *) ps) : sizeof(*ps); - if (k > sizeof(bufi)) - k = 0; - if (k) - { - /* use the buffer for input */ - memcpy(bufi, ps, k); - ib = bufi; - ibmax = bufi + (k + n < sizeof(bufi) ? k + n : sizeof(bufi)); - memcpy(bufi + k, s, ibmax - bufi - k); - } - else - { - /* use the real input */ - ib = (ICONV_CONST char *) s; - ibmax = (ICONV_CONST char *) s + n; - } - - ob = bufo; - obl = sizeof(bufo); - ibl = 1; - - for (;;) - { - r = iconv(cd, &ib, &ibl, &ob, &obl); - if (ob > bufo && (!k || ib > bufi + k)) - { - /* we have a character */ - memset(ps, 0, sizeof(*ps)); - utf8rtowc(pwc, bufo, ob - bufo, &mbstate); - return (pwc && *pwc) ? (ib - (k ? bufi + k : s)) : 0; - } - else if (!r || (r == (size_t)(-1) && errno == EINVAL)) - { - if (ib + ibl < ibmax) - /* try using more input */ - ibl++; - else if (k && ib > bufi + k && bufi + k + n > ibmax) - { - /* switch to using real input */ - ib = (ICONV_CONST char *) s + (ib - bufi - k); - ibmax = (ICONV_CONST char *) s + n; - k = 0; - ibl++; - } - else - { - /* save the state and give up */ - memset(ps, 0, sizeof(*ps)); - if (ibl <= sizeof(mbstate_t)) /* need extra condition here! */ - memcpy(ps, ib, ibl); - return (size_t)(-2); - } - } - else - { - /* bad input */ - errno = EILSEQ; - return (size_t)(-1); - } - } -} - -size_t mbrtowc(wchar_t *pwc, const char *s, size_t n, mbstate_t *ps) -{ - static mbstate_t mbstate; - - if (!ps) - ps = &mbstate; - - if (Charset_is_utf8) - return utf8rtowc(pwc, s, n, ps); - else if (charset_to_utf8 != (iconv_t)(-1)) - return mbrtowc_iconv(pwc, s, n, ps, charset_to_utf8); - else - { - if (!s) - { - memset(ps, 0, sizeof(*ps)); - return 0; - } - if (!n) - return (size_t) -2; - if (pwc) - *pwc = (wchar_t)(unsigned char) *s; - return (*s != 0); - } -} - -int iswprint(wint_t wc) -{ - if (Charset_is_utf8 || charset_is_ja) - return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc); - else - return (0 <= wc && wc < 256) ? IsPrint(wc) : 0; -} - -int iswspace(wint_t wc) -{ - if (Charset_is_utf8 || charset_is_ja) - return (9 <= wc && wc <= 13) || wc == 32; - else - return (0 <= wc && wc < 256) ? isspace(wc) : 0; -} - -static wint_t towupper_ucs(wint_t x) -{ - /* Only works for x < 0x130 */ - if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7)) - return x - 32; - else if (0x100 <= x && x < 0x130) - return x & ~1; - else if (x == 0xb5) - return 0x39c; - else if (x == 0xff) - return 0x178; - else - return x; -} - -static int iswupper_ucs(wint_t x) -{ - /* Only works for x < 0x130 */ - if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7)) - return 0; - else if ((0x40 < x && x < 0x5b) || (0xbf < x && x < 0xde)) - return 1; - else if (0x100 <= x && x < 0x130) - return 1; - else if (x == 0xb5) - return 1; - else if (x == 0xff) - return 0; - else - return 0; -} - -static wint_t towlower_ucs(wint_t x) -{ - /* Only works for x < 0x130 */ - if ((0x40 < x && x < 0x5b) || (0xc0 <= x && x < 0xdf && x != 0xd7)) - return x + 32; - else if (0x100 <= x && x < 0x130) - return x | 1; - else - return x; -} - -static int iswalnum_ucs(wint_t wc) -{ - /* Only works for x < 0x220 */ - if (wc >= 0x100) - return 1; - else if (wc < 0x30) - return 0; - else if (wc < 0x3a) - return 1; - else if (wc < 0xa0) - return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b); - else if (wc < 0xc0) - return (wc == 0xaa || wc == 0xb5 || wc == 0xba); - else - return !(wc == 0xd7 || wc == 0xf7); -} - -static int iswalpha_ucs(wint_t wc) -{ - /* Only works for x < 0x220 */ - if (wc >= 0x100) - return 1; - else if (wc < 0x3a) - return 0; - else if (wc < 0xa0) - return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b); - else if (wc < 0xc0) - return (wc == 0xaa || wc == 0xb5 || wc == 0xba); - else - return !(wc == 0xd7 || wc == 0xf7); -} - -wint_t towupper(wint_t wc) -{ - if (Charset_is_utf8 || charset_is_ja) - return towupper_ucs(wc); - else - return (0 <= wc && wc < 256) ? toupper(wc) : wc; -} - -wint_t towlower(wint_t wc) -{ - if (Charset_is_utf8 || charset_is_ja) - return towlower_ucs(wc); - else - return (0 <= wc && wc < 256) ? tolower(wc) : wc; -} - -int iswalnum(wint_t wc) -{ - if (Charset_is_utf8 || charset_is_ja) - return iswalnum_ucs(wc); - else - return (0 <= wc && wc < 256) ? isalnum(wc) : 0; -} - -int iswalpha(wint_t wc) -{ - if (Charset_is_utf8 || charset_is_ja) - return iswalpha_ucs(wc); - else - return (0 <= wc && wc < 256) ? isalpha(wc) : 0; -} - -int iswupper(wint_t wc) -{ - if (Charset_is_utf8 || charset_is_ja) - return iswupper_ucs(wc); - else - return (0 <= wc && wc < 256) ? isupper(wc) : 0; -} - -/** - * wcwidth_ja - Calculate character widths for Japanese - * - * L10N for Japanese: - * Symbols, Greek and Cyrillic in JIS X 0208, Japanese Kanji - * Character Set, have a column width of 2. - */ -static int wcwidth_ja(wchar_t ucs) -{ - if (ucs >= 0x3021) - return -1; /* continue with the normal check */ - /* a rough range for quick check */ - if ((ucs >= 0x00a1 && ucs <= 0x00fe) || /* Latin-1 Supplement */ - (ucs >= 0x0391 && ucs <= 0x0451) || /* Greek and Cyrillic */ - (ucs >= 0x2010 && ucs <= 0x266f) || /* Symbols */ - (ucs >= 0x3000 && ucs <= 0x3020)) /* CJK Symbols and Punctuation */ - return 2; - else - return -1; -} - -int wcwidth_ucs(wchar_t ucs); - -int wcwidth(wchar_t wc) -{ - if (!Charset_is_utf8) - { - if (!charset_is_ja) - { - /* 8-bit case */ - if (!wc) - return 0; - else if ((0 <= wc && wc < 256) && IsPrint(wc)) - return 1; - else - return -1; - } - else - { - /* Japanese */ - int k = wcwidth_ja(wc); - if (k != -1) - return k; - } - } - return wcwidth_ucs(wc); -} - -#endif /* !HAVE_WC_FUNCS */ - wchar_t replacement_char(void) { return Charset_is_utf8 ? 0xfffd : '?'; diff --git a/mbyte.h b/mbyte.h index a72748b9d..12e45e3f1 100644 --- a/mbyte.h +++ b/mbyte.h @@ -25,40 +25,6 @@ #include #include -#ifndef HAVE_WC_FUNCS -#ifdef towupper -#undef towupper -#endif -#ifdef towlower -#undef towlower -#endif -#ifdef iswprint -#undef iswprint -#endif -#ifdef iswspace -#undef iswspace -#endif -#ifdef iswalnum -#undef iswalnum -#endif -#ifdef iswalpha -#undef iswalpha -#endif -#ifdef iswupper -#undef iswupper -#endif -size_t wcrtomb(char *s, wchar_t wc, mbstate_t *ps); -size_t mbrtowc(wchar_t *pwc, const char *s, size_t n, mbstate_t *ps); -int iswprint(wint_t wc); -int iswspace(wint_t wc); -int iswalnum(wint_t wc); -int iswalpha(wint_t wc); -int iswupper(wint_t wc); -wint_t towupper(wint_t wc); -wint_t towlower(wint_t wc); -int wcwidth(wchar_t wc); -#endif /* !HAVE_WC_FUNCS */ - void mutt_set_charset(char *charset); extern int Charset_is_utf8; wchar_t replacement_char(void); diff --git a/mutt.h b/mutt.h index 683604b6a..cfef4a578 100644 --- a/mutt.h +++ b/mutt.h @@ -47,13 +47,6 @@ struct State; #define PATH_MAX _POSIX_PATH_MAX #endif -#ifndef HAVE_WC_FUNCS -#ifdef MB_LEN_MAX -#undef MB_LEN_MAX -#endif -#define MB_LEN_MAX 16 -#endif - #ifdef HAVE_FGETS_UNLOCKED #define fgets fgets_unlocked #endif diff --git a/po/POTFILES.in b/po/POTFILES.in index 630dd9cd2..bce6415a2 100644 --- a/po/POTFILES.in +++ b/po/POTFILES.in @@ -134,7 +134,5 @@ system.c thread.c txt2c.c url.c -utf8.c version.c wcscasecmp.c -wcwidth.c diff --git a/utf8.c b/utf8.c deleted file mode 100644 index cb00fadee..000000000 --- a/utf8.c +++ /dev/null @@ -1,94 +0,0 @@ -/** - * @file - * For systems lacking wide character functions - * - * @authors - * @copyright - * This program is free software: you can redistribute it and/or modify it under - * the terms of the GNU General Public License as published by the Free Software - * Foundation, either version 2 of the License, or (at your option) any later - * version. - * - * This program is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS - * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License along with - * this program. If not, see . - */ - -#include "config.h" -#include -#include - -#ifndef EILSEQ -#define EILSEQ EINVAL -#endif - -int mutt_wctoutf8(char *s, unsigned int c, size_t buflen) -{ - if (c < (1 << 7)) - { - if (s && buflen >= 1) - *s++ = c; - return 1; - } - else if (c < (1 << 11)) - { - if (s && buflen >= 2) - { - *s++ = 0xc0 | (c >> 6); - *s++ = 0x80 | (c & 0x3f); - } - return 2; - } - else if (c < (1 << 16)) - { - if (s && buflen >= 3) - { - *s++ = 0xe0 | (c >> 12); - *s++ = 0x80 | ((c >> 6) & 0x3f); - *s++ = 0x80 | (c & 0x3f); - } - return 3; - } - else if (c < (1 << 21)) - { - if (s && buflen >= 4) - { - *s++ = 0xf0 | (c >> 18); - *s++ = 0x80 | ((c >> 12) & 0x3f); - *s++ = 0x80 | ((c >> 6) & 0x3f); - *s++ = 0x80 | (c & 0x3f); - } - return 4; - } - else if (c < (1 << 26)) - { - if (s && buflen >= 5) - { - *s++ = 0xf8 | (c >> 24); - *s++ = 0x80 | ((c >> 18) & 0x3f); - *s++ = 0x80 | ((c >> 12) & 0x3f); - *s++ = 0x80 | ((c >> 6) & 0x3f); - *s++ = 0x80 | (c & 0x3f); - } - return 5; - } - else if (c < (1 << 31)) - { - if (s && buflen >= 6) - { - *s++ = 0xfc | (c >> 30); - *s++ = 0x80 | ((c >> 24) & 0x3f); - *s++ = 0x80 | ((c >> 18) & 0x3f); - *s++ = 0x80 | ((c >> 12) & 0x3f); - *s++ = 0x80 | ((c >> 6) & 0x3f); - *s++ = 0x80 | (c & 0x3f); - } - return 6; - } - errno = EILSEQ; - return -1; -} diff --git a/version.c b/version.c index a66711313..3e7141869 100644 --- a/version.c +++ b/version.c @@ -285,11 +285,6 @@ static struct CompileOptions comp_opts[] = { { "typeahead", 1 }, #else { "typeahead", 0 }, -#endif -#ifdef HAVE_WC_FUNCS - { "wc_funcs", 1 }, -#else - { "wc_funcs", 0 }, #endif { NULL, 0 }, }; diff --git a/wcwidth.c b/wcwidth.c deleted file mode 100644 index 693ed06ab..000000000 --- a/wcwidth.c +++ /dev/null @@ -1,162 +0,0 @@ -/** - * @file - * For systems lacking wide character functions - * - * This is an implementation of wcwidth() and wcswidth() (defined in - * IEEE Std 1002.1-2001) for Unicode. - * - * http://www.opengroup.org/onlinepubs/007904975/functions/wcwidth.html - * http://www.opengroup.org/onlinepubs/007904975/functions/wcswidth.html - * - * Markus Kuhn -- 2007-05-26 (Unicode 5.0) - * - * Permission to use, copy, modify, and distribute this software - * for any purpose and without fee is hereby granted. The author - * disclaims all warranties with regard to this software. - * - * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c - */ - -/* Changes made for mutt: - * - Adapted for Mutt by Edmund Grimley Evans. - * - Changed 'first'/'last' members of combined[] to wchar_t from - * unsigned short to fix compiler warnings, 2007-11-13, Rocco Rutte - */ - -#include "config.h" -#include - -/* The following two functions define the column width of an ISO 10646 - * character as follows: - * - * - The null character (U+0000) has a column width of 0. - * - * - Other C0/C1 control characters and DEL will lead to a return - * value of -1. - * - * - Non-spacing and enclosing combining characters (general - * category code Mn or Me in the Unicode database) have a - * column width of 0. - * - * - SOFT HYPHEN (U+00AD) has a column width of 1. - * - * - Other format characters (general category code Cf in the Unicode - * database) and ZERO WIDTH SPACE (U+200B) have a column width of 0. - * - * - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF) - * have a column width of 0. - * - * - Spacing characters in the East Asian Wide (W) or East Asian - * Full-width (F) category as defined in Unicode Technical - * Report #11 have a column width of 2. - * - * - All remaining characters (including all printable - * ISO 8859-1 and WGL4 characters, Unicode control characters, - * etc.) have a column width of 1. - * - * This implementation assumes that wchar_t characters are encoded - * in ISO 10646. - */ - -int wcwidth_ucs(wchar_t ucs) -{ - /* sorted list of non-overlapping intervals of non-spacing characters */ - /* generated by "uniset +cat=Me +cat=Mn +cat=Cf -00AD +1160-11FF +200B c" */ - static const struct interval - { - wchar_t first; - wchar_t last; - } combining[] = { - { 0x0300, 0x036f }, { 0x0483, 0x0486 }, { 0x0488, 0x0489 }, - { 0x0591, 0x05bd }, { 0x05bf, 0x05bf }, { 0x05c1, 0x05c2 }, - { 0x05c4, 0x05c5 }, { 0x05c7, 0x05c7 }, { 0x0600, 0x0603 }, - { 0x0610, 0x0615 }, { 0x064b, 0x065e }, { 0x0670, 0x0670 }, - { 0x06d6, 0x06e4 }, { 0x06e7, 0x06e8 }, { 0x06ea, 0x06ed }, - { 0x070f, 0x070f }, { 0x0711, 0x0711 }, { 0x0730, 0x074a }, - { 0x07a6, 0x07b0 }, { 0x07eb, 0x07f3 }, { 0x0901, 0x0902 }, - { 0x093c, 0x093c }, { 0x0941, 0x0948 }, { 0x094d, 0x094d }, - { 0x0951, 0x0954 }, { 0x0962, 0x0963 }, { 0x0981, 0x0981 }, - { 0x09bc, 0x09bc }, { 0x09c1, 0x09c4 }, { 0x09cd, 0x09cd }, - { 0x09e2, 0x09e3 }, { 0x0a01, 0x0a02 }, { 0x0a3c, 0x0a3c }, - { 0x0a41, 0x0a42 }, { 0x0a47, 0x0a48 }, { 0x0a4b, 0x0a4d }, - { 0x0a70, 0x0a71 }, { 0x0a81, 0x0a82 }, { 0x0abc, 0x0abc }, - { 0x0ac1, 0x0ac5 }, { 0x0ac7, 0x0ac8 }, { 0x0acd, 0x0acd }, - { 0x0ae2, 0x0ae3 }, { 0x0b01, 0x0b01 }, { 0x0b3c, 0x0b3c }, - { 0x0b3f, 0x0b3f }, { 0x0b41, 0x0b43 }, { 0x0b4d, 0x0b4d }, - { 0x0b56, 0x0b56 }, { 0x0b82, 0x0b82 }, { 0x0bc0, 0x0bc0 }, - { 0x0bcd, 0x0bcd }, { 0x0c3e, 0x0c40 }, { 0x0c46, 0x0c48 }, - { 0x0c4a, 0x0c4d }, { 0x0c55, 0x0c56 }, { 0x0cbc, 0x0cbc }, - { 0x0cbf, 0x0cbf }, { 0x0cc6, 0x0cc6 }, { 0x0ccc, 0x0ccd }, - { 0x0ce2, 0x0ce3 }, { 0x0d41, 0x0d43 }, { 0x0d4d, 0x0d4d }, - { 0x0dca, 0x0dca }, { 0x0dd2, 0x0dd4 }, { 0x0dd6, 0x0dd6 }, - { 0x0e31, 0x0e31 }, { 0x0e34, 0x0e3a }, { 0x0e47, 0x0e4e }, - { 0x0eb1, 0x0eb1 }, { 0x0eb4, 0x0eb9 }, { 0x0ebb, 0x0ebc }, - { 0x0ec8, 0x0ecd }, { 0x0f18, 0x0f19 }, { 0x0f35, 0x0f35 }, - { 0x0f37, 0x0f37 }, { 0x0f39, 0x0f39 }, { 0x0f71, 0x0f7e }, - { 0x0f80, 0x0f84 }, { 0x0f86, 0x0f87 }, { 0x0f90, 0x0f97 }, - { 0x0f99, 0x0fbc }, { 0x0fc6, 0x0fc6 }, { 0x102d, 0x1030 }, - { 0x1032, 0x1032 }, { 0x1036, 0x1037 }, { 0x1039, 0x1039 }, - { 0x1058, 0x1059 }, { 0x1160, 0x11ff }, { 0x135f, 0x135f }, - { 0x1712, 0x1714 }, { 0x1732, 0x1734 }, { 0x1752, 0x1753 }, - { 0x1772, 0x1773 }, { 0x17b4, 0x17b5 }, { 0x17b7, 0x17bd }, - { 0x17c6, 0x17c6 }, { 0x17c9, 0x17d3 }, { 0x17dd, 0x17dd }, - { 0x180b, 0x180d }, { 0x18a9, 0x18a9 }, { 0x1920, 0x1922 }, - { 0x1927, 0x1928 }, { 0x1932, 0x1932 }, { 0x1939, 0x193b }, - { 0x1a17, 0x1a18 }, { 0x1b00, 0x1b03 }, { 0x1b34, 0x1b34 }, - { 0x1b36, 0x1b3a }, { 0x1b3c, 0x1b3c }, { 0x1b42, 0x1b42 }, - { 0x1b6b, 0x1b73 }, { 0x1dc0, 0x1dca }, { 0x1dfe, 0x1dff }, - { 0x200b, 0x200f }, { 0x202a, 0x202e }, { 0x2060, 0x2063 }, - { 0x206a, 0x206f }, { 0x20d0, 0x20ef }, { 0x302a, 0x302f }, - { 0x3099, 0x309a }, { 0xa806, 0xa806 }, { 0xa80b, 0xa80b }, - { 0xa825, 0xa826 }, { 0xfb1e, 0xfb1e }, { 0xfe00, 0xfe0f }, - { 0xfe20, 0xfe23 }, { 0xfeff, 0xfeff }, { 0xfff9, 0xfffb }, - { 0x10a01, 0x10a03 }, { 0x10a05, 0x10a06 }, { 0x10a0c, 0x10a0f }, - { 0x10a38, 0x10a3a }, { 0x10a3f, 0x10a3f }, { 0x1d167, 0x1d169 }, - { 0x1d173, 0x1d182 }, { 0x1d185, 0x1d18b }, { 0x1d1aa, 0x1d1ad }, - { 0x1d242, 0x1d244 }, { 0xe0001, 0xe0001 }, { 0xe0020, 0xe007f }, - { 0xe0100, 0xe01ef }, - }; - int min = 0; - int max = sizeof(combining) / sizeof(struct interval) - 1; - int mid; - - /* test for 8-bit control characters */ - if (ucs == 0) - return 0; - if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0)) - return -1; - - /* first quick check for Latin-1 etc. characters */ - if (ucs < combining[0].first) - return 1; - - /* binary search in table of non-spacing characters */ - while (max >= min) - { - mid = (min + max) / 2; - if (combining[mid].last < ucs) - min = mid + 1; - else if (combining[mid].first > ucs) - max = mid - 1; - else if (combining[mid].first <= ucs && combining[mid].last >= ucs) - return 0; - } - - /* if we arrive here, ucs is not a combining or C0/C1 control character */ - - /* fast test for majority of non-wide scripts */ - if (ucs < 0x1100) - return 1; - - return 1 + (ucs >= 0x1100 && - (ucs <= 0x115f || /* Hangul Jamo init. consonants */ - ucs == 0x2329 || ucs == 0x232a || - (ucs >= 0x2e80 && ucs <= 0xa4cf && ucs != 0x303f) || /* CJK ... Yi */ - (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */ - (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility Ideographs */ - (ucs >= 0xfe10 && ucs <= 0xfe19) || /* Vertical forms */ - (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */ - (ucs >= 0xff00 && ucs <= 0xff60) || /* Fullwidth Forms */ - (ucs >= 0xffe0 && ucs <= 0xffe6) || (ucs >= 0x20000 && ucs <= 0x2fffd) || - (ucs >= 0x30000 && ucs <= 0x3fffd))); -} -- 2.40.0