From 264790d80dc9aa8110cc987deb4f6fb18a4bb776 Mon Sep 17 00:00:00 2001 From: Jim Warner Date: Mon, 8 Jan 2018 00:00:00 -0600 Subject: [PATCH] top: adapt utf8 logic to support extra wide characters Back when top was refactored to support UTF-8 encoding it was acknowledged that languages like zh_CN were not supported. That was because a single 'character' might require more than a single 'column' when it's printed. Well I've now figured out how to accommodate languages like that. My adaptation is represented in this patch. [ and just in case someone wishes to avoid the extra ] [ runtime costs, a #define OFF_XTRAWIDE is included. ] Along the way, I've cleaned up some miscellaneous code supporting the 'Inspect' feature so that the rightmost screen column was always used rather than being blank. [ interestingly, my xterm & urxvt terminal emulators ] [ are able to split extra wide characters then print ] [ 1/2 of such graphics in the last column. the gnome ] [ terminal emulator does not duplicate such behavior ] [ but prints 1 extra character in same width window. ] Reference(s): . Sep, 2017 - original utf8 support commit 9773c56add6446d418c0677f306c8771356f0c01 Signed-off-by: Jim Warner --- top/top.c | 57 +++++++++++++++++++++++++++++++++++++------------------ top/top.h | 2 ++ 2 files changed, 41 insertions(+), 18 deletions(-) diff --git a/top/top.c b/top/top.c index 25d5ec88..6e0983ef 100644 --- a/top/top.c +++ b/top/top.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -504,6 +505,24 @@ static char UTF8_tab[] = { }; // ( 0xF5 & beyond invalid ) + /* + * Accommodate any potential differences between some multibyte + * character sequence and the screen columns needed to print it */ +static inline int utf8_cols (const unsigned char *p, int n) { +#ifndef OFF_XTRAWIDE + wchar_t wc; + int wlen; + + (void)mbtowc(&wc, (const char *)p, n); + if ((wlen = wcwidth(wc)) < 1) wlen = 1; + return wlen; +#else + (void)p; (void)n; + return 1; +#endif +} // end: utf8_cols + + /* * Determine difference between total bytes versus printable * characters in that passed, potentially multi-byte, string */ @@ -514,8 +533,8 @@ static int utf8_delta (const char *str) { while (*p) { // -1 represents a decoding error, pretend it's untranslated ... if (0 > (clen = UTF8_tab[*p])) return 0; + cnum += utf8_cols(p, clen); p += clen; - ++cnum; } return (int)((const char *)p - str) - cnum; } // end: utf8_delta @@ -532,8 +551,8 @@ static int utf8_embody (const char *str, int width) { while (*p) { // -1 represents a decoding error, pretend it's untranslated ... if (0 > (clen = UTF8_tab[*p])) return width; + if (width < (cnum += utf8_cols(p, clen))) break; p += clen; - if (++cnum >= width) break; } } return (int)((const char *)p - str); @@ -2636,15 +2655,15 @@ static void insp_find_str (int ch, int *col, int *row) { * while visible search matches display with capclr_hdr for emphasis. * ( we hide ugly plumbing in macros to concentrate on the algorithm ) */ static void insp_mkrow_raw (int col, int row) { - #define maxSZ ( Screen_cols - (to + 1) ) + #define maxSZ ( Screen_cols - to ) #define capNO { if (hicap) { putp(Caps_off); hicap = 0; } } #define mkFND { PUTT("%s%.*s%s", Curwin->capclr_hdr, maxSZ, Insp_sel->fstr, Caps_off); \ fr += Insp_sel->flen -1; to += Insp_sel->flen; hicap = 0; } #ifndef INSP_JUSTNOT - #define mkCTL { int x = maxSZ; const char *p = fmtmk("^%c", uch + '@'); \ - PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", x, p); to += 2; hicap = 1; } - #define mkUNP { int x = maxSZ; const char *p = fmtmk("<%02X>", uch); \ - PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", x, p); to += 4; hicap = 1; } + #define mkCTL { const char *p = fmtmk("^%c", uch + '@'); \ + PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", maxSZ, p); to += 2; hicap = 1; } + #define mkUNP { const char *p = fmtmk("<%02X>", uch); \ + PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", maxSZ, p); to += 4; hicap = 1; } #else #define mkCTL { if ((to += 2) <= Screen_cols) \ PUTT("%s^%c", (!hicap) ? Curwin->capclr_msg : "", uch + '@'); hicap = 1; } @@ -2653,7 +2672,7 @@ static void insp_mkrow_raw (int col, int row) { #endif #define mkSTD { capNO; if (++to <= Screen_cols) { static char _str[2]; \ _str[0] = uch; putp(_str); } } - char tline[SCREENMAX]; + unsigned char tline[SCREENMAX]; int fr, to, ofs; int hicap = 0; @@ -2661,7 +2680,7 @@ static void insp_mkrow_raw (int col, int row) { memcpy(tline, Insp_p[row] + col, sizeof(tline)); else tline[0] = '\n'; - for (fr = 0, to = 0, ofs = 0; to < Screen_cols -1; fr++) { + for (fr = 0, to = 0, ofs = 0; to < Screen_cols; fr++) { if (!ofs) ofs = insp_find_ofs(col + fr, row); if (col + fr < ofs) { @@ -2694,20 +2713,20 @@ static void insp_mkrow_raw (int col, int row) { * characters will then be displayed in two positions like '^A'. * ( assuming they can even get past those 'gettext' utilities ) */ static void insp_mkrow_utf8 (int col, int row) { - #define maxSZ ( Screen_cols - (to + 1) ) + #define maxSZ ( Screen_cols - to ) #define mkFND { PUTT("%s%.*s%s", Curwin->capclr_hdr, maxSZ, Insp_sel->fstr, Caps_off); \ fr += Insp_sel->flen; to += Insp_sel->flen; } #ifndef INSP_JUSTNOT - #define mkCTL { int x = maxSZ; const char *p = fmtmk("^%c", uch + '@'); \ - PUTT("%s%.*s%s", Curwin->capclr_msg, x, p, Caps_off); to += 2; } + #define mkCTL { const char *p = fmtmk("^%c", uch + '@'); \ + PUTT("%s%.*s%s", Curwin->capclr_msg, maxSZ, p, Caps_off); to += 2; } #else #define mkCTL { if ((to += 2) <= Screen_cols) \ PUTT("%s^%c%s", Curwin->capclr_msg, uch + '@', Caps_off); } #endif #define mkNUL { buf1[0] = ' '; doPUT(buf1) } - #define doPUT(buf) if (++to <= Screen_cols) putp(buf); + #define doPUT(buf) if ((to += cno) <= Screen_cols) putp(buf); static char buf1[2], buf2[3], buf3[4], buf4[5]; - char tline[BIGBUFSIZ]; + unsigned char tline[BIGBUFSIZ]; int fr, to, ofs; col = utf8_proper_col(Insp_p[row], col, 1); @@ -2715,15 +2734,17 @@ static void insp_mkrow_utf8 (int col, int row) { memcpy(tline, Insp_p[row] + col, sizeof(tline)); else tline[0] = '\n'; - for (fr = 0, to = 0, ofs = 0; to < Screen_cols -1; ) { + for (fr = 0, to = 0, ofs = 0; to < Screen_cols; ) { if (!ofs) ofs = insp_find_ofs(col + fr, row); if (col + fr < ofs) { - unsigned char uch = tline[fr++]; - switch (UTF8_tab[(int)uch]) { + unsigned char uch = tline[fr]; + int bno = UTF8_tab[uch]; + int cno = utf8_cols(&tline[fr++], bno); + switch (bno) { case 1: if (uch == '\n') break; - else if (uch < 32) mkCTL + if (uch < 32) mkCTL else if (uch == 127) mkNUL else { buf1[0] = uch; doPUT(buf1) } break; diff --git a/top/top.h b/top/top.h index 214f6076..456d5909 100644 --- a/top/top.h +++ b/top/top.h @@ -39,6 +39,7 @@ //#define OFF_SCROLLBK /* disable tty emulators scrollback buffer */ //#define OFF_STDERROR /* disable our stderr buffering (redirect) */ //#define OFF_STDIOLBF /* disable our own stdout _IOFBF override */ +//#define OFF_XTRAWIDE /* disable our extra wide multi-byte logic */ //#define PRETENDNOCAP /* use a terminal without essential caps */ //#define QUICK_GRAPHS /* use fast algorithm, accept +2% distort */ //#define RCFILE_NOERR /* rcfile errs silently default, vs. fatal */ @@ -553,6 +554,7 @@ typedef struct WIN_t { //atic void sig_resize (int dont_care_sig); /*------ Special UTF-8 Multi-Byte support ------------------------------*/ /*atic char UTF8_tab[] = { ... } */ +//atic inline int utf8_cols (const unsigned char *p, int n); //atic int utf8_delta (const char *str); //atic int utf8_embody (const char *str, int width); //atic const char *utf8_justify (const char *str, int width, int justr); -- 2.40.0