From 264790d80dc9aa8110cc987deb4f6fb18a4bb776 Mon Sep 17 00:00:00 2001
From: Jim Warner <james.warner@comcast.net>
Date: Mon, 8 Jan 2018 00:00:00 -0600
Subject: [PATCH] top: adapt utf8 logic to support extra wide characters

Back when top was refactored to support UTF-8 encoding
it was acknowledged that languages like zh_CN were not
supported. That was because a single 'character' might
require more than a single 'column' when it's printed.

Well I've now figured out how to accommodate languages
like that. My adaptation is represented in this patch.

[ and just in case someone wishes to avoid the extra ]
[ runtime costs, a #define OFF_XTRAWIDE is included. ]

Along the way, I've cleaned up some miscellaneous code
supporting the 'Inspect' feature so that the rightmost
screen column was always used rather than being blank.

[ interestingly, my xterm & urxvt terminal emulators ]
[ are able to split extra wide characters then print ]
[ 1/2 of such graphics in the last column. the gnome ]
[ terminal emulator does not duplicate such behavior ]
[ but prints 1 extra character in same width window. ]

Reference(s):
. Sep, 2017 - original utf8 support
commit 9773c56add6446d418c0677f306c8771356f0c01

Signed-off-by: Jim Warner <james.warner@comcast.net>
---
 top/top.c | 57 +++++++++++++++++++++++++++++++++++++------------------
 top/top.h |  2 ++
 2 files changed, 41 insertions(+), 18 deletions(-)

diff --git a/top/top.c b/top/top.c
index 25d5ec88..6e0983ef 100644
--- a/top/top.c
+++ b/top/top.c
@@ -37,6 +37,7 @@
 #include <termios.h>
 #include <time.h>
 #include <unistd.h>
+#include <wchar.h>
 
 #include <sys/ioctl.h>
 #include <sys/resource.h>
@@ -504,6 +505,24 @@ static char UTF8_tab[] = {
 };                                                 //            ( 0xF5 & beyond invalid )
 
 
+        /*
+         * Return the screen columns (always >= 1) needed to print the
+         * multibyte character at p, examining at most n bytes of it */
+static inline int utf8_cols (const unsigned char *p, int n) {
+#ifndef OFF_XTRAWIDE
+   wchar_t wc;
+   int wlen;
+   // a failed decode never sets wc, so treat that as 1 column ...
+   if (n < 1 || mbtowc(&wc, (const char *)p, n) < 1) return 1;
+   if ((wlen = wcwidth(wc)) < 1) wlen = 1;
+   return wlen;
+#else
+   (void)p; (void)n;
+   return 1;
+#endif
+} // end: utf8_cols
+
+
         /*
          * Determine difference between total bytes versus printable
          * characters in that passed, potentially multi-byte, string */
@@ -514,8 +533,8 @@ static int utf8_delta (const char *str) {
    while (*p) {
       // -1 represents a decoding error, pretend it's untranslated ...
       if (0 > (clen = UTF8_tab[*p])) return 0;
+      cnum += utf8_cols(p, clen);
       p += clen;
-      ++cnum;
    }
    return (int)((const char *)p - str) - cnum;
 } // end: utf8_delta
@@ -532,8 +551,8 @@ static int utf8_embody (const char *str, int width) {
       while (*p) {
          // -1 represents a decoding error, pretend it's untranslated ...
          if (0 > (clen = UTF8_tab[*p])) return width;
+         if (width < (cnum += utf8_cols(p, clen))) break;
          p += clen;
-         if (++cnum >= width) break;
       }
    }
    return (int)((const char *)p - str);
@@ -2636,15 +2655,15 @@ static void insp_find_str (int ch, int *col, int *row) {
          * while visible search matches display with capclr_hdr for emphasis.
          * ( we hide ugly plumbing in macros to concentrate on the algorithm ) */
 static void insp_mkrow_raw (int col, int row) {
- #define maxSZ ( Screen_cols - (to + 1) )
+ #define maxSZ ( Screen_cols - to )
  #define capNO { if (hicap) { putp(Caps_off); hicap = 0; } }
  #define mkFND { PUTT("%s%.*s%s", Curwin->capclr_hdr, maxSZ, Insp_sel->fstr, Caps_off); \
     fr += Insp_sel->flen -1; to += Insp_sel->flen; hicap = 0; }
 #ifndef INSP_JUSTNOT
- #define mkCTL { int x = maxSZ; const char *p = fmtmk("^%c", uch + '@'); \
-    PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", x, p); to += 2; hicap = 1; }
- #define mkUNP { int x = maxSZ; const char *p = fmtmk("<%02X>", uch); \
-    PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", x, p); to += 4; hicap = 1; }
+ #define mkCTL { const char *p = fmtmk("^%c", uch + '@'); \
+    PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", maxSZ, p); to += 2; hicap = 1; }
+ #define mkUNP { const char *p = fmtmk("<%02X>", uch); \
+    PUTT("%s%.*s", (!hicap) ? Curwin->capclr_msg : "", maxSZ, p); to += 4; hicap = 1; }
 #else
  #define mkCTL { if ((to += 2) <= Screen_cols) \
     PUTT("%s^%c", (!hicap) ? Curwin->capclr_msg : "", uch + '@'); hicap = 1; }
@@ -2653,7 +2672,7 @@ static void insp_mkrow_raw (int col, int row) {
 #endif
  #define mkSTD { capNO; if (++to <= Screen_cols) { static char _str[2]; \
     _str[0] = uch; putp(_str); } }
-   char tline[SCREENMAX];
+   unsigned char tline[SCREENMAX];
    int fr, to, ofs;
    int hicap = 0;
 
@@ -2661,7 +2680,7 @@ static void insp_mkrow_raw (int col, int row) {
       memcpy(tline, Insp_p[row] + col, sizeof(tline));
    else tline[0] = '\n';
 
-   for (fr = 0, to = 0, ofs = 0; to < Screen_cols -1; fr++) {
+   for (fr = 0, to = 0, ofs = 0; to < Screen_cols; fr++) {
       if (!ofs)
          ofs = insp_find_ofs(col + fr, row);
       if (col + fr < ofs) {
@@ -2694,20 +2713,20 @@ static void insp_mkrow_raw (int col, int row) {
          * characters will then be displayed in two positions like '^A'.
          * ( assuming they can even get past those 'gettext' utilities ) */
 static void insp_mkrow_utf8 (int col, int row) {
- #define maxSZ ( Screen_cols - (to + 1) )
+ #define maxSZ ( Screen_cols - to )
  #define mkFND { PUTT("%s%.*s%s", Curwin->capclr_hdr, maxSZ, Insp_sel->fstr, Caps_off); \
     fr += Insp_sel->flen; to += Insp_sel->flen; }
 #ifndef INSP_JUSTNOT
- #define mkCTL { int x = maxSZ; const char *p = fmtmk("^%c", uch + '@'); \
-    PUTT("%s%.*s%s", Curwin->capclr_msg, x, p, Caps_off); to += 2; }
+ #define mkCTL { const char *p = fmtmk("^%c", uch + '@'); \
+    PUTT("%s%.*s%s", Curwin->capclr_msg, maxSZ, p, Caps_off); to += 2; }
 #else
  #define mkCTL { if ((to += 2) <= Screen_cols) \
     PUTT("%s^%c%s", Curwin->capclr_msg, uch + '@', Caps_off); }
 #endif
  #define mkNUL { buf1[0] = ' '; doPUT(buf1) }
- #define doPUT(buf) if (++to <= Screen_cols) putp(buf);
+ #define doPUT(buf) if ((to += cno) <= Screen_cols) putp(buf);
    static char buf1[2], buf2[3], buf3[4], buf4[5];
-   char tline[BIGBUFSIZ];
+   unsigned char tline[BIGBUFSIZ];
    int fr, to, ofs;
 
    col = utf8_proper_col(Insp_p[row], col, 1);
@@ -2715,15 +2734,17 @@ static void insp_mkrow_utf8 (int col, int row) {
       memcpy(tline, Insp_p[row] + col, sizeof(tline));
    else tline[0] = '\n';
 
-   for (fr = 0, to = 0, ofs = 0; to < Screen_cols -1; ) {
+   for (fr = 0, to = 0, ofs = 0; to < Screen_cols; ) {
       if (!ofs)
          ofs = insp_find_ofs(col + fr, row);
       if (col + fr < ofs) {
-         unsigned char uch = tline[fr++];
-         switch (UTF8_tab[(int)uch]) {
+         unsigned char uch = tline[fr];
+         int bno = UTF8_tab[uch];
+         int cno = utf8_cols(&tline[fr++], bno);
+         switch (bno) {
             case 1:
                if (uch == '\n') break;
-               else if (uch < 32) mkCTL
+               if (uch < 32) mkCTL
                else if (uch == 127) mkNUL
                else { buf1[0] = uch; doPUT(buf1) }
                break;
diff --git a/top/top.h b/top/top.h
index 214f6076..456d5909 100644
--- a/top/top.h
+++ b/top/top.h
@@ -39,6 +39,7 @@
 //#define OFF_SCROLLBK            /* disable tty emulators scrollback buffer */
 //#define OFF_STDERROR            /* disable our stderr buffering (redirect) */
 //#define OFF_STDIOLBF            /* disable our own stdout _IOFBF override  */
+//#define OFF_XTRAWIDE            /* disable our extra wide multi-byte logic */
 //#define PRETENDNOCAP            /* use a terminal without essential caps   */
 //#define QUICK_GRAPHS            /* use fast algorithm, accept +2% distort  */
 //#define RCFILE_NOERR            /* rcfile errs silently default, vs. fatal */
@@ -553,6 +554,7 @@ typedef struct WIN_t {
 //atic void          sig_resize (int dont_care_sig);
 /*------  Special UTF-8 Multi-Byte support  ------------------------------*/
 /*atic char          UTF8_tab[] = { ... }                                 */
+//atic inline int    utf8_cols (const unsigned char *p, int n);
 //atic int           utf8_delta (const char *str);
 //atic int           utf8_embody (const char *str, int width);
 //atic const char   *utf8_justify (const char *str, int width, int justr);
-- 
2.40.0