granicus.if.org Git - graphviz/commitdiff
UTF-8 codes up to 4 bytes
author: Glen Low <glen.low@pixelglow.com>
Thu, 5 Dec 2013 11:00:31 +0000 (19:00 +0800)
committer: Glen Low <glen.low@pixelglow.com>
Thu, 5 Dec 2013 11:00:31 +0000 (19:00 +0800)
* Refactor for arbitrary number of bytes

lib/common/utils.c

index efbb3f49a1115e97a74350b75fadc823883559e0..2bd0fe7989f4bc5542124524bf323cfade6084dd 100644 (file)
@@ -1410,6 +1410,9 @@ char* htmlEntityUTF8 (char* s, graph_t* g)
     unsigned int v;
     int ignored;
 
+    int uc;
+    int ui;
+
     NOTUSED(ignored);
 
     if (lastg != g) {
@@ -1420,14 +1423,30 @@ char* htmlEntityUTF8 (char* s, graph_t* g)
     agxbinit(&xb, BUFSIZ, buf);
 
     while ((c = *(unsigned char*)s++)) {
-        if (c < 0xC0) {
+        if (c < 0xC0)
            /*
             * Handles properly formed UTF-8 characters between
             * 0x01 and 0x7F.  Also treats \0 and naked trail
             * bytes 0x80 to 0xBF as valid characters representing
             * themselves.
             */
-           if (c == '&') {
+            uc = 0;
+        else if (c < 0xE0)
+            uc = 1;
+        else if (c < 0xF0)
+            uc = 2;
+        else if (c < 0xF8)
+            uc = 3;
+        else {
+            uc = -1;
+            if (!warned) {
+                agerr(AGWARN, "UTF8 codes > 4 bytes are not currently supported (graph %s) - treated as Latin-1. Perhaps \"-Gcharset=latin1\" is needed?\n", agnameof(g));
+                warned = 1;
+            }
+            c = cvtAndAppend (c, &xb);
+        }
+
+           if (uc == 0 && c == '&') {
                /* replace html entity sequences like: &amp;
                 * and: &#123; with their UTF8 equivalents */
                v = htmlEntity (&s);
@@ -1443,45 +1462,23 @@ char* htmlEntityUTF8 (char* s, graph_t* g)
                        ignored = agxbputc(&xb, ((v >> 6) & 0x3F) | 0x80);
                        c = (v & 0x3F) | 0x80;
                    }
-               }
-            }
-       }
-        else if (c < 0xE0) { /* copy 2 byte UTF8 characters */
-           if ((*s & 0xC0) == 0x80) {
-               ignored = agxbputc(&xb, c);
-               c = *(unsigned char*)s++;
-           }
-           else { 
-               if (!warned) {
-                   agerr(AGWARN, "Invalid 2-byte UTF8 found in input of graph %s - treated as Latin-1. Perhaps \"-Gcharset=latin1\" is needed?\n", agnameof(g));
-                   warned = 1;
-               }
-               c = cvtAndAppend (c, &xb);
-           }
-       }
-       else if (c < 0xF0) { /* copy 3 byte UTF8 characters */
-           if (((*s & 0xC0) == 0x80) && ((s[1] & 0xC0) == 0x80)) {
-               ignored = agxbputc(&xb, c);
-               c = *(unsigned char*)s++;
-               ignored = agxbputc(&xb, c);
-               c = *(unsigned char*)s++;
-           }
-           else {
-               if (!warned) {
-                   agerr(AGWARN, "Invalid 3-byte UTF8 found in input of graph %s - treated as Latin-1. Perhaps \"-Gcharset=latin1\" is needed?\n", agnameof(g));
-                   warned = 1;
-               }
-               c = cvtAndAppend (c, &xb);
-           }
-       }
-       else  {
-           if (!warned) {
-               agerr(AGWARN, "UTF8 codes > 3 bytes are not currently supported (graph %s) - treated as Latin-1. Perhaps \"-Gcharset=latin1\" is needed?\n", agnameof(g));
-               warned = 1;
-           }
-           c = cvtAndAppend (c, &xb);
+                   }
         }
-       ignored = agxbputc(&xb, c);
+        else /* copy n byte UTF8 characters */
+            for (ui = 0; ui < uc; ++ui)
+                if ((*s & 0xC0) == 0x80) {
+                    ignored = agxbputc(&xb, c);
+                    c = *(unsigned char*)s++;
+                }
+                else { 
+                           if (!warned) {
+                               agerr(AGWARN, "Invalid %d-byte UTF8 found in input of graph %s - treated as Latin-1. Perhaps \"-Gcharset=latin1\" is needed?\n", uc + 1, agnameof(g));
+                               warned = 1;
+                           }
+                           c = cvtAndAppend (c, &xb);
+                    break;
+                   }
+           ignored = agxbputc(&xb, c);
     }
     ns = strdup (agxbuse(&xb));
     agxbfree(&xb);