substitute html-entities (e.g. &#123; &amp;) in UTF8 encoded labels

author ellson <devnull@localhost>

Thu, 27 Jul 2006 16:35:39 +0000 (16:35 +0000)

committer ellson <devnull@localhost>

Thu, 27 Jul 2006 16:35:39 +0000 (16:35 +0000)
author ellson <devnull@localhost>
Thu, 27 Jul 2006 16:35:39 +0000 (16:35 +0000)
committer ellson <devnull@localhost>
Thu, 27 Jul 2006 16:35:39 +0000 (16:35 +0000)
diff --git a/lib/common/labels.c b/lib/common/labels.c

index 9a2ff911221ed517954c3826e1d9ad24511283ec..25ef8b037be182e303cf50e89afd9b4917dc2686 100644 (file)
--- a/lib/common/labels.c
+++ b/lib/common/labels.c
@@ -105,11 +105,18 @@ static pointf label_size(graph_t * g, textlabel_t * lp)
  void
  size_label (graph_t* g, textlabel_t* rv)
  {
-    if (GD_charset(g->root) == CHAR_LATIN1) {
-       char* lstr = latin1ToUTF8(rv->text);
-       free(rv->text);
-       rv->text = lstr;
+    char *s;
+
+    switch (GD_charset(g->root)) {
+    case CHAR_LATIN1:
+       s = latin1ToUTF8(rv->text);
+       break;
+    default: /* UTF8 */
+       s = htmlEntityUTF8(rv->text);
+       break;
      }
+    free(rv->text);
+    rv->text = s;
      label_size(g, rv);
  }
  
diff --git a/lib/common/utils.c b/lib/common/utils.c

index 1ff27370414f949a4bb52c8d0364bded6561b64a..df934dc4356425d8596e926d50aeccd735c646ee 100644 (file)
--- a/lib/common/utils.c
+++ b/lib/common/utils.c
@@ -1247,17 +1247,15 @@ safe_dcl(graph_t * g, void *obj, char *name, char *def,
  }
  
  static int comp_entities(const void *e1, const void *e2) {
-  struct entities_s *en1 = (struct entities_s *) e1;
-  struct entities_s *en2 = (struct entities_s *) e2;
-  return strcmp(en1->name, en2->name);
+  return strcmp(((struct entities_s *)e1)->name, ((struct entities_s *)e2)->name);
  }
  
  #define MAXENTLEN 8
  
  /* scanEntity:
- *  * Scan non-numeric entity, convert to &#...; form and store in xbuf.
- *   * t points to first char after '&'. Return after final semicolon.
- *    * If unknown, we return t and let libexpat flag the error.
+ * Scan non-numeric entity, convert to &#...; form and store in xbuf.
+ * t points to first char after '&'. Return after final semicolon.
+ * If unknown, we return t and let libexpat flag the error.
   *     */
  char* scanEntity (char* t, agxbuf* xb)
  {
@@ -1355,13 +1353,87 @@ htmlEntity (char** s)
      return n;
  }
  
+/* htmlEntityUTF8:
+ * Return a strdup'ed copy of UTF8 string s in which html entities like:
+ * &#123; and: &amp; are replaced by their UTF8 equivalents. */
+char* htmlEntityUTF8 (char* s)
+{
+    char*  ns;
+    agxbuf xb;
+    unsigned char buf[BUFSIZ];
+    unsigned char c;
+    unsigned int v;
+
+    agxbinit(&xb, BUFSIZ, buf);
+
+    while ((c = *(unsigned char*)s++)) {
+        if (c < 0xC0) {
+           /*
+            * Handles properly formed UTF-8 characters between
+            * 0x01 and 0x7F.  Also treats naked trail bytes
+            * 0x80 to 0xBF as valid characters representing
+            * themselves.
+            */
+           if (c == '&') {
+               /* htmlEntity presumably advances s past an entity it
+                * accepts -- confirm; 0 is treated as "no entity" */
+               v = htmlEntity (&s);
+               if (v) {
+                   if (v < 0x80) /* U+0000..U+007F: 1 byte in UTF8 */
+                       c = v;
+                   else if (v < 0x800) { /* U+0080..U+07FF: 2 bytes */
+                       agxbputc(&xb, (v >> 6) | 0xC0);
+                       c = (v & 0x3F) | 0x80;
+                   }
+                   else { /* 3 bytes; values above 0xFFFF are not handled */
+                       agxbputc(&xb, (v >> 12) | 0xE0);
+                       agxbputc(&xb, ((v >> 6) & 0x3F) | 0x80);
+                       c = (v & 0x3F) | 0x80;
+                   }
+               }
+               /* when v is 0, c is still '&' and is copied literally */
+            }
+       }
+        else if (c < 0xE0) { /* copy 2 byte UTF8 characters */
+           /* s is already past the lead byte: s[0] is the trail byte */
+           if ((s[0] & 0xC0) == 0x80) {
+               agxbputc(&xb, c);
+               c = *(unsigned char*)s++;
+           }
+           /*
+            * A two-byte-character lead-byte not followed by trail-byte
+            * represents itself.
+            */
+       }
+       else if (c < 0xF0) { /* copy 3 byte UTF8 characters */
+           /* && stops at a NUL in s[0], so s[1] is never read past it */
+           if (((s[0] & 0xC0) == 0x80) && ((s[1] & 0xC0) == 0x80)) {
+               agxbputc(&xb, c);
+               c = *(unsigned char*)s++;
+               agxbputc(&xb, c);
+               c = *(unsigned char*)s++;
+           }
+           /*
+            * A three-byte-character lead-byte not followed by
+            * two trail-bytes represents itself.
+            */
+       }
+       else  {
+           /* UTF8 codes > 3 bytes not supported */
+           assert (0);
+        }
+       agxbputc(&xb, c);
+    }
+    ns = strdup (agxbuse(&xb));
+    agxbfree(&xb);
+    return ns;
+}
+
  /* latin1ToUTF8:
   * Converts string from Latin1 encoding to utf8
   * Also translates HTML entities.
   *
   */
-char*
-latin1ToUTF8 (char* s)
+char* latin1ToUTF8 (char* s)
  {
      char*  ns;
      agxbuf xb;
diff --git a/lib/common/utils.h b/lib/common/utils.h

index 6a8773f0cf1e74636595adbca5bf823ccc1405fd..86861685b4467b852f6a944e14aa46c10108b49c 100644 (file)
--- a/lib/common/utils.h
+++ b/lib/common/utils.h
@@ -68,6 +68,7 @@ extern "C" {
               attrsym_t * (*fun) (Agraph_t *, char *, char *));
  
      extern char *latin1ToUTF8(char *);
+    extern char *htmlEntityUTF8(char *);
      extern char* utf8ToLatin1 (char* ins);
      extern char* scanEntity (char* t, agxbuf* xb);
author	ellson <devnull@localhost>
	Thu, 27 Jul 2006 16:35:39 +0000 (16:35 +0000)
committer	ellson <devnull@localhost>
	Thu, 27 Jul 2006 16:35:39 +0000 (16:35 +0000)
lib/common/labels.c		patch \| blob \| history
lib/common/utils.c		patch \| blob \| history
lib/common/utils.h		patch \| blob \| history