From: ellson Date: Thu, 27 Jul 2006 16:35:39 +0000 (+0000) Subject: substitute html-entities (e.g. &#123; &amp;) in UTF8 encoded labels X-Git-Tag: LAST_LIBGRAPH~32^2~6030 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=1f462aec71017c9aaa86d4a57bcae01fe9778f2e;p=graphviz substitute html-entities (e.g. &#123; &amp;) in UTF8 encoded labels --- diff --git a/lib/common/labels.c b/lib/common/labels.c index 9a2ff9112..25ef8b037 100644 --- a/lib/common/labels.c +++ b/lib/common/labels.c @@ -105,11 +105,18 @@ static pointf label_size(graph_t * g, textlabel_t * lp) void size_label (graph_t* g, textlabel_t* rv) { - if (GD_charset(g->root) == CHAR_LATIN1) { - char* lstr = latin1ToUTF8(rv->text); - free(rv->text); - rv->text = lstr; + char *s; + + switch (GD_charset(g->root)) { + case CHAR_LATIN1: + s = latin1ToUTF8(rv->text); + break; + default: /* UTF8 */ + s = htmlEntityUTF8(rv->text); + break; } + free(rv->text); + rv->text = s; label_size(g, rv); } diff --git a/lib/common/utils.c b/lib/common/utils.c index 1ff273704..df934dc43 100644 --- a/lib/common/utils.c +++ b/lib/common/utils.c @@ -1247,17 +1247,15 @@ safe_dcl(graph_t * g, void *obj, char *name, char *def, } static int comp_entities(const void *e1, const void *e2) { - struct entities_s *en1 = (struct entities_s *) e1; - struct entities_s *en2 = (struct entities_s *) e2; - return strcmp(en1->name, en2->name); + return strcmp(((struct entities_s *)e1)->name, ((struct entities_s *)e2)->name); } #define MAXENTLEN 8 /* scanEntity: - * * Scan non-numeric entity, convert to &#...; form and store in xbuf. - * * t points to first char after '&'. Return after final semicolon. - * * If unknown, we return t and let libexpat flag the error. + * Scan non-numeric entity, convert to &#...; form and store in xbuf. + * t points to first char after '&'. Return after final semicolon. + * If unknown, we return t and let libexpat flag the error. 
* */ char* scanEntity (char* t, agxbuf* xb) { @@ -1355,13 +1353,87 @@ htmlEntity (char** s) return n; } +/* substitute html entities like: &#123; and: &amp; for the UTF8 equivalents */ +char* htmlEntityUTF8 (char* s) +{ + char* ns; + agxbuf xb; + unsigned char buf[BUFSIZ]; + unsigned char c; + unsigned int v; + int rc; + + agxbinit(&xb, BUFSIZ, buf); + + while ((c = *(unsigned char*)s++)) { + if (c < 0xC0) { + /* + * Handles properly formed UTF-8 characters between + * 0x01 and 0x7F. Also treats \0 and naked trail + * bytes 0x80 to 0xBF as valid characters representing + * themselves. + */ + if (c == '&') { + /* replace html entity sequences like: &amp; + * and: &#123; with their UTF8 equivalents */ + v = htmlEntity (&s); + if (v) { + if (v < 0x7F) /* entity needs 1 byte in UTF8 */ + c = v; + else if (v < 0x07FF) { /* entity needs 2 bytes in UTF8 */ + rc = agxbputc(&xb, (v >> 6) | 0xC0); + c = (v & 0x3F) | 0x80; + } + else { /* entity needs 3 bytes in UTF8 */ + rc = agxbputc(&xb, (v >> 12) | 0xE0); + rc = agxbputc(&xb, ((v >> 6) & 0x3F) | 0x80); + c = (v & 0x3F) | 0x80; + } + } + else { + c = '&'; + } + } + } + else if (c < 0xE0) { /* copy 2 byte UTF8 characters */ + if ((s[1] & 0xC0) == 0x80) { + rc = agxbputc(&xb, c); + c = *(unsigned char*)s++; + } + /* + * A two-byte-character lead-byte not followed by trail-byte + * represents itself. + */ + } + else if (c < 0xF0) { /* copy 3 byte UTF8 characters */ + if (((s[1] & 0xC0) == 0x80) && ((s[2] & 0xC0) == 0x80)) { + rc = agxbputc(&xb, c); + c = *(unsigned char*)s++; + rc = agxbputc(&xb, c); + c = *(unsigned char*)s++; + } + /* + * A three-byte-character lead-byte not followed by + * two trail-bytes represents itself. + */ + } + else { + /* UTF8 codes > 3 bytes not supported */ + assert (0); + } + rc = agxbputc(&xb, c); + } + ns = strdup (agxbuse(&xb)); + agxbfree(&xb); + return ns; +} + /* latin1ToUTF8: * Converts string from Latin1 encoding to utf8 * Also translates HTML entities. 
* */ -char* -latin1ToUTF8 (char* s) +char* latin1ToUTF8 (char* s) { char* ns; agxbuf xb; diff --git a/lib/common/utils.h b/lib/common/utils.h index 6a8773f0c..86861685b 100644 --- a/lib/common/utils.h +++ b/lib/common/utils.h @@ -68,6 +68,7 @@ extern "C" { attrsym_t * (*fun) (Agraph_t *, char *, char *)); extern char *latin1ToUTF8(char *); + extern char *htmlEntityUTF8(char *); extern char* utf8ToLatin1 (char* ins); extern char* scanEntity (char* t, agxbuf* xb);