granicus.if.org Git - php/commitdiff
- Add support for Chinese encodings to htmlentities/htmlspecialchars
author Wez Furlong <wez@php.net>
Thu, 23 Aug 2001 10:43:15 +0000 (10:43 +0000)
committer Wez Furlong <wez@php.net>
Thu, 23 Aug 2001 10:43:15 +0000 (10:43 +0000)
  (patch from Alan Knowles <alan_k@hklc.com>)

ext/standard/html.c

index 68c29c4e6c87d40a00b1baa1ba53a0d8cfaecd79..857ea0ccde8640978e63a26fa4a48c87a69c0e3a 100644 (file)
@@ -35,7 +35,7 @@
    Defaults to ISO-8859-1 for now. */
 
 enum entity_charset { cs_terminator, cs_8859_1, cs_cp1252,
-       cs_8859_15, cs_utf_8 };
+       cs_8859_15, cs_utf_8, cs_big5, cs_gb2312, cs_big5hkscs };
 typedef const char * entity_table_t;
 
 /* codepage 1252 is a Windows extension to iso-8859-1. */
@@ -91,11 +91,14 @@ struct html_entity_map {
 };
 
 static const struct html_entity_map entity_map[] = {
-       { cs_cp1252,    0x80, 0x9f, ent_cp_1252 },
-       { cs_cp1252,    0xa0, 0xff, ent_iso_8859_1 },
+       { cs_cp1252,            0x80, 0x9f, ent_cp_1252 },
+       { cs_cp1252,            0xa0, 0xff, ent_iso_8859_1 },
        { cs_8859_1,            0xa0, 0xff, ent_iso_8859_1 },
        { cs_8859_15,           0xa0, 0xff, ent_iso_8859_15 },
        { cs_utf_8,             0xa0, 0xff, ent_iso_8859_1 },
+       { cs_big5,                      0xa0, 0xff, ent_iso_8859_1 },
+       { cs_gb2312,            0xa0, 0xff, ent_iso_8859_1 },
+       { cs_big5hkscs,         0xa0, 0xff, ent_iso_8859_1 },
        { cs_terminator }
 };
 
@@ -107,6 +110,9 @@ static const struct {
        { "ISO-8859-15",        cs_8859_15 },
        { "utf-8",                      cs_utf_8 },
        { "cp1252",             cs_cp1252 },
+       { "BIG5",                       cs_big5 },
+       { "GB2312",                     cs_gb2312 },
+       { "BIG5-HKSCS",         cs_big5hkscs },
        { NULL }
 };
 
@@ -125,86 +131,111 @@ inline static unsigned short get_next_char(enum entity_charset charset,
        
        mbseq[mbpos++] = (unsigned char)this_char;
        
-       if (charset == cs_utf_8)        {
-               unsigned long utf = 0;
-               int stat = 0;
-               int more = 1;
+       switch(charset) {
+               case cs_utf_8:
+                       {
+                               unsigned long utf = 0;
+                               int stat = 0;
+                               int more = 1;
 
-               /* unpack utf-8 encoding into a wide char.
-                * Code stolen from the mbstring extension */
-               
-               do {
-                       if (this_char < 0x80)   {
-                               more = 0;
-                               break;
-                       }
-                       else if (this_char < 0xc0)      {
-                               switch(stat)    {
-                                       case 0x10:      /* 2, 2nd */
-                                       case 0x21:      /* 3, 3rd */
-                                       case 0x32:      /* 4, 4th */
-                                       case 0x43:      /* 5, 5th */
-                                       case 0x54:      /* 6, 6th */
-                                               /* last byte in sequence */
+                               /* unpack utf-8 encoding into a wide char.
+                                * Code stolen from the mbstring extension */
+
+                               do {
+                                       if (this_char < 0x80)   {
                                                more = 0;
-                                               utf |= (this_char & 0x3f);
-                                               this_char = (unsigned short)utf;
-                                               break;
-                                       case 0x20:      /* 3, 2nd */
-                                       case 0x31:      /* 4, 3rd */
-                                       case 0x42:      /* 5, 4th */
-                                       case 0x53:      /* 6, 5th */
-                                               /* penultimate char */
-                                               utf |= ((this_char & 0x3f) << 6);
-                                               stat++;
-                                               break;
-                                       case 0x30:      /* 4, 2nd */
-                                       case 0x41:      /* 5, 3rd */
-                                       case 0x52:      /* 6, 4th */
-                                               utf |= ((this_char & 0x3f) << 12);
-                                               stat++;
                                                break;
-                                       case 0x40:      /* 5, 2nd */
-                                       case 0x51:
-                                               utf |= ((this_char & 0x3f) << 18);
-                                               stat++;
-                                               break;
-                                       case 0x50:      /* 6, 2nd */
-                                               utf |= ((this_char & 0x3f) << 24);
-                                               stat++;
-                                       default:
-                                               /* invalid */
+                                       }
+                                       else if (this_char < 0xc0)      {
+                                               switch(stat)    {
+                                                       case 0x10:      /* 2, 2nd */
+                                                       case 0x21:      /* 3, 3rd */
+                                                       case 0x32:      /* 4, 4th */
+                                                       case 0x43:      /* 5, 5th */
+                                                       case 0x54:      /* 6, 6th */
+                                                               /* last byte in sequence */
+                                                               more = 0;
+                                                               utf |= (this_char & 0x3f);
+                                                               this_char = (unsigned short)utf;
+                                                               break;
+                                                       case 0x20:      /* 3, 2nd */
+                                                       case 0x31:      /* 4, 3rd */
+                                                       case 0x42:      /* 5, 4th */
+                                                       case 0x53:      /* 6, 5th */
+                                                               /* penultimate char */
+                                                               utf |= ((this_char & 0x3f) << 6);
+                                                               stat++;
+                                                               break;
+                                                       case 0x30:      /* 4, 2nd */
+                                                       case 0x41:      /* 5, 3rd */
+                                                       case 0x52:      /* 6, 4th */
+                                                               utf |= ((this_char & 0x3f) << 12);
+                                                               stat++;
+                                                               break;
+                                                       case 0x40:      /* 5, 2nd */
+                                                       case 0x51:
+                                                               utf |= ((this_char & 0x3f) << 18);
+                                                               stat++;
+                                                               break;
+                                                       case 0x50:      /* 6, 2nd */
+                                                               utf |= ((this_char & 0x3f) << 24);
+                                                               stat++;
+                                                       default:
+                                                               /* invalid */
+                                                               more = 0;
+                                               }
+                                       }
+                                       /* lead byte */
+                                       else if (this_char < 0xe0) {
+                                               stat = 0x10;    /* 2 byte */
+                                               utf = (this_char & 0x1f) << 6;
+                                       } else if (this_char < 0xf0)    {
+                                               stat = 0x20;    /* 3 byte */
+                                               utf = (this_char & 0xf) << 12;
+                                       } else if (this_char < 0xf8) {
+                                               stat = 0x30;    /* 4 byte */
+                                               utf = (this_char & 0x7) << 18;
+                                       } else if (this_char < 0xfc)    {
+                                               stat = 0x40;    /* 5 byte */
+                                               utf = (this_char & 0x3) << 24;
+                                       } else if (this_char < 0xfe)    {
+                                               stat = 0x50;    /* 6 byte */
+                                               utf = (this_char & 0x1) << 30;
+                                       }
+                                       else    {
+                                               /* invalid; bail */
                                                more = 0;
-                               }
-                       }
-                       /* lead byte */
-                       else if (this_char < 0xe0) {
-                               stat = 0x10;    /* 2 byte */
-                               utf = (this_char & 0x1f) << 6;
-                       } else if (this_char < 0xf0)    {
-                               stat = 0x20;    /* 3 byte */
-                               utf = (this_char & 0xf) << 12;
-                       } else if (this_char < 0xf8) {
-                               stat = 0x30;    /* 4 byte */
-                               utf = (this_char & 0x7) << 18;
-                       } else if (this_char < 0xfc)    {
-                               stat = 0x40;    /* 5 byte */
-                               utf = (this_char & 0x3) << 24;
-                       } else if (this_char < 0xfe)    {
-                               stat = 0x50;    /* 6 byte */
-                               utf = (this_char & 0x1) << 30;
-                       }
-                       else    {
-                               /* invalid; bail */
-                               more = 0;
-                               break;
+                                               break;
+                                       }
+                                       if (more)
+                                       {
+                                               this_char = str[pos++];
+                                               mbseq[mbpos++] = (unsigned char)this_char;
+                                       }
+                               } while(more);
                        }
-                       if (more)
+                       break;
+               case cs_big5:
+               case cs_gb2312:
+               case cs_big5hkscs:
                        {
-                               this_char = str[pos++];
-                               mbseq[mbpos++] = (unsigned char)this_char;
+                               /* check if this is the first of a 2-byte sequence */
+                               if (this_char >= 0xa1 && this_char <= 0xf9)     {
+                                       /* peek at the next char */
+                                       unsigned char next_char = str[pos];
+                                       if ((next_char >= 0x40 && next_char <= 0x73) ||
+                                                       (next_char >= 0xa1 && next_char <= 0xfe))
+                                       {
+                                               /* yes, this is a wide char */
+                                               this_char <<= 8;
+                                               mbseq[mbpos++] = next_char;
+                                               this_char |= next_char;
+                                               pos++;
+                                       }
+                                       
+                               }
+                               break;
                        }
-               } while(more);
        }
        *newpos = pos;
        mbseq[mbpos] = '\0';
@@ -222,7 +253,7 @@ static enum entity_charset determine_charset(char * charset_hint)
        enum entity_charset charset = cs_8859_1;
        int len;
 
-       /* Guarantee default behaviour */
+       /* Guarantee default behaviour for backwards compatibility */
        if (charset_hint == NULL)
                return cs_8859_1;
 
@@ -296,11 +327,9 @@ PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newle
        i = 0;
        while (i < oldlen) {
                int mbseqlen;
-               unsigned char mbsequence[16];   /* allow up to 15 characters
-                                                                                                       in a multibyte sequence
-                                                                                                       it should be more than enough.. */
+               unsigned char mbsequence[16];   /* allow up to 15 characters in a multibyte sequence */
                unsigned short this_char = get_next_char(charset, old, &i, mbsequence, &mbseqlen);
-               int matches_map = 0;
+               int matches_map;
                
                if (len + 9 > maxlen)
                        new = erealloc (new, maxlen += 128);
@@ -309,7 +338,9 @@ PHPAPI char *php_escape_html_entities(unsigned char *old, int oldlen, int *newle
                        /* look for a match in the maps for this charset */
                        int j;
                        unsigned char * rep;
-               
+       
+                       matches_map = 0;
+
                        for (j=0; entity_map[j].charset != cs_terminator; j++)  {
                                if (entity_map[j].charset == charset
                                                && this_char >= entity_map[j].basechar