From c2997af2fd5b2987fa84ad0b3a542a8d185c10c3 Mon Sep 17 00:00:00 2001 From: Moriyoshi Koizumi Date: Sun, 11 Oct 2009 23:52:33 +0000 Subject: [PATCH] - A couple more fix for my previous fix. (one of the fix by Arnaud Le Blanc. Thanks!) --- ext/standard/html.c | 68 +++++++++++++------ ext/standard/tests/strings/bug49785.phpt | 54 +++++++++++---- .../tests/strings/htmlentities-utf-2.phpt | 12 ++-- .../tests/strings/htmlentities-utf.phpt | 12 ++-- 4 files changed, 99 insertions(+), 47 deletions(-) diff --git a/ext/standard/html.c b/ext/standard/html.c index 3ca73cce43..bfd4946557 100644 --- a/ext/standard/html.c +++ b/ext/standard/html.c @@ -489,11 +489,26 @@ struct basic_entities_dec { } \ mbseq[mbpos++] = (mbchar); } +/* skip one byte and return */ +#define MB_FAILURE(pos) do { \ + *newpos = pos + 1; \ + *status = FAILURE; \ + return 0; \ +} while (0) + #define CHECK_LEN(pos, chars_need) \ - if((str_len - (pos)) < chars_need) { \ - *newpos = pos; \ - *status = FAILURE; \ - return 0; \ + if (chars_need < 1) { \ + if((str_len - (pos)) < chars_need) { \ + *newpos = pos; \ + *status = FAILURE; \ + return 0; \ + } \ + } else { \ + if((str_len - (pos)) < chars_need) { \ + *newpos = pos + 1; \ + *status = FAILURE; \ + return 0; \ + } \ } /* {{{ get_next_char @@ -532,10 +547,12 @@ inline static unsigned int get_next_char(enum entity_charset charset, this_char = c; pos++; } else if (c < 0xc0) { - *status = FAILURE; - return 0; + MB_FAILURE(pos); } else if (c < 0xe0) { CHECK_LEN(pos, 2); + if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) { + MB_FAILURE(pos); + } this_char = ((c & 0x1f) << 6) | (str[pos + 1] & 0x3f); if (this_char < 0x80) { *status = FAILURE; @@ -546,10 +563,15 @@ inline static unsigned int get_next_char(enum entity_charset charset, pos += 2; } else if (c < 0xf0) { CHECK_LEN(pos, 3); + if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) { + MB_FAILURE(pos); + } + if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) { + MB_FAILURE(pos); + } this_char = ((c & 0x0f) << 12) | ((str[pos + 1] & 0x3f) << 6) | (str[pos + 2] & 0x3f); if (this_char < 0x800) { - *status = FAILURE; - return 0; + MB_FAILURE(pos); } MB_WRITE((unsigned char)c); MB_WRITE((unsigned char)str[pos + 1]); @@ -557,10 +579,18 @@ inline static unsigned int get_next_char(enum entity_charset charset, pos += 3; } else if (c < 0xf8) { CHECK_LEN(pos, 4); + if (str[pos + 1] < 0x80 || str[pos + 1] > 0xbf) { + MB_FAILURE(pos); + } + if (str[pos + 2] < 0x80 || str[pos + 2] > 0xbf) { + MB_FAILURE(pos); + } + if (str[pos + 3] < 0x80 || str[pos + 3] > 0xbf) { + MB_FAILURE(pos); + } this_char = ((c & 0x07) << 18) | ((str[pos + 1] & 0x3f) << 12) | ((str[pos + 2] & 0x3f) << 6) | (str[pos + 3] & 0x3f); if (this_char < 0x10000) { - *status = FAILURE; - return 0; + MB_FAILURE(pos); } MB_WRITE((unsigned char)c); MB_WRITE((unsigned char)str[pos + 1]); @@ -568,8 +598,7 @@ inline static unsigned int get_next_char(enum entity_charset charset, MB_WRITE((unsigned char)str[pos + 3]); pos += 4; } else { - *status = FAILURE; - return 0; + MB_FAILURE(pos); } } break; @@ -591,8 +620,7 @@ inline static unsigned int get_next_char(enum entity_charset charset, MB_WRITE(next_char); this_char = (this_char << 8) | next_char; } else { - *status = FAILURE; - return 0; + MB_FAILURE(pos); } } else { MB_WRITE(this_char); @@ -617,8 +645,7 @@ inline static unsigned int get_next_char(enum entity_charset charset, MB_WRITE(next_char); this_char = (this_char << 8) | next_char; } else { - *status = FAILURE; - return 0; + MB_FAILURE(pos); } } else { MB_WRITE(this_char); @@ -640,8 +667,7 @@ inline static unsigned int get_next_char(enum entity_charset charset, MB_WRITE(next_char); this_char = (this_char << 8) | next_char; } else { - *status = FAILURE; - return 0; + MB_FAILURE(pos); } } else if (this_char == 0x8e) { /* peek at the next char */ @@ -653,8 +679,7 @@ inline static unsigned int get_next_char(enum entity_charset charset, MB_WRITE(next_char); this_char = (this_char << 8) | next_char; } else { - *status = FAILURE; - return 0; + MB_FAILURE(pos); } } else if (this_char == 0x8f) { /* peek at the next two char */ @@ -671,8 +696,7 @@ inline static unsigned int get_next_char(enum entity_charset charset, MB_WRITE(next2_char); this_char = (this_char << 16) | (next_char << 8) | next_char; } else { - *status = FAILURE; - return 0; + MB_FAILURE(pos); } } else { MB_WRITE(this_char); diff --git a/ext/standard/tests/strings/bug49785.phpt b/ext/standard/tests/strings/bug49785.phpt index eb4ad8b743..f344855931 100644 --- a/ext/standard/tests/strings/bug49785.phpt +++ b/ext/standard/tests/strings/bug49785.phpt @@ -7,19 +7,33 @@ function _bin2hex($val) { } // UTF-8: basic tests -var_dump(bin2hex(htmlentities("\xc1\xbf", ENT_QUOTES, "UTF-8"))); -var_dump(bin2hex(htmlentities("\xc2\x80", ENT_QUOTES, "UTF-8"))); -var_dump(bin2hex(htmlentities("\xce\x91", ENT_QUOTES, "UTF-8"))); -var_dump(bin2hex(htmlentities("\xce\xb1", ENT_QUOTES, "UTF-8"))); -var_dump(bin2hex(htmlentities("\xdf\xbf", ENT_QUOTES, "UTF-8"))); -var_dump(bin2hex(htmlentities("\xe0\xa0\x80", ENT_QUOTES, "UTF-8"))); -var_dump(bin2hex(htmlentities("\xe0\x9f\xbf", ENT_QUOTES, "UTF-8"))); -var_dump(bin2hex(htmlentities("\xe2\x99\xa5", ENT_QUOTES, "UTF-8"))); -var_dump(bin2hex(htmlentities("\xef\xbf\xbf", ENT_QUOTES, "UTF-8"))); -var_dump(bin2hex(htmlentities("\xf0\x8f\xbf\xbf", ENT_QUOTES, "UTF-8"))); -var_dump(bin2hex(htmlentities("\xf0\x90\x80\x80", ENT_QUOTES, "UTF-8"))); -var_dump(bin2hex(htmlentities("\xf7\xbf\xbf\xbf", ENT_QUOTES, "UTF-8"))); -var_dump(bin2hex(htmlentities("\xf8\x88\x80\x80\x80", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xc1\xbf", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xc2\x80", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xc2\x00", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xc2\xc0", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xce\x91", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xce\xb1", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xdf\xbf", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xe0\xa0\x80", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xe0\x9f\xbf", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xe0\x9f\xbf", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xe0\x1f\xbf", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xe0\x9f\x3f", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xe0\x1f\x3f", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xe2\x99\xa5", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xef\xbf\xbf", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xef\xff\xbf", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xef\xbf\xff", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xf0\x8f\xbf\xbf", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xf0\x90\x80\x80", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xf7\xbf\xbf\xbf", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xf7\x3f\xbf\xbf", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xf7\xbf\x3f\xbf", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xf7\xbf\xbf\x3f", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xf7\xff\xbf\xbf", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xf7\xbf\xff\xbf", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xf7\xbf\xbf\xff", ENT_QUOTES, "UTF-8"))); +var_dump(_bin2hex(htmlentities("\xf8\x88\x80\x80\x80", ENT_QUOTES, "UTF-8"))); echo "--\n"; // UTF-8: alternative (invalid) UTF-8 sequence @@ -115,17 +129,31 @@ foreach (array_map('chr', range(0x81, 0xfe)) as $c) { --EXPECT-- string(0) "" string(4) "c280" +string(0) "" +string(0) "" string(14) "26416c7068613b" string(14) "26616c7068613b" string(4) "dfbf" string(6) "e0a080" string(0) "" +string(0) "" +string(0) "" +string(0) "" +string(0) "" string(16) "266865617274733b" string(6) "efbfbf" string(0) "" +string(0) "" +string(0) "" string(8) "f0908080" string(8) "f7bfbfbf" string(0) "" +string(0) "" +string(0) "" +string(0) "" +string(0) "" +string(0) "" +string(0) "" -- string(0) "" string(0) "" diff --git a/ext/standard/tests/strings/htmlentities-utf-2.phpt b/ext/standard/tests/strings/htmlentities-utf-2.phpt index a80100cb10..c5f4ac4ea6 100755 --- a/ext/standard/tests/strings/htmlentities-utf-2.phpt +++ b/ext/standard/tests/strings/htmlentities-utf-2.phpt @@ -36,8 +36,8 @@ foreach($strings as $string) { %unicode|string%(0) "" %unicode|string%(2) "79" %unicode|string%(2) "79" -%unicode|string%(8) "2667743b" -%unicode|string%(8) "2667743b" +%unicode|string%(0) "" +%unicode|string%(0) "" %unicode|string%(8) "566f696c" %unicode|string%(8) "566f696c" %unicode|string%(12) "436c69636873" @@ -52,10 +52,10 @@ foreach($strings as $string) { %unicode|string%(2) "79" %unicode|string%(8) "f7bfbfbf" %unicode|string%(8) "f7bfbfbf" -%unicode|string%(10) "fbbfbfbfbf" -%unicode|string%(10) "fbbfbfbfbf" -%unicode|string%(12) "fdbfbfbfbfbf" -%unicode|string%(12) "fdbfbfbfbfbf" +%unicode|string%(0) "" +%unicode|string%(0) "" +%unicode|string%(0) "" +%unicode|string%(0) "" %unicode|string%(4) "4142" %unicode|string%(4) "4142" %unicode|string%(4) "4242" diff --git a/ext/standard/tests/strings/htmlentities-utf.phpt b/ext/standard/tests/strings/htmlentities-utf.phpt index b85803a163..1daafc61d8 100755 --- a/ext/standard/tests/strings/htmlentities-utf.phpt +++ b/ext/standard/tests/strings/htmlentities-utf.phpt @@ -36,8 +36,8 @@ foreach($strings as $string) { %unicode|string%(0) "" %unicode|string%(0) "" %unicode|string%(0) "" -%unicode|string%(8) "2667743b" -%unicode|string%(8) "2667743b" +%unicode|string%(0) "" +%unicode|string%(0) "" %unicode|string%(0) "" %unicode|string%(0) "" %unicode|string%(0) "" @@ -52,10 +52,10 @@ foreach($strings as $string) { %unicode|string%(0) "" %unicode|string%(8) "f7bfbfbf" %unicode|string%(8) "f7bfbfbf" -%unicode|string%(10) "fbbfbfbfbf" -%unicode|string%(10) "fbbfbfbfbf" -%unicode|string%(12) "fdbfbfbfbfbf" -%unicode|string%(12) "fdbfbfbfbfbf" +%unicode|string%(0) "" +%unicode|string%(0) "" +%unicode|string%(0) "" +%unicode|string%(0) "" %unicode|string%(0) "" %unicode|string%(0) "" %unicode|string%(0) "" -- 2.40.0