/* {{{ get_next_char
*/
-static unsigned int get_next_char(
+static inline unsigned int get_next_char(
enum entity_charset charset,
- unsigned char *str,
+ const unsigned char *str,
size_t str_len,
size_t *cursor,
int *status)
}
/* }}} */
+/* {{{ php_next_utf8_char
+ * Public interface for get_next_char used with UTF-8.
+ *
+ * Thin PHPAPI-declared wrapper: forwards str, str_len, cursor and status
+ * unchanged to the file-local (static inline) get_next_char() with the
+ * charset argument fixed to cs_utf_8, and returns that call's result as-is.
+ *
+ * NOTE(review): the exact cursor/status contract is defined by
+ * get_next_char(), whose body is not visible here. Callers appear to rely
+ * on *cursor being advanced by each call and on *status being FAILURE for
+ * invalid input -- confirm against get_next_char() before depending on it. */
+ PHPAPI unsigned int php_next_utf8_char(
+ const unsigned char *str,
+ size_t str_len,
+ size_t *cursor,
+ int *status)
+{
+ return get_next_char(cs_utf_8, str, str_len, cursor, status);
+}
+/* }}} */
+
/* {{{ entity_charset determine_charset
* returns the charset identifier based on current locale or a hint.
* defaults to UTF-8 */
PHPAPI char *php_escape_html_entities(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset TSRMLS_DC);
PHPAPI char *php_escape_html_entities_ex(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset, zend_bool double_encode TSRMLS_DC);
PHPAPI char *php_unescape_html_entities(unsigned char *old, size_t oldlen, size_t *newlen, int all, int flags, char *hint_charset TSRMLS_DC);
+PHPAPI unsigned int php_next_utf8_char(const unsigned char *str, size_t str_len, size_t *cursor, int *status);
#endif /* HTML_H */
--- /dev/null
+--TEST--\r
+Bug #49687 Several utf8_decode deficiencies and vulnerabilities\r
+--SKIPIF--\r
+<?php\r
+require_once("skipif.inc");\r
+if (!extension_loaded('xml')) die ("skip xml extension not available");\r
+?>\r
+--FILE--\r
+<?php\r
+\r
+$tests = array(\r
+ "\x41\xC2\x3E\x42",\r
+ "\xE3\x80\x22",\r
+ "\x41\x98\xBA\x42\xE2\x98\x43\xE2\x98\xBA\xE2\x98",\r
+);\r
+foreach ($tests as $t) {\r
+ echo bin2hex(utf8_decode($t)), "\n";\r
+}\r
+echo "Done.\n";\r
+--EXPECT--\r
+413f3e42\r
+3f22\r
+413f3f423f433f3f\r
+Done.\r
#include "zend_variables.h"
#include "ext/standard/php_string.h"
#include "ext/standard/info.h"
+#include "ext/standard/html.h"
#if HAVE_XML
/* {{{ xml_utf8_decode */
PHPAPI char *xml_utf8_decode(const XML_Char *s, int len, int *newlen, const XML_Char *encoding)
{
- int pos = len;
+ size_t pos = 0;
char *newbuf = emalloc(len + 1);
unsigned int c;
char (*decoder)(unsigned short) = NULL;
newbuf[*newlen] = '\0';
return newbuf;
}
- while (pos > 0) {
- c = (unsigned char)(*s);
- if (c >= 0xf0) { /* four bytes encoded, 21 bits */
- if(pos-4 >= 0) {
- c = ((s[0]&7)<<18) | ((s[1]&63)<<12) | ((s[2]&63)<<6) | (s[3]&63);
- } else {
- c = '?';
- }
- s += 4;
- pos -= 4;
- } else if (c >= 0xe0) { /* three bytes encoded, 16 bits */
- if(pos-3 >= 0) {
- c = ((s[0]&63)<<12) | ((s[1]&63)<<6) | (s[2]&63);
- } else {
- c = '?';
- }
- s += 3;
- pos -= 3;
- } else if (c >= 0xc0) { /* two bytes encoded, 11 bits */
- if(pos-2 >= 0) {
- c = ((s[0]&63)<<6) | (s[1]&63);
- } else {
- c = '?';
- }
- s += 2;
- pos -= 2;
- } else {
- s++;
- pos--;
+
+ while (pos < (size_t)len) {
+ int status = FAILURE;
+ c = php_next_utf8_char((const unsigned char*)s, (size_t) len, &pos, &status);
+
+ if (status == FAILURE || c > 0xFFU) {
+ c = '?';
}
+
newbuf[*newlen] = decoder ? decoder(c) : c;
++*newlen;
}