From f4a896c20900f5ce3c290941afe5b53560f34196 Mon Sep 17 00:00:00 2001
From: =?utf8?q?Gustavo=20Andr=C3=A9=20dos=20Santos=20Lopes?=
 <cataphract@php.net>
Date: Mon, 11 Oct 2010 22:26:10 +0000
Subject: [PATCH] - PHP uses a big endian representation when it converts the
 code unit sequences to integers so as to store the entity maps. Code in
 traverse_for_entities assumed little endian. Fixed. (in practice, due to
 the absence of unicode and entity mappings for multi-byte encodings --
 except UTF-8 --, this doesn't matter, so the relevant code was commented
 out for performance reasons).

---
 ext/standard/html.c | 69 ++++++++++++++++++++++++++++++++++++---------
 1 file changed, 56 insertions(+), 13 deletions(-)

diff --git a/ext/standard/html.c b/ext/standard/html.c
index 0ad34e52c4..9ef6d0158f 100644
--- a/ext/standard/html.c
+++ b/ext/standard/html.c
@@ -456,7 +456,7 @@ det_charset:
 /* }}} */
 
 /* {{{ php_utf32_utf8 */
-size_t php_utf32_utf8(unsigned char *buf, int k)
+static size_t php_utf32_utf8(unsigned char *buf, unsigned k)
 {
 	size_t retval = 0;
 
@@ -487,6 +487,47 @@ size_t php_utf32_utf8(unsigned char *buf, int k)
 }
 /* }}} */
 
+/* {{{ php_mb2_int_to_char
+ * Write the one or two 8-bit code units packed big endian in k to buf (needs room for two), most significant first; returns the byte count. */
+static size_t php_mb2_int_to_char(unsigned char *buf, unsigned k)
+{
+	assert(k <= 0xFFFFU);
+	/* values up to 0xFF are emitted as one byte, anything larger as two */
+	if (k <= 0xFFU) { /* 1 */
+		buf[0] = k;
+		return 1U;
+	} else { /* 2: high byte first */
+		buf[0] = k >> 8;
+		buf[1] = k & 0xFFU;
+		return 2U;
+	}
+}
+/* }}} */
+
+/* {{{ php_mb3_int_to_char
+ * Write the one to three 8-bit code units packed big endian in k to buf, most significant
+ * first, and return the number of bytes written (buf needs room for three). For EUC-JP. */
+static size_t php_mb3_int_to_char(unsigned char *buf, unsigned k)
+{
+	assert(k <= 0xFFFFFFU);
+	/* the magnitude of k selects how many bytes are emitted: one to three */
+	if (k <= 0xFFU) { /* 1 */
+		buf[0] = k;
+		return 1U;
+	} else if (k <= 0xFFFFU) { /* 2: high byte first */
+		buf[0] = k >> 8;
+		buf[1] = k & 0xFFU;
+		return 2U;
+	} else { /* 3: bits 16-23, then 8-15, then 0-7 */
+		buf[0] = k >> 16;
+		buf[1] = (k >> 8) & 0xFFU;
+		buf[2] = k & 0xFFU;
+		return 3U;
+	}
+}
+/* }}} */
+
+
 /* {{{ unimap_bsearc_cmp
  * Binary search of unicode code points in unicode <--> charset mapping.
  * Returns the code point in the target charset (whose mapping table was given) or 0 if
@@ -817,21 +858,23 @@ static void traverse_for_entities(char *ret, int *retlen_p, int all, int quote_s
 		case cs_big5hkscs:
 		case cs_sjis:
 		case cs_gb2312:
-			/* one or two bytes */
-			*(q++) = (code & 0xFFU);
-			if (0xFF00U & code) { /* 2 */
-				*(q++) = (code >> 8);
-			}
+			/* we don't have named entity or unicode mappings for these yet,
+			 * so we're guaranteed code <= 0xFF */
+#if 0
+			q += php_mb2_int_to_char((unsigned char*)q, code);
+#else
+			assert(code <= 0xFFU);
+			*(q++) = code;
+#endif
 			break;
 
 		case cs_eucjp:
-			/* one to three bytes */
-			*(q++) = code & 0xFFU;
-			if (0xFFFF00U & code) { /* 2 */
-				*(q++) = ((code >> 8) & 0xFFU);
-				if (0xFF0000U & code) /* 3 */
-					*(q++) = (code >> 16);
-			}
+#if 0 /* idem; EUC-JP sequences are one to three bytes, hence the mb3 helper */
+			q += php_mb3_int_to_char((unsigned char*)q, code);
+#else
+			assert(code <= 0xFFU);
+			*(q++) = code;
+#endif
 			break;
 
 		default:
-- 
2.40.0