From: K.Kosako <kosako@sofnec.co.jp>
Date: Thu, 4 Jul 2019 23:50:07 +0000 (+0900)
Subject: Fix for invalid encoded characters: strictly check surrogate pairs in UTF-16LE and UTF-16BE
X-Git-Tag: v6.9.3~67
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=28ae753fe76f93246e6bc7e26a0e3bd39dc0e5e0;p=onig

Fix for invalid encoded characters: strictly check surrogate pairs in UTF-16LE and UTF-16BE
---

diff --git a/src/utf16_be.c b/src/utf16_be.c
index 22bf74d..7420a6d 100644
--- a/src/utf16_be.c
+++ b/src/utf16_be.c
@@ -2,7 +2,7 @@
   utf16_be.c -  Oniguruma (regular expression library)
 **********************************************************************/
 /*-
- * Copyright (c) 2002-2018  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
+ * Copyright (c) 2002-2019  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -103,7 +103,25 @@ utf16be_mbc_enc_len(const UChar* p)
 static int
 is_valid_mbc_string(const UChar* s, const UChar* end)
 {
-  return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_UTF16_BE, s, end);
+  while (s < end) {
+    int len = utf16be_mbc_enc_len(s);
+    if (len == 4) {
+      if (s + 2 >= end)
+        return FALSE;
+      if (! UTF16_IS_SURROGATE_SECOND(*(s+2)))
+        return FALSE;
+    }
+    else
+      if (UTF16_IS_SURROGATE_SECOND(*s))
+        return FALSE;
+
+    s += len;
+  }
+
+  if (s != end)
+    return FALSE;
+  else
+    return TRUE;
 }
 
 static int
@@ -243,7 +261,8 @@ utf16be_left_adjust_char_head(const UChar* start, const UChar* s)
     s--;
   }
 
-  if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1)
+  if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1 &&
+      UTF16_IS_SURROGATE_FIRST(*(s-2)))
     s -= 2;
 
   return (UChar* )s;
diff --git a/src/utf16_le.c b/src/utf16_le.c
index 4b231c6..c33dabd 100644
--- a/src/utf16_le.c
+++ b/src/utf16_le.c
@@ -2,7 +2,7 @@
   utf16_le.c -  Oniguruma (regular expression library)
 **********************************************************************/
 /*-
- * Copyright (c) 2002-2018  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
+ * Copyright (c) 2002-2019  K.Kosako  <sndgk393 AT ybb DOT ne DOT jp>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -110,7 +110,16 @@ is_valid_mbc_string(const UChar* p, const UChar* end)
   const UChar* end1 = end - 1;
 
   while (p < end1) {
-    p += utf16le_mbc_enc_len(p);
+    int len = utf16le_mbc_enc_len(p);
+    if (len == 4) {
+      if (p + 3 < end && ! UTF16_IS_SURROGATE_SECOND(*(p + 3)))
+        return FALSE;
+    }
+    else
+      if (UTF16_IS_SURROGATE_SECOND(*(p + 1)))
+        return FALSE;
+
+    p += len;
   }
 
   if (p != end)
@@ -252,7 +261,8 @@ utf16le_left_adjust_char_head(const UChar* start, const UChar* s)
     s--;
   }
 
-  if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1)
+  if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1 &&
+      UTF16_IS_SURROGATE_FIRST(*(s-1)))
     s -= 2;
 
   return (UChar* )s;