From: K.Kosako Date: Thu, 4 Jul 2019 23:50:07 +0000 (+0900) Subject: Fix for invalid encoded character: strict check surrogate pair of UTF-16LE,BE X-Git-Tag: v6.9.3~67 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=28ae753fe76f93246e6bc7e26a0e3bd39dc0e5e0;p=onig Fix for invalid encoded character: strict check surrogate pair of UTF-16LE,BE --- diff --git a/src/utf16_be.c b/src/utf16_be.c index 22bf74d..7420a6d 100644 --- a/src/utf16_be.c +++ b/src/utf16_be.c @@ -2,7 +2,7 @@ utf16_be.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -103,7 +103,25 @@ utf16be_mbc_enc_len(const UChar* p) static int is_valid_mbc_string(const UChar* s, const UChar* end) { - return onigenc_length_check_is_valid_mbc_string(ONIG_ENCODING_UTF16_BE, s, end); + while (s < end) { + int len = utf16be_mbc_enc_len(s); + if (len == 4) { + if (s + 2 >= end) + return FALSE; + if (! UTF16_IS_SURROGATE_SECOND(*(s+2))) + return FALSE; + } + else + if (UTF16_IS_SURROGATE_SECOND(*s)) + return FALSE; + + s += len; + } + + if (s != end) + return FALSE; + else + return TRUE; } static int @@ -243,7 +261,8 @@ utf16be_left_adjust_char_head(const UChar* start, const UChar* s) s--; } - if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1) + if (UTF16_IS_SURROGATE_SECOND(*s) && s > start + 1 && + UTF16_IS_SURROGATE_FIRST(*(s-2))) s -= 2; return (UChar* )s; diff --git a/src/utf16_le.c b/src/utf16_le.c index 4b231c6..c33dabd 100644 --- a/src/utf16_le.c +++ b/src/utf16_le.c @@ -2,7 +2,7 @@ utf16_le.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2018 K.Kosako + * Copyright (c) 2002-2019 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -110,7 +110,16 @@ is_valid_mbc_string(const UChar* p, const UChar* end) const UChar* end1 = end - 1; while (p < end1) { - p += utf16le_mbc_enc_len(p); + int len = utf16le_mbc_enc_len(p); + if (len == 4) { + if (p + 3 < end && ! UTF16_IS_SURROGATE_SECOND(*(p + 3))) + return FALSE; + } + else + if (UTF16_IS_SURROGATE_SECOND(*(p + 1))) + return FALSE; + + p += len; } if (p != end) @@ -252,7 +261,8 @@ utf16le_left_adjust_char_head(const UChar* start, const UChar* s) s--; } - if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1) + if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1 && + UTF16_IS_SURROGATE_FIRST(*(s-1))) s -= 2; return (UChar* )s;