]> granicus.if.org Git - icu/commitdiff
ICU-11507 Regex, fix problem with backrefs of unpaired surrogates.
author Andy Heninger <andy.heninger@gmail.com>
Tue, 6 Oct 2015 23:59:28 +0000 (23:59 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Tue, 6 Oct 2015 23:59:28 +0000 (23:59 +0000)
X-SVN-Rev: 38038

icu4c/source/i18n/rematch.cpp
icu4c/source/test/testdata/regextst.txt

index c7aeac015ff3e99e9496f3a247f0e44ccbf419da..4d4aa0534f557dc7c43d42c234c05fcd73d1e4cf 100644 (file)
@@ -5232,6 +5232,12 @@ GC_Done:
                         break;
                     }
                 }
+                if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
+                        inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
+                    // Capture group ended with an unpaired lead surrogate.
+                    // Back reference is not permitted to match only the lead of a surrogate pair.
+                    success = FALSE;
+                }
                 if (success) {
                     fp->fInputIdx = inputIndex;
                 } else {
index 15d13bf156ea48750832fad9fe20a82e383c78e3..9a7a6013284db7fcab338545b36f3557fd335a22 100644 (file)
 
 "(?<=((0123456789){100000}){3000})abc"  E  "abc"
 
-
+# Bug 11507  Capture of an unpaired surrogate shouldn't allow a back reference to 
+#            match half of a surrogate pair, but only another unpaired surrogate.
+# 
+"pre(.)post\1"                  "pre\ud800post\ud800\udc00"
+"pre(.)post\1"                  "<0>pre<1>\ud800</1>post\ud800</0> fin"
"pre(.)post\1"          i       "pre\ud800post\ud800\udc00"         # case insensitive backrefs take a different code path
+"pre(.)post\1"          i       "<0>pre<1>\ud800</1>post\ud800</0> fin"
 
 #  Random debugging, Temporary
 #