ICU-5312 Regular Expressions Named Capture.

author Andy Heninger <andy.heninger@gmail.com>

Wed, 18 Feb 2015 23:56:19 +0000 (23:56 +0000)

committer Andy Heninger <andy.heninger@gmail.com>

Wed, 18 Feb 2015 23:56:19 +0000 (23:56 +0000)
author Andy Heninger <andy.heninger@gmail.com>
Wed, 18 Feb 2015 23:56:19 +0000 (23:56 +0000)
committer Andy Heninger <andy.heninger@gmail.com>
Wed, 18 Feb 2015 23:56:19 +0000 (23:56 +0000)
diff --git a/icu4c/source/common/unicode/utypes.h b/icu4c/source/common/unicode/utypes.h

index ca27880e25e21ffc3546450cdc074a5a9a2840d1..09037a81d5285c1754347d35adb05f3fc9d2d35d 100644 (file)
--- a/icu4c/source/common/unicode/utypes.h
+++ b/icu4c/source/common/unicode/utypes.h
@@ -1,6 +1,6 @@
  /*
  **********************************************************************
-*   Copyright (C) 1996-2014, International Business Machines
+*   Copyright (C) 1996-2015, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  **********************************************************************
  *
@@ -651,8 +651,9 @@ typedef enum UErrorCode {
      U_REGEX_STOPPED_BY_CALLER,            /**< Matching operation aborted by user callback fn.    */
  #ifndef U_HIDE_DRAFT_API
      U_REGEX_PATTERN_TOO_BIG,              /**< Pattern exceeds limits on size or complexity. @draft ICU 55 */
+    U_REGEX_INVALID_CAPTURE_GROUP_NAME,   /**< Invalid capture group name. @draft ICU 55 */
  #endif  /* U_HIDE_DRAFT_API */
-    U_REGEX_ERROR_LIMIT=U_REGEX_STOPPED_BY_CALLER+2, /**< This must always be the last value to indicate the limit for regexp errors */
+    U_REGEX_ERROR_LIMIT=U_REGEX_STOPPED_BY_CALLER+3, /**< This must always be the last value to indicate the limit for regexp errors */
  
      /*
       * The error code in the range 0x10400-0x104ff are reserved for IDNA related error codes
diff --git a/icu4c/source/common/utypes.c b/icu4c/source/common/utypes.c

index 32b6d880405441086e3cee665156314a21633c9b..6ff846074764009678896e1465e24c5dfaaca368 100644 (file)
--- a/icu4c/source/common/utypes.c
+++ b/icu4c/source/common/utypes.c
@@ -1,7 +1,7 @@
  /*
  ******************************************************************************
  *
-*   Copyright (C) 1997-2014, International Business Machines
+*   Copyright (C) 1997-2015, International Business Machines
  *   Corporation and others.  All Rights Reserved.
  *
  ******************************************************************************
@@ -166,7 +166,8 @@ _uRegexErrorName[U_REGEX_ERROR_LIMIT - U_REGEX_ERROR_START] = {
      "U_REGEX_STACK_OVERFLOW",
      "U_REGEX_TIME_OUT",
      "U_REGEX_STOPPED_BY_CALLER",
-    "U_REGEX_PATTERN_TOO_BIG"
+    "U_REGEX_PATTERN_TOO_BIG",
+    "U_REGEX_INVALID_CAPTURE_GROUP_NAME"
  };
  
  static const char * const
diff --git a/icu4c/source/i18n/regexcmp.cpp b/icu4c/source/i18n/regexcmp.cpp

index c3877a86aaa59e89bb9353c951914a33a44c343c..cd6ca2b2467dc2e2a3885c9041c055384be50d63 100644 (file)
--- a/icu4c/source/i18n/regexcmp.cpp
+++ b/icu4c/source/i18n/regexcmp.cpp
@@ -70,6 +70,7 @@ RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
  
      fMatchOpenParen   = -1;
      fMatchCloseParen  = -1;
+    fCaptureName      = NULL;
  
      if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
          status = rxp->fDeferredStatus;
@@ -86,6 +87,8 @@ static const UChar      chDash      = 0x2d;      // '-'
  //
  //------------------------------------------------------------------------------
  RegexCompile::~RegexCompile() {
+    delete fCaptureName;         // Normally will be NULL, but can exist if pattern
+                                 //   compilation stops with a syntax error.
  }
  
  static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
@@ -286,17 +289,6 @@ void    RegexCompile::compile(
      // The pattern has now been read and processed, and the compiled code generated.
      //
  
-    //
-    // Compute the number of digits requried for the largest capture group number.
-    //
-    fRXPat->fMaxCaptureDigits = 1;
-    int32_t  n = 10;
-    int32_t  groupCount = fRXPat->fGroupMap->size();
-    while (n <= groupCount) {
-        fRXPat->fMaxCaptureDigits++;
-        n *= 10;
-    }
-
      //
      // The pattern's fFrameSize so far has accumulated the requirements for
      //   storage for capture parentheses, counters, etc. that are encountered
@@ -438,8 +430,25 @@ UBool RegexCompile::doParseActions(int32_t action)
          break;
  
  
+    case doBeginNamedCapture:
+        // Scanning (?<letter.
+        //   The first letter of the name will come through again under doContinueNamedCapture.
+        fCaptureName = new UnicodeString();
+        if (fCaptureName == NULL) {
+            error(U_MEMORY_ALLOCATION_ERROR);
+        }
+        break;
+
+    case  doContinueNamedCapture:
+        fCaptureName->append(fC.fChar);
+        break;
+
+    case doBadNamedCapture:
+        error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
+        break;
+        
      case doOpenCaptureParen:
-        // Open Paren.
+        // Open Capturing Paren, possibly named.
          //   Compile to a
          //      - NOP, which later may be replaced by a save-state if the
          //         parenthesized group gets a * quantifier, followed by
@@ -474,8 +483,18 @@ UBool RegexCompile::doParseActions(int32_t action)
  
              // Save the mapping from group number to stack frame variable position.
              fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
+
+            // If this is a named capture group, add the name->group number mapping.
+            if (fCaptureName != NULL) {
+                int32_t groupNumber = fRXPat->fGroupMap->size();
+                int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
+                fCaptureName = NULL;    // hash table takes ownership of the name (key) string.
+                if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
+                    error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
+                }
+            }
          }
-         break;
+        break;
  
      case doOpenNonCaptureParen:
          // Open non-caputuring (grouping only) Paren.
@@ -1270,7 +1289,41 @@ UBool RegexCompile::doParseActions(int32_t action)
          }
          break;
  
+    case doBeginNamedBackRef:
+        U_ASSERT(fCaptureName == NULL);
+        fCaptureName = new UnicodeString;
+        if (fCaptureName == NULL) {
+            error(U_MEMORY_ALLOCATION_ERROR);
+        }
+        break;
+            
+    case doContinueNamedBackRef:
+        fCaptureName->append(fC.fChar);
+        break;
  
+    case doCompleteNamedBackRef:
+        {
+        int32_t groupNumber = uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName);
+        if (groupNumber == 0) {
+            // Group name has not been defined.
+            //   Could be a forward reference. If we choose to support them at some
+            //   future time, extra mechanism will be required at this point.
+            error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
+        } else {
+            // Given the number, handle identically to a \n numbered back reference.
+            // See comments above, under doBackRef
+            fixLiterals(FALSE);
+            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
+                appendOp(URX_BACKREF_I, groupNumber);
+            } else {
+                appendOp(URX_BACKREF, groupNumber);
+            }
+        }
+        delete fCaptureName;
+        fCaptureName = NULL;
+        break;
+        }
+       
      case doPossessivePlus:
          // Possessive ++ quantifier.
          // Compiles to
diff --git a/icu4c/source/i18n/regexcmp.h b/icu4c/source/i18n/regexcmp.h

index c3cc7db02fabf9cb8600c07e0dc4200cf34719bb..50bb3027fc66d4ad708afaa989a94ebd9c0522e8 100644 (file)
--- a/icu4c/source/i18n/regexcmp.h
+++ b/icu4c/source/i18n/regexcmp.h
@@ -1,7 +1,7 @@
  //
  //  regexcmp.h
  //
-//  Copyright (C) 2002-2014, International Business Machines Corporation and others.
+//  Copyright (C) 2002-2015, International Business Machines Corporation and others.
  //  All Rights Reserved.
  //
  //  This file contains declarations for the class RegexCompile
@@ -220,6 +220,9 @@ private:
      UChar32                       fLastSetLiteral;   // The last single code point added to a set.
                                                       //   needed when "-y" is scanned, and we need
                                                       //   to turn "x-y" into a range.
+
+    UnicodeString                *fCaptureName;      // Named Capture, the group name is built up
+                                                     //   in this string while being scanned.
  };
  
  // Constant values to be pushed onto fSetOpStack while scanning & evalueating [set expressions]
diff --git a/icu4c/source/i18n/regexcst.h b/icu4c/source/i18n/regexcst.h

index ab43137d8b60b05c646e6dd5e0d42617ea1b381b..e754be4bd1a32d8e6c1de2e85087f233468a3390 100644 (file)
--- a/icu4c/source/i18n/regexcst.h
+++ b/icu4c/source/i18n/regexcst.h
@@ -5,7 +5,7 @@
  //    It is generated by the Perl script "regexcst.pl" from
  //    the rule parser state definitions file "regexcst.txt".
  //
-//   Copyright (C) 2002-2007 International Business Machines Corporation 
+//   Copyright (C) 2002-2015 International Business Machines Corporation 
  //   and others. All rights reserved.  
  //
  //---------------------------------------------------------------------------------
@@ -17,100 +17,107 @@ U_NAMESPACE_BEGIN
  // Character classes for regex pattern scanning.
  //
      static const uint8_t kRuleSet_digit_char = 128;
-    static const uint8_t kRuleSet_rule_char = 129;
+    static const uint8_t kRuleSet_ascii_letter = 129;
+    static const uint8_t kRuleSet_rule_char = 130;
  
  
  enum Regex_PatternParseAction {
-    doLiteralChar,
-    doSetEnd,
-    doBackslashA,
-    doSetBeginUnion,
-    doNOP,
-    doSetBackslash_w,
-    doSetRange,
-    doBackslashG,
-    doPerlInline,
-    doSetAddDash,
-    doIntevalLowerDigit,
-    doProperty,
-    doBackslashX,
-    doOpenAtomicParen,
-    doSetLiteralEscaped,
-    doPatFinish,
-    doSetBackslash_D,
-    doSetDifference2,
-    doNamedChar,
-    doNGPlus,
+    doIntervalUpperDigit,
+    doPossessiveOpt,
      doOpenLookBehindNeg,
-    doIntervalError,
-    doIntervalSame,
-    doBackRef,
-    doPlus,
-    doOpenCaptureParen,
-    doMismatchedParenErr,
-    doBeginMatchMode,
+    doDotAny,
+    doSetBackslash_D,
+    doSetLiteral,
+    doSetBackslash_S,
      doEscapeError,
-    doOpenNonCaptureParen,
+    doSetBackslash_W,
      doDollar,
-    doSetProp,
-    doIntervalUpperDigit,
-    doSetBegin,
-    doBackslashs,
-    doOpenLookBehind,
+    doBackslashb,
+    doSetOpError,
+    doBackslashG,
+    doPatStart,
+    doMismatchedParenErr,
+    doPossessivePlus,
+    doBackslashX,
+    doSetBackslash_s,
+    doSetBackslash_w,
+    doBackslashW,
+    doBackslashw,
      doSetMatchMode,
      doOrOperator,
-    doCaret,
-    doMatchModeParen,
-    doStar,
-    doOpt,
-    doMatchMode,
-    doSuppressComments,
-    doPossessiveInterval,
      doOpenLookAheadNeg,
-    doBackslashW,
-    doCloseParen,
-    doSetOpError,
+    doOpenLookBehind,
+    doBackslashS,
+    doBeginMatchMode,
+    doNOP,
+    doSetProp,
+    doBackslashA,
      doIntervalInit,
-    doSetFinish,
-    doSetIntersection2,
-    doNGStar,
-    doEnterQuoteMode,
-    doSetAddAmp,
-    doBackslashB,
-    doBackslashw,
-    doPossessiveOpt,
+    doOpenCaptureParen,
+    doNGPlus,
+    doIntervalError,
+    doSetDifference2,
+    doNGOpt,
+    doEscapedLiteralChar,
      doSetNegate,
-    doRuleError,
-    doBackslashb,
-    doConditionalExpr,
-    doPossessivePlus,
+    doSetBegin,
+    doMatchModeParen,
+    doLiteralChar,
+    doOpt,
+    doSetIntersection2,
      doBadOpenParenType,
+    doSuppressComments,
+    doCloseParen,
+    doPatFinish,
+    doSetBeginUnion,
+    doSetBackslash_d,
+    doProperty,
      doNGInterval,
-    doSetLiteral,
-    doSetNamedChar,
-    doBackslashd,
-    doSetBeginDifference1,
-    doBackslashD,
-    doExit,
-    doSetBackslash_S,
+    doNGStar,
+    doOpenLookAhead,
+    doSetBeginIntersection1,
+    doBeginNamedCapture,
      doInterval,
+    doMatchMode,
      doSetNoCloseError,
-    doNGOpt,
+    doSetBeginDifference1,
+    doPlus,
+    doBackslashD,
+    doSetLiteralEscaped,
+    doContinueNamedCapture,
      doSetPosixProp,
-    doBackslashS,
-    doBackslashZ,
-    doSetBeginIntersection1,
-    doSetBackslash_W,
-    doSetBackslash_d,
-    doOpenLookAhead,
-    doBadModeFlag,
-    doPatStart,
+    doBackslashz,
      doSetNamedRange,
      doPossessiveStar,
-    doEscapedLiteralChar,
-    doSetBackslash_s,
-    doBackslashz,
-    doDotAny,
+    doBadModeFlag,
+    doContinueNamedBackRef,
+    doPerlInline,
+    doBackslashd,
+    doOpenNonCaptureParen,
+    doSetEnd,
+    doSetAddDash,
+    doSetFinish,
+    doCaret,
+    doConditionalExpr,
+    doExit,
+    doNamedChar,
+    doSetRange,
+    doPossessiveInterval,
+    doBackslashs,
+    doIntervalSame,
+    doEnterQuoteMode,
+    doOpenAtomicParen,
+    doSetNamedChar,
+    doRuleError,
+    doStar,
+    doSetAddAmp,
+    doBackslashB,
+    doCompleteNamedBackRef,
+    doBackslashZ,
+    doIntevalLowerDigit,
+    doBeginNamedBackRef,
+    doBackRef,
+    doBadNamedCapture,
      rbbiLastAction};
  
  //-------------------------------------------------------------------------------
@@ -132,21 +139,21 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
      {doNOP, 0, 0, 0, TRUE}
      , {doPatStart, 255, 2,0,  FALSE}     //  1      start
      , {doLiteralChar, 254, 14,0,  TRUE}     //  2      term
-    , {doLiteralChar, 129, 14,0,  TRUE}     //  3 
-    , {doSetBegin, 91 /* [ */, 104, 182, TRUE}     //  4 
+    , {doLiteralChar, 130, 14,0,  TRUE}     //  3 
+    , {doSetBegin, 91 /* [ */, 118, 196, TRUE}     //  4 
      , {doNOP, 40 /* ( */, 27,0,  TRUE}     //  5 
      , {doDotAny, 46 /* . */, 14,0,  TRUE}     //  6 
      , {doCaret, 94 /* ^ */, 14,0,  TRUE}     //  7 
      , {doDollar, 36 /* $ */, 14,0,  TRUE}     //  8 
-    , {doNOP, 92 /* \ */, 84,0,  TRUE}     //  9 
+    , {doNOP, 92 /* \ */, 89,0,  TRUE}     //  9 
      , {doOrOperator, 124 /* | */, 2,0,  TRUE}     //  10 
      , {doCloseParen, 41 /* ) */, 255,0,  TRUE}     //  11 
      , {doPatFinish, 253, 2,0,  FALSE}     //  12 
-    , {doRuleError, 255, 183,0,  FALSE}     //  13 
-    , {doNOP, 42 /* * */, 63,0,  TRUE}     //  14      expr-quant
-    , {doNOP, 43 /* + */, 66,0,  TRUE}     //  15 
-    , {doNOP, 63 /* ? */, 69,0,  TRUE}     //  16 
-    , {doIntervalInit, 123 /* { */, 72,0,  TRUE}     //  17 
+    , {doRuleError, 255, 197,0,  FALSE}     //  13 
+    , {doNOP, 42 /* * */, 68,0,  TRUE}     //  14      expr-quant
+    , {doNOP, 43 /* + */, 71,0,  TRUE}     //  15 
+    , {doNOP, 63 /* ? */, 74,0,  TRUE}     //  16 
+    , {doIntervalInit, 123 /* { */, 77,0,  TRUE}     //  17 
      , {doNOP, 40 /* ( */, 23,0,  TRUE}     //  18 
      , {doNOP, 255, 20,0,  FALSE}     //  19 
      , {doOrOperator, 124 /* | */, 2,0,  TRUE}     //  20      expr-cont
@@ -154,7 +161,7 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
      , {doNOP, 255, 2,0,  FALSE}     //  22 
      , {doSuppressComments, 63 /* ? */, 25,0,  TRUE}     //  23      open-paren-quant
      , {doNOP, 255, 27,0,  FALSE}     //  24 
-    , {doNOP, 35 /* # */, 49, 14, TRUE}     //  25      open-paren-quant2
+    , {doNOP, 35 /* # */, 50, 14, TRUE}     //  25      open-paren-quant2
      , {doNOP, 255, 29,0,  FALSE}     //  26 
      , {doSuppressComments, 63 /* ? */, 29,0,  TRUE}     //  27      open-paren
      , {doOpenCaptureParen, 255, 2, 14, FALSE}     //  28 
@@ -163,156 +170,170 @@ static const struct RegexTableEl gRuleParseStateTable[] = {
      , {doOpenLookAhead, 61 /* = */, 2, 20, TRUE}     //  31 
      , {doOpenLookAheadNeg, 33 /* ! */, 2, 20, TRUE}     //  32 
      , {doNOP, 60 /* < */, 46,0,  TRUE}     //  33 
-    , {doNOP, 35 /* # */, 49, 2, TRUE}     //  34 
-    , {doBeginMatchMode, 105 /* i */, 52,0,  FALSE}     //  35 
-    , {doBeginMatchMode, 100 /* d */, 52,0,  FALSE}     //  36 
-    , {doBeginMatchMode, 109 /* m */, 52,0,  FALSE}     //  37 
-    , {doBeginMatchMode, 115 /* s */, 52,0,  FALSE}     //  38 
-    , {doBeginMatchMode, 117 /* u */, 52,0,  FALSE}     //  39 
-    , {doBeginMatchMode, 119 /* w */, 52,0,  FALSE}     //  40 
-    , {doBeginMatchMode, 120 /* x */, 52,0,  FALSE}     //  41 
-    , {doBeginMatchMode, 45 /* - */, 52,0,  FALSE}     //  42 
-    , {doConditionalExpr, 40 /* ( */, 183,0,  TRUE}     //  43 
-    , {doPerlInline, 123 /* { */, 183,0,  TRUE}     //  44 
-    , {doBadOpenParenType, 255, 183,0,  FALSE}     //  45 
+    , {doNOP, 35 /* # */, 50, 2, TRUE}     //  34 
+    , {doBeginMatchMode, 105 /* i */, 53,0,  FALSE}     //  35 
+    , {doBeginMatchMode, 100 /* d */, 53,0,  FALSE}     //  36 
+    , {doBeginMatchMode, 109 /* m */, 53,0,  FALSE}     //  37 
+    , {doBeginMatchMode, 115 /* s */, 53,0,  FALSE}     //  38 
+    , {doBeginMatchMode, 117 /* u */, 53,0,  FALSE}     //  39 
+    , {doBeginMatchMode, 119 /* w */, 53,0,  FALSE}     //  40 
+    , {doBeginMatchMode, 120 /* x */, 53,0,  FALSE}     //  41 
+    , {doBeginMatchMode, 45 /* - */, 53,0,  FALSE}     //  42 
+    , {doConditionalExpr, 40 /* ( */, 197,0,  TRUE}     //  43 
+    , {doPerlInline, 123 /* { */, 197,0,  TRUE}     //  44 
+    , {doBadOpenParenType, 255, 197,0,  FALSE}     //  45 
      , {doOpenLookBehind, 61 /* = */, 2, 20, TRUE}     //  46      open-paren-lookbehind
      , {doOpenLookBehindNeg, 33 /* ! */, 2, 20, TRUE}     //  47 
-    , {doBadOpenParenType, 255, 183,0,  FALSE}     //  48 
-    , {doNOP, 41 /* ) */, 255,0,  TRUE}     //  49      paren-comment
-    , {doMismatchedParenErr, 253, 183,0,  FALSE}     //  50 
-    , {doNOP, 255, 49,0,  TRUE}     //  51 
-    , {doMatchMode, 105 /* i */, 52,0,  TRUE}     //  52      paren-flag
-    , {doMatchMode, 100 /* d */, 52,0,  TRUE}     //  53 
-    , {doMatchMode, 109 /* m */, 52,0,  TRUE}     //  54 
-    , {doMatchMode, 115 /* s */, 52,0,  TRUE}     //  55 
-    , {doMatchMode, 117 /* u */, 52,0,  TRUE}     //  56 
-    , {doMatchMode, 119 /* w */, 52,0,  TRUE}     //  57 
-    , {doMatchMode, 120 /* x */, 52,0,  TRUE}     //  58 
-    , {doMatchMode, 45 /* - */, 52,0,  TRUE}     //  59 
-    , {doSetMatchMode, 41 /* ) */, 2,0,  TRUE}     //  60 
-    , {doMatchModeParen, 58 /* : */, 2, 14, TRUE}     //  61 
-    , {doBadModeFlag, 255, 183,0,  FALSE}     //  62 
-    , {doNGStar, 63 /* ? */, 20,0,  TRUE}     //  63      quant-star
-    , {doPossessiveStar, 43 /* + */, 20,0,  TRUE}     //  64 
-    , {doStar, 255, 20,0,  FALSE}     //  65 
-    , {doNGPlus, 63 /* ? */, 20,0,  TRUE}     //  66      quant-plus
-    , {doPossessivePlus, 43 /* + */, 20,0,  TRUE}     //  67 
-    , {doPlus, 255, 20,0,  FALSE}     //  68 
-    , {doNGOpt, 63 /* ? */, 20,0,  TRUE}     //  69      quant-opt
-    , {doPossessiveOpt, 43 /* + */, 20,0,  TRUE}     //  70 
-    , {doOpt, 255, 20,0,  FALSE}     //  71 
-    , {doNOP, 128, 74,0,  FALSE}     //  72      interval-open
-    , {doIntervalError, 255, 183,0,  FALSE}     //  73 
-    , {doIntevalLowerDigit, 128, 74,0,  TRUE}     //  74      interval-lower
-    , {doNOP, 44 /* , */, 78,0,  TRUE}     //  75 
-    , {doIntervalSame, 125 /* } */, 81,0,  TRUE}     //  76 
-    , {doIntervalError, 255, 183,0,  FALSE}     //  77 
-    , {doIntervalUpperDigit, 128, 78,0,  TRUE}     //  78      interval-upper
-    , {doNOP, 125 /* } */, 81,0,  TRUE}     //  79 
-    , {doIntervalError, 255, 183,0,  FALSE}     //  80 
-    , {doNGInterval, 63 /* ? */, 20,0,  TRUE}     //  81      interval-type
-    , {doPossessiveInterval, 43 /* + */, 20,0,  TRUE}     //  82 
-    , {doInterval, 255, 20,0,  FALSE}     //  83 
-    , {doBackslashA, 65 /* A */, 2,0,  TRUE}     //  84      backslash
-    , {doBackslashB, 66 /* B */, 2,0,  TRUE}     //  85 
-    , {doBackslashb, 98 /* b */, 2,0,  TRUE}     //  86 
-    , {doBackslashd, 100 /* d */, 14,0,  TRUE}     //  87 
-    , {doBackslashD, 68 /* D */, 14,0,  TRUE}     //  88 
-    , {doBackslashG, 71 /* G */, 2,0,  TRUE}     //  89 
-    , {doNamedChar, 78 /* N */, 14,0,  FALSE}     //  90 
-    , {doProperty, 112 /* p */, 14,0,  FALSE}     //  91 
-    , {doProperty, 80 /* P */, 14,0,  FALSE}     //  92 
-    , {doEnterQuoteMode, 81 /* Q */, 2,0,  TRUE}     //  93 
-    , {doBackslashS, 83 /* S */, 14,0,  TRUE}     //  94 
-    , {doBackslashs, 115 /* s */, 14,0,  TRUE}     //  95 
-    , {doBackslashW, 87 /* W */, 14,0,  TRUE}     //  96 
-    , {doBackslashw, 119 /* w */, 14,0,  TRUE}     //  97 
-    , {doBackslashX, 88 /* X */, 14,0,  TRUE}     //  98 
-    , {doBackslashZ, 90 /* Z */, 2,0,  TRUE}     //  99 
-    , {doBackslashz, 122 /* z */, 2,0,  TRUE}     //  100 
-    , {doBackRef, 128, 14,0,  TRUE}     //  101 
-    , {doEscapeError, 253, 183,0,  FALSE}     //  102 
-    , {doEscapedLiteralChar, 255, 14,0,  TRUE}     //  103 
-    , {doSetNegate, 94 /* ^ */, 107,0,  TRUE}     //  104      set-open
-    , {doSetPosixProp, 58 /* : */, 109,0,  FALSE}     //  105 
-    , {doNOP, 255, 107,0,  FALSE}     //  106 
-    , {doSetLiteral, 93 /* ] */, 122,0,  TRUE}     //  107      set-open2
-    , {doNOP, 255, 112,0,  FALSE}     //  108 
-    , {doSetEnd, 93 /* ] */, 255,0,  TRUE}     //  109      set-posix
-    , {doNOP, 58 /* : */, 112,0,  FALSE}     //  110 
-    , {doRuleError, 255, 183,0,  FALSE}     //  111 
-    , {doSetEnd, 93 /* ] */, 255,0,  TRUE}     //  112      set-start
-    , {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE}     //  113 
-    , {doNOP, 92 /* \ */, 172,0,  TRUE}     //  114 
-    , {doNOP, 45 /* - */, 118,0,  TRUE}     //  115 
-    , {doNOP, 38 /* & */, 120,0,  TRUE}     //  116 
-    , {doSetLiteral, 255, 122,0,  TRUE}     //  117 
-    , {doRuleError, 45 /* - */, 183,0,  FALSE}     //  118      set-start-dash
-    , {doSetAddDash, 255, 122,0,  FALSE}     //  119 
-    , {doRuleError, 38 /* & */, 183,0,  FALSE}     //  120      set-start-amp
-    , {doSetAddAmp, 255, 122,0,  FALSE}     //  121 
-    , {doSetEnd, 93 /* ] */, 255,0,  TRUE}     //  122      set-after-lit
-    , {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE}     //  123 
-    , {doNOP, 45 /* - */, 159,0,  TRUE}     //  124 
-    , {doNOP, 38 /* & */, 150,0,  TRUE}     //  125 
-    , {doNOP, 92 /* \ */, 172,0,  TRUE}     //  126 
-    , {doSetNoCloseError, 253, 183,0,  FALSE}     //  127 
-    , {doSetLiteral, 255, 122,0,  TRUE}     //  128 
-    , {doSetEnd, 93 /* ] */, 255,0,  TRUE}     //  129      set-after-set
-    , {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE}     //  130 
-    , {doNOP, 45 /* - */, 152,0,  TRUE}     //  131 
-    , {doNOP, 38 /* & */, 147,0,  TRUE}     //  132 
-    , {doNOP, 92 /* \ */, 172,0,  TRUE}     //  133 
-    , {doSetNoCloseError, 253, 183,0,  FALSE}     //  134 
-    , {doSetLiteral, 255, 122,0,  TRUE}     //  135 
-    , {doSetEnd, 93 /* ] */, 255,0,  TRUE}     //  136      set-after-range
-    , {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE}     //  137 
-    , {doNOP, 45 /* - */, 155,0,  TRUE}     //  138 
-    , {doNOP, 38 /* & */, 157,0,  TRUE}     //  139 
-    , {doNOP, 92 /* \ */, 172,0,  TRUE}     //  140 
-    , {doSetNoCloseError, 253, 183,0,  FALSE}     //  141 
-    , {doSetLiteral, 255, 122,0,  TRUE}     //  142 
-    , {doSetBeginUnion, 91 /* [ */, 104, 129, TRUE}     //  143      set-after-op
-    , {doSetOpError, 93 /* ] */, 183,0,  FALSE}     //  144 
-    , {doNOP, 92 /* \ */, 172,0,  TRUE}     //  145 
-    , {doSetLiteral, 255, 122,0,  TRUE}     //  146 
-    , {doSetBeginIntersection1, 91 /* [ */, 104, 129, TRUE}     //  147      set-set-amp
-    , {doSetIntersection2, 38 /* & */, 143,0,  TRUE}     //  148 
-    , {doSetAddAmp, 255, 122,0,  FALSE}     //  149 
-    , {doSetIntersection2, 38 /* & */, 143,0,  TRUE}     //  150      set-lit-amp
-    , {doSetAddAmp, 255, 122,0,  FALSE}     //  151 
-    , {doSetBeginDifference1, 91 /* [ */, 104, 129, TRUE}     //  152      set-set-dash
-    , {doSetDifference2, 45 /* - */, 143,0,  TRUE}     //  153 
-    , {doSetAddDash, 255, 122,0,  FALSE}     //  154 
-    , {doSetDifference2, 45 /* - */, 143,0,  TRUE}     //  155      set-range-dash
-    , {doSetAddDash, 255, 122,0,  FALSE}     //  156 
-    , {doSetIntersection2, 38 /* & */, 143,0,  TRUE}     //  157      set-range-amp
-    , {doSetAddAmp, 255, 122,0,  FALSE}     //  158 
-    , {doSetDifference2, 45 /* - */, 143,0,  TRUE}     //  159      set-lit-dash
-    , {doSetAddDash, 91 /* [ */, 122,0,  FALSE}     //  160 
-    , {doSetAddDash, 93 /* ] */, 122,0,  FALSE}     //  161 
-    , {doNOP, 92 /* \ */, 164,0,  TRUE}     //  162 
-    , {doSetRange, 255, 136,0,  TRUE}     //  163 
-    , {doSetOpError, 115 /* s */, 183,0,  FALSE}     //  164      set-lit-dash-escape
-    , {doSetOpError, 83 /* S */, 183,0,  FALSE}     //  165 
-    , {doSetOpError, 119 /* w */, 183,0,  FALSE}     //  166 
-    , {doSetOpError, 87 /* W */, 183,0,  FALSE}     //  167 
-    , {doSetOpError, 100 /* d */, 183,0,  FALSE}     //  168 
-    , {doSetOpError, 68 /* D */, 183,0,  FALSE}     //  169 
-    , {doSetNamedRange, 78 /* N */, 136,0,  FALSE}     //  170 
-    , {doSetRange, 255, 136,0,  TRUE}     //  171 
-    , {doSetProp, 112 /* p */, 129,0,  FALSE}     //  172      set-escape
-    , {doSetProp, 80 /* P */, 129,0,  FALSE}     //  173 
-    , {doSetNamedChar, 78 /* N */, 122,0,  FALSE}     //  174 
-    , {doSetBackslash_s, 115 /* s */, 136,0,  TRUE}     //  175 
-    , {doSetBackslash_S, 83 /* S */, 136,0,  TRUE}     //  176 
-    , {doSetBackslash_w, 119 /* w */, 136,0,  TRUE}     //  177 
-    , {doSetBackslash_W, 87 /* W */, 136,0,  TRUE}     //  178 
-    , {doSetBackslash_d, 100 /* d */, 136,0,  TRUE}     //  179 
-    , {doSetBackslash_D, 68 /* D */, 136,0,  TRUE}     //  180 
-    , {doSetLiteralEscaped, 255, 122,0,  TRUE}     //  181 
-    , {doSetFinish, 255, 14,0,  FALSE}     //  182      set-finish
-    , {doExit, 255, 183,0,  TRUE}     //  183      errorDeath
+    , {doBeginNamedCapture, 129, 64,0,  FALSE}     //  48 
+    , {doBadOpenParenType, 255, 197,0,  FALSE}     //  49 
+    , {doNOP, 41 /* ) */, 255,0,  TRUE}     //  50      paren-comment
+    , {doMismatchedParenErr, 253, 197,0,  FALSE}     //  51 
+    , {doNOP, 255, 50,0,  TRUE}     //  52 
+    , {doMatchMode, 105 /* i */, 53,0,  TRUE}     //  53      paren-flag
+    , {doMatchMode, 100 /* d */, 53,0,  TRUE}     //  54 
+    , {doMatchMode, 109 /* m */, 53,0,  TRUE}     //  55 
+    , {doMatchMode, 115 /* s */, 53,0,  TRUE}     //  56 
+    , {doMatchMode, 117 /* u */, 53,0,  TRUE}     //  57 
+    , {doMatchMode, 119 /* w */, 53,0,  TRUE}     //  58 
+    , {doMatchMode, 120 /* x */, 53,0,  TRUE}     //  59 
+    , {doMatchMode, 45 /* - */, 53,0,  TRUE}     //  60 
+    , {doSetMatchMode, 41 /* ) */, 2,0,  TRUE}     //  61 
+    , {doMatchModeParen, 58 /* : */, 2, 14, TRUE}     //  62 
+    , {doBadModeFlag, 255, 197,0,  FALSE}     //  63 
+    , {doContinueNamedCapture, 129, 64,0,  TRUE}     //  64      named-capture
+    , {doContinueNamedCapture, 128, 64,0,  TRUE}     //  65 
+    , {doOpenCaptureParen, 62 /* > */, 2, 14, TRUE}     //  66 
+    , {doBadNamedCapture, 255, 197,0,  FALSE}     //  67 
+    , {doNGStar, 63 /* ? */, 20,0,  TRUE}     //  68      quant-star
+    , {doPossessiveStar, 43 /* + */, 20,0,  TRUE}     //  69 
+    , {doStar, 255, 20,0,  FALSE}     //  70 
+    , {doNGPlus, 63 /* ? */, 20,0,  TRUE}     //  71      quant-plus
+    , {doPossessivePlus, 43 /* + */, 20,0,  TRUE}     //  72 
+    , {doPlus, 255, 20,0,  FALSE}     //  73 
+    , {doNGOpt, 63 /* ? */, 20,0,  TRUE}     //  74      quant-opt
+    , {doPossessiveOpt, 43 /* + */, 20,0,  TRUE}     //  75 
+    , {doOpt, 255, 20,0,  FALSE}     //  76 
+    , {doNOP, 128, 79,0,  FALSE}     //  77      interval-open
+    , {doIntervalError, 255, 197,0,  FALSE}     //  78 
+    , {doIntevalLowerDigit, 128, 79,0,  TRUE}     //  79      interval-lower
+    , {doNOP, 44 /* , */, 83,0,  TRUE}     //  80 
+    , {doIntervalSame, 125 /* } */, 86,0,  TRUE}     //  81 
+    , {doIntervalError, 255, 197,0,  FALSE}     //  82 
+    , {doIntervalUpperDigit, 128, 83,0,  TRUE}     //  83      interval-upper
+    , {doNOP, 125 /* } */, 86,0,  TRUE}     //  84 
+    , {doIntervalError, 255, 197,0,  FALSE}     //  85 
+    , {doNGInterval, 63 /* ? */, 20,0,  TRUE}     //  86      interval-type
+    , {doPossessiveInterval, 43 /* + */, 20,0,  TRUE}     //  87 
+    , {doInterval, 255, 20,0,  FALSE}     //  88 
+    , {doBackslashA, 65 /* A */, 2,0,  TRUE}     //  89      backslash
+    , {doBackslashB, 66 /* B */, 2,0,  TRUE}     //  90 
+    , {doBackslashb, 98 /* b */, 2,0,  TRUE}     //  91 
+    , {doBackslashd, 100 /* d */, 14,0,  TRUE}     //  92 
+    , {doBackslashD, 68 /* D */, 14,0,  TRUE}     //  93 
+    , {doBackslashG, 71 /* G */, 2,0,  TRUE}     //  94 
+    , {doNOP, 107 /* k */, 110,0,  TRUE}     //  95 
+    , {doNamedChar, 78 /* N */, 14,0,  FALSE}     //  96 
+    , {doProperty, 112 /* p */, 14,0,  FALSE}     //  97 
+    , {doProperty, 80 /* P */, 14,0,  FALSE}     //  98 
+    , {doEnterQuoteMode, 81 /* Q */, 2,0,  TRUE}     //  99 
+    , {doBackslashS, 83 /* S */, 14,0,  TRUE}     //  100 
+    , {doBackslashs, 115 /* s */, 14,0,  TRUE}     //  101 
+    , {doBackslashW, 87 /* W */, 14,0,  TRUE}     //  102 
+    , {doBackslashw, 119 /* w */, 14,0,  TRUE}     //  103 
+    , {doBackslashX, 88 /* X */, 14,0,  TRUE}     //  104 
+    , {doBackslashZ, 90 /* Z */, 2,0,  TRUE}     //  105 
+    , {doBackslashz, 122 /* z */, 2,0,  TRUE}     //  106 
+    , {doBackRef, 128, 14,0,  TRUE}     //  107 
+    , {doEscapeError, 253, 197,0,  FALSE}     //  108 
+    , {doEscapedLiteralChar, 255, 14,0,  TRUE}     //  109 
+    , {doBeginNamedBackRef, 60 /* < */, 112,0,  TRUE}     //  110      named-backref
+    , {doBadNamedCapture, 255, 197,0,  FALSE}     //  111 
+    , {doContinueNamedBackRef, 129, 114,0,  TRUE}     //  112      named-backref-2
+    , {doBadNamedCapture, 255, 197,0,  FALSE}     //  113 
+    , {doContinueNamedBackRef, 129, 114,0,  TRUE}     //  114      named-backref-3
+    , {doContinueNamedBackRef, 128, 114,0,  TRUE}     //  115 
+    , {doCompleteNamedBackRef, 62 /* > */, 14,0,  TRUE}     //  116 
+    , {doBadNamedCapture, 255, 197,0,  FALSE}     //  117 
+    , {doSetNegate, 94 /* ^ */, 121,0,  TRUE}     //  118      set-open
+    , {doSetPosixProp, 58 /* : */, 123,0,  FALSE}     //  119 
+    , {doNOP, 255, 121,0,  FALSE}     //  120 
+    , {doSetLiteral, 93 /* ] */, 136,0,  TRUE}     //  121      set-open2
+    , {doNOP, 255, 126,0,  FALSE}     //  122 
+    , {doSetEnd, 93 /* ] */, 255,0,  TRUE}     //  123      set-posix
+    , {doNOP, 58 /* : */, 126,0,  FALSE}     //  124 
+    , {doRuleError, 255, 197,0,  FALSE}     //  125 
+    , {doSetEnd, 93 /* ] */, 255,0,  TRUE}     //  126      set-start
+    , {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE}     //  127 
+    , {doNOP, 92 /* \ */, 186,0,  TRUE}     //  128 
+    , {doNOP, 45 /* - */, 132,0,  TRUE}     //  129 
+    , {doNOP, 38 /* & */, 134,0,  TRUE}     //  130 
+    , {doSetLiteral, 255, 136,0,  TRUE}     //  131 
+    , {doRuleError, 45 /* - */, 197,0,  FALSE}     //  132      set-start-dash
+    , {doSetAddDash, 255, 136,0,  FALSE}     //  133 
+    , {doRuleError, 38 /* & */, 197,0,  FALSE}     //  134      set-start-amp
+    , {doSetAddAmp, 255, 136,0,  FALSE}     //  135 
+    , {doSetEnd, 93 /* ] */, 255,0,  TRUE}     //  136      set-after-lit
+    , {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE}     //  137 
+    , {doNOP, 45 /* - */, 173,0,  TRUE}     //  138 
+    , {doNOP, 38 /* & */, 164,0,  TRUE}     //  139 
+    , {doNOP, 92 /* \ */, 186,0,  TRUE}     //  140 
+    , {doSetNoCloseError, 253, 197,0,  FALSE}     //  141 
+    , {doSetLiteral, 255, 136,0,  TRUE}     //  142 
+    , {doSetEnd, 93 /* ] */, 255,0,  TRUE}     //  143      set-after-set
+    , {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE}     //  144 
+    , {doNOP, 45 /* - */, 166,0,  TRUE}     //  145 
+    , {doNOP, 38 /* & */, 161,0,  TRUE}     //  146 
+    , {doNOP, 92 /* \ */, 186,0,  TRUE}     //  147 
+    , {doSetNoCloseError, 253, 197,0,  FALSE}     //  148 
+    , {doSetLiteral, 255, 136,0,  TRUE}     //  149 
+    , {doSetEnd, 93 /* ] */, 255,0,  TRUE}     //  150      set-after-range
+    , {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE}     //  151 
+    , {doNOP, 45 /* - */, 169,0,  TRUE}     //  152 
+    , {doNOP, 38 /* & */, 171,0,  TRUE}     //  153 
+    , {doNOP, 92 /* \ */, 186,0,  TRUE}     //  154 
+    , {doSetNoCloseError, 253, 197,0,  FALSE}     //  155 
+    , {doSetLiteral, 255, 136,0,  TRUE}     //  156 
+    , {doSetBeginUnion, 91 /* [ */, 118, 143, TRUE}     //  157      set-after-op
+    , {doSetOpError, 93 /* ] */, 197,0,  FALSE}     //  158 
+    , {doNOP, 92 /* \ */, 186,0,  TRUE}     //  159 
+    , {doSetLiteral, 255, 136,0,  TRUE}     //  160 
+    , {doSetBeginIntersection1, 91 /* [ */, 118, 143, TRUE}     //  161      set-set-amp
+    , {doSetIntersection2, 38 /* & */, 157,0,  TRUE}     //  162 
+    , {doSetAddAmp, 255, 136,0,  FALSE}     //  163 
+    , {doSetIntersection2, 38 /* & */, 157,0,  TRUE}     //  164      set-lit-amp
+    , {doSetAddAmp, 255, 136,0,  FALSE}     //  165 
+    , {doSetBeginDifference1, 91 /* [ */, 118, 143, TRUE}     //  166      set-set-dash
+    , {doSetDifference2, 45 /* - */, 157,0,  TRUE}     //  167 
+    , {doSetAddDash, 255, 136,0,  FALSE}     //  168 
+    , {doSetDifference2, 45 /* - */, 157,0,  TRUE}     //  169      set-range-dash
+    , {doSetAddDash, 255, 136,0,  FALSE}     //  170 
+    , {doSetIntersection2, 38 /* & */, 157,0,  TRUE}     //  171      set-range-amp
+    , {doSetAddAmp, 255, 136,0,  FALSE}     //  172 
+    , {doSetDifference2, 45 /* - */, 157,0,  TRUE}     //  173      set-lit-dash
+    , {doSetAddDash, 91 /* [ */, 136,0,  FALSE}     //  174 
+    , {doSetAddDash, 93 /* ] */, 136,0,  FALSE}     //  175 
+    , {doNOP, 92 /* \ */, 178,0,  TRUE}     //  176 
+    , {doSetRange, 255, 150,0,  TRUE}     //  177 
+    , {doSetOpError, 115 /* s */, 197,0,  FALSE}     //  178      set-lit-dash-escape
+    , {doSetOpError, 83 /* S */, 197,0,  FALSE}     //  179 
+    , {doSetOpError, 119 /* w */, 197,0,  FALSE}     //  180 
+    , {doSetOpError, 87 /* W */, 197,0,  FALSE}     //  181 
+    , {doSetOpError, 100 /* d */, 197,0,  FALSE}     //  182 
+    , {doSetOpError, 68 /* D */, 197,0,  FALSE}     //  183 
+    , {doSetNamedRange, 78 /* N */, 150,0,  FALSE}     //  184 
+    , {doSetRange, 255, 150,0,  TRUE}     //  185 
+    , {doSetProp, 112 /* p */, 143,0,  FALSE}     //  186      set-escape
+    , {doSetProp, 80 /* P */, 143,0,  FALSE}     //  187 
+    , {doSetNamedChar, 78 /* N */, 136,0,  FALSE}     //  188 
+    , {doSetBackslash_s, 115 /* s */, 150,0,  TRUE}     //  189 
+    , {doSetBackslash_S, 83 /* S */, 150,0,  TRUE}     //  190 
+    , {doSetBackslash_w, 119 /* w */, 150,0,  TRUE}     //  191 
+    , {doSetBackslash_W, 87 /* W */, 150,0,  TRUE}     //  192 
+    , {doSetBackslash_d, 100 /* d */, 150,0,  TRUE}     //  193 
+    , {doSetBackslash_D, 68 /* D */, 150,0,  TRUE}     //  194 
+    , {doSetLiteralEscaped, 255, 136,0,  TRUE}     //  195 
+    , {doSetFinish, 255, 14,0,  FALSE}     //  196      set-finish
+    , {doExit, 255, 197,0,  TRUE}     //  197      errorDeath
   };
  static const char * const RegexStateNames[] = {    0,
       "start",
@@ -362,6 +383,7 @@ static const char * const RegexStateNames[] = {    0,
      0,
       "open-paren-lookbehind",
      0,
+    0,
      0,
       "paren-comment",
      0,
@@ -376,6 +398,10 @@ static const char * const RegexStateNames[] = {    0,
      0,
      0,
      0,
+    0,
+     "named-capture",
+    0,
+    0,
      0,
       "quant-star",
      0,
@@ -417,6 +443,15 @@ static const char * const RegexStateNames[] = {    0,
      0,
      0,
      0,
+    0,
+    0,
+     "named-backref",
+    0,
+     "named-backref-2",
+    0,
+     "named-backref-3",
+    0,
+    0,
      0,
       "set-open",
      0,
diff --git a/icu4c/source/i18n/regexcst.txt b/icu4c/source/i18n/regexcst.txt

index 77ebd9606b47356a5535e38a0afc84c992ff5e8f..fe9bc6e74cbbcd5e1c17f2222f194e64e459523d 100644 (file)
--- a/icu4c/source/i18n/regexcst.txt
+++ b/icu4c/source/i18n/regexcst.txt
@@ -1,7 +1,7 @@
  
  #*****************************************************************************
  #
-#   Copyright (C) 2002-2007, International Business Machines Corporation and others.
+#   Copyright (C) 2002-2015, International Business Machines Corporation and others.
  #   All Rights Reserved.
  #
  #*****************************************************************************
@@ -147,6 +147,7 @@ open-paren-extended:
  open-paren-lookbehind:
      '='                  n  term            ^expr-cont              doOpenLookBehind       #  (?<=
      '!'                  n  term            ^expr-cont              doOpenLookBehindNeg    #  (?<!
+    ascii_letter            named-capture                           doBeginNamedCapture    #  (?<name
      default                 errorDeath                              doBadOpenParenType
  
  
@@ -174,6 +175,14 @@ paren-flag:
      ':'                  n  term              ^expr-quant           doMatchModeParen
      default                 errorDeath                              doBadModeFlag
  
+#
+#  named-capture    (?<name> ... ), position currently on the name.
+#
+named-capture:
+    ascii_letter         n  named-capture                           doContinueNamedCapture
+    digit_char           n  named-capture                           doContinueNamedCapture
+    '>'                  n  term               ^expr-quant          doOpenCaptureParen      # common w non-named capture.
+    default                 errorDeath                              doBadNamedCapture
  
  #
  #  quant-star     Scanning a '*' quantifier.  Need to look ahead to decide
@@ -241,6 +250,7 @@ backslash:
     'd'                   n  expr-quant                              doBackslashd
     'D'                   n  expr-quant                              doBackslashD
     'G'                   n  term                                    doBackslashG
+   'k'                   n  named-backref
     'N'                      expr-quant                              doNamedChar      #   \N{NAME}  named char
     'p'                      expr-quant                              doProperty       #   \p{Lu}  style property
     'P'                      expr-quant                              doProperty
@@ -257,6 +267,24 @@ backslash:
     default               n  expr-quant                              doEscapedLiteralChar
  
  
+# named-backref   Scanned \k
+#                 Leading to \k<captureName>
+#                 Failure to get the full sequence is an error.
+#
+named-backref:
+    '<'                  n  named-backref-2                         doBeginNamedBackRef
+    default                 errorDeath                              doBadNamedCapture
+
+named-backref-2:
+    ascii_letter         n  named-backref-3                         doContinueNamedBackRef
+    default                 errorDeath                              doBadNamedCapture
+
+named-backref-3:
+    ascii_letter         n  named-backref-3                         doContinueNamedBackRef
+    digit_char           n  named-backref-3                         doContinueNamedBackRef
+    '>'                  n  expr-quant                              doCompleteNamedBackRef
+    default                 errorDeath                              doBadNamedCapture
+
  
  #
  # [set expression] parsing,
diff --git a/icu4c/source/i18n/regexst.cpp b/icu4c/source/i18n/regexst.cpp

index fa61c3de401ead00bc3992f4435057cebdd6e0c2..a8cbf23f86b9f02d98c57d17044920dca9a529a9 100644 (file)
--- a/icu4c/source/i18n/regexst.cpp
+++ b/icu4c/source/i18n/regexst.cpp
@@ -1,7 +1,7 @@
  //
  //  regexst.h
  //
-//  Copyright (C) 2004-2013, International Business Machines Corporation and others.
+//  Copyright (C) 2004-2015, International Business Machines Corporation and others.
  //  All Rights Reserved.
  //
  //  This file contains class RegexStaticSets
@@ -55,11 +55,6 @@ static const UChar gRuleSet_rule_char_pattern[]       = {
   //   \     {    \     }     \     ^     \     $     \     |     \     \     \     .     ]
      0x5c, 0x7b,0x5c, 0x7d, 0x5c, 0x5e, 0x5c, 0x24, 0x5c, 0x7c, 0x5c, 0x5c, 0x5c, 0x2e, 0x5d, 0};
  
-
-static const UChar gRuleSet_digit_char_pattern[] = {
-//    [    0      -    9     ]
-    0x5b, 0x30, 0x2d, 0x39, 0x5d, 0};
-
  //
  //   Here are the backslash escape characters that ICU's unescape() function
  //    will handle.
@@ -213,23 +208,29 @@ fEmptyText(NULL)
  
      // Sets used while parsing rules, but not referenced from the parse state table
      fRuleSets[kRuleSet_rule_char-128]   = UnicodeSet(UnicodeString(TRUE, gRuleSet_rule_char_pattern, -1),   *status);
-    fRuleSets[kRuleSet_digit_char-128]  = UnicodeSet(UnicodeString(TRUE, gRuleSet_digit_char_pattern, -1),  *status);
+    fRuleSets[kRuleSet_digit_char-128].add((UChar)0x30, (UChar)0x39);    // [0-9]
+    fRuleSets[kRuleSet_ascii_letter-128].add((UChar)0x41, (UChar)0x5A);  // [A-Z]
+    fRuleSets[kRuleSet_ascii_letter-128].add((UChar)0x61, (UChar)0x7A);  // [a-z]
      fRuleDigitsAlias = &fRuleSets[kRuleSet_digit_char-128];
-    for (i=0; i<(int32_t)(sizeof(fRuleSets)/sizeof(fRuleSets[0])); i++) {
+    for (i=0; i<UPRV_LENGTHOF(fRuleSets); i++) {
          fRuleSets[i].compact();
      }
      
      // Finally, initialize an empty string for utility purposes
      fEmptyText = utext_openUChars(NULL, NULL, 0, status);
      
-    return; // If we reached this point, everything is fine so just exit
+    if (U_SUCCESS(*status)) {
+        return;
+    }
  
  ExitConstrDeleteAll: // Remove fPropSets and fRuleSets and return error
      for (i=0; i<URX_LAST_SET; i++) {
          delete fPropSets[i];
          fPropSets[i] = NULL;
      }
-    *status = U_MEMORY_ALLOCATION_ERROR;
+    if (U_SUCCESS(*status)) {
+        *status = U_MEMORY_ALLOCATION_ERROR;
+    }
  }
  
  
diff --git a/icu4c/source/i18n/rematch.cpp b/icu4c/source/i18n/rematch.cpp

index 661334245b2f65abcd8fc0341997f77c5167b864..41330332a5068fbb55fc6fa6f3bbf1b8ea7667c2 100644 (file)
--- a/icu4c/source/i18n/rematch.cpp
+++ b/icu4c/source/i18n/rematch.cpp
@@ -257,6 +257,9 @@ void RegexMatcher::init2(UText *input, UErrorCode &status) {
  
  static const UChar BACKSLASH  = 0x5c;
  static const UChar DOLLARSIGN = 0x24;
+static const UChar LEFTBRACKET = 0x7b;
+static const UChar RIGHTBRACKET = 0x7d;
+
  //--------------------------------------------------------------------------------
  //
  //    appendReplacement
@@ -331,8 +334,7 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
      //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
      //         move entire ranges not containing substitutions.
      UTEXT_SETNATIVEINDEX(replacement, 0);
-    UChar32 c = UTEXT_NEXT32(replacement);
-    while (c != U_SENTINEL) {
+    for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL;  c = UTEXT_NEXT32(replacement)) {
          if (c == BACKSLASH) {
              // Backslash Escape.  Copy the following char out without further checks.
              //                    Note:  Surrogate pairs don't need any special handling
@@ -398,51 +400,69 @@ RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
                  }
              }
          } else {
-            // We've got a $.  Pick up a capture group number if one follows.
-            // Consume at most the number of digits necessary for the largest capture
-            // number that is valid for this pattern.
+            // We've got a $.  Pick up a capture group name or number if one follows.
+            // Consume digits so long as the resulting group number <= the number of
+            // capture groups in the pattern.
  
-            int32_t numDigits = 0;
              int32_t groupNum  = 0;
-            UChar32 digitC;
-            for (;;) {
-                digitC = UTEXT_CURRENT32(replacement);
-                if (digitC == U_SENTINEL) {
-                    break;
-                }
-                if (u_isdigit(digitC) == FALSE) {
-                    break;
+            int32_t numDigits = 0;
+            UChar32 nextChar = utext_current32(replacement);
+            if (nextChar == LEFTBRACKET) {
+                // Scan for a Named Capture Group, ${name}.
+                UnicodeString groupName;
+                utext_next32(replacement);
+                while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
+                    nextChar = utext_next32(replacement);
+                    if (nextChar == U_SENTINEL) {
+                        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+                    } else if ((nextChar >= 0x41 && nextChar <= 0x5a) ||       // A..Z
+                               (nextChar >= 0x61 && nextChar <= 0x7a) ||       // a..z
+                               (nextChar >= 0x31 && nextChar <= 0x39)) {       // 0..9
+                        groupName.append(nextChar);
+                    } else if (nextChar == RIGHTBRACKET) {
+                        groupNum = uhash_geti(fPattern->fNamedCaptureMap, &groupName);
+                        if (groupNum == 0) {
+                            status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+                        }
+                    } else {
+                        // Character was something other than a name char or a closing '}'
+                        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+                    }
                  }
-                (void)UTEXT_NEXT32(replacement);
-                groupNum=groupNum*10 + u_charDigitValue(digitC);
-                numDigits++;
-                if (numDigits >= fPattern->fMaxCaptureDigits) {
-                    break;
+                        
+            } else if (u_isdigit(nextChar)) {
+                // $n    Scan for a capture group number
+                int32_t numCaptureGroups = fPattern->fGroupMap->size();
+                for (;;) {
+                    nextChar = UTEXT_CURRENT32(replacement);
+                    if (nextChar == U_SENTINEL) {
+                        break;
+                    }
+                    if (u_isdigit(nextChar) == FALSE) {
+                        break;
+                    }
+                    int32_t nextDigitVal = u_charDigitValue(nextChar);
+                    if (groupNum*10 + nextDigitVal > numCaptureGroups) {
+                        // Don't consume the next digit if it makes the capture group number too big.
+                        if (numDigits == 0) {
+                            status = U_INDEX_OUTOFBOUNDS_ERROR;
+                        }
+                        break;
+                    }
+                    (void)UTEXT_NEXT32(replacement);
+                    groupNum=groupNum*10 + nextDigitVal; 
+                    ++numDigits;
                  }
+            } else {
+                // $ not followed by capture group name or number.
+                status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
              }
  
-
-            if (numDigits == 0) {
-                // The $ didn't introduce a group number at all.
-                // Treat it as just part of the substitution text.
-                UChar c16 = DOLLARSIGN;
-                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
-            } else {
-                // Finally, append the capture group data to the destination.
+            if (U_SUCCESS(status)) {
                  destLen += appendGroup(groupNum, dest, status);
-                if (U_FAILURE(status)) {
-                    // Can fail if group number is out of range.
-                    break;
-                }
              }
-        }
-
-        if (U_FAILURE(status)) {
-            break;
-        } else {
-            c = UTEXT_NEXT32(replacement);
-        }
-    }
+        }  // End of $ capture group handling
+    }  // End of per-character loop through the replacement string.
  
      return *this;
  }
@@ -1201,7 +1221,6 @@ UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
  }
  
  
-
  //--------------------------------------------------------------------------------
  //
  //  appendGroup() -- currently internal only, appends a group to a UText rather
@@ -1282,8 +1301,6 @@ int32_t RegexMatcher::groupCount() const {
      return fPattern->fGroupMap->size();
  }
  
-
-
  //--------------------------------------------------------------------------------
  //
  //  hasAnchoringBounds()
diff --git a/icu4c/source/i18n/repattrn.cpp b/icu4c/source/i18n/repattrn.cpp

index 7efd4cb8932cb60d5d2827748717bd2a4f98e6e5..14454e25f8faeb930cffb280d5e52f0c7c81f909 100644 (file)
--- a/icu4c/source/i18n/repattrn.cpp
+++ b/icu4c/source/i18n/repattrn.cpp
@@ -3,7 +3,7 @@
  //
  /*
  ***************************************************************************
-*   Copyright (C) 2002-2014 International Business Machines Corporation   *
+*   Copyright (C) 2002-2015 International Business Machines Corporation   *
  *   and others. All rights reserved.                                      *
  ***************************************************************************
  */
@@ -15,6 +15,7 @@
  #include "unicode/regex.h"
  #include "unicode/uclean.h"
  #include "uassert.h"
+#include "uhash.h"
  #include "uvector.h"
  #include "uvectr32.h"
  #include "uvectr64.h"
@@ -92,7 +93,6 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
      fMinMatchLen      = other.fMinMatchLen;
      fFrameSize        = other.fFrameSize;
      fDataSize         = other.fDataSize;
-    fMaxCaptureDigits = other.fMaxCaptureDigits;
      fStaticSets       = other.fStaticSets;
      fStaticSets8      = other.fStaticSets8;
  
@@ -133,6 +133,21 @@ RegexPattern &RegexPattern::operator = (const RegexPattern &other) {
          fSets8[i] = other.fSets8[i];
      }
  
+    // Copy the named capture group hash map.
+    int32_t hashPos = UHASH_FIRST;
+    while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) {
+        if (U_FAILURE(fDeferredStatus)) {
+            break;
+        }
+        const UnicodeString *name = (const UnicodeString *)hashEl->key.pointer;
+        UnicodeString *key = new UnicodeString(*name);
+        int32_t val = hashEl->value.integer;
+        if (key == NULL) {
+            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
+        } else {
+            uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus);
+        }
+    }
      return *this;
  }
  
@@ -154,7 +169,6 @@ void RegexPattern::init() {
      fFrameSize        = 0;
      fDataSize         = 0;
      fGroupMap         = NULL;
-    fMaxCaptureDigits = 1;
      fStaticSets       = NULL;
      fStaticSets8      = NULL;
      fStartType        = START_NO_INFO;
@@ -164,6 +178,7 @@ void RegexPattern::init() {
      fInitialChar      = 0;
      fInitialChars8    = NULL;
      fNeedsAltInput    = FALSE;
+    fNamedCaptureMap  = NULL;
  
      fPattern          = NULL; // will be set later
      fPatternString    = NULL; // may be set later
@@ -172,17 +187,24 @@ void RegexPattern::init() {
      fSets             = new UVector(fDeferredStatus);
      fInitialChars     = new UnicodeSet;
      fInitialChars8    = new Regex8BitSet;
+    fNamedCaptureMap  = uhash_open(uhash_hashUnicodeString,     // Key hash function
+                                   uhash_compareUnicodeString,  // Key comparator function
+                                   uhash_compareLong,           // Value comparator function
+                                   &fDeferredStatus);
      if (U_FAILURE(fDeferredStatus)) {
          return;
      }
      if (fCompiledPat == NULL  || fGroupMap == NULL || fSets == NULL ||
-        fInitialChars == NULL || fInitialChars8 == NULL) {
+            fInitialChars == NULL || fInitialChars8 == NULL || fNamedCaptureMap == NULL) {
          fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
          return;
      }
  
      // Slot zero of the vector of sets is reserved.  Fill it here.
      fSets->addElement((int32_t)0, fDeferredStatus);
+
+    // fNamedCaptureMap owns its key strings, type (UnicodeString *)
+    uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject);
  }
  
  
@@ -220,6 +242,8 @@ void RegexPattern::zap() {
          delete fPatternString;
          fPatternString = NULL;
      }
+    uhash_close(fNamedCaptureMap);
+    fNamedCaptureMap = NULL;
  }
  
  
@@ -577,6 +601,34 @@ UText *RegexPattern::patternText(UErrorCode      &status) const {
  }
  
  
+//--------------------------------------------------------------------------------
+//
+//  groupNumberFromName()   Look up a capture group number by name in
+//                          fNamedCaptureMap; a name that is not found is an error.
+//--------------------------------------------------------------------------------
+int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const {
+    if (U_FAILURE(status)) {
+        return 0;
+    }
+
+    // No need to explicitly check for syntactically valid names.
+    // Invalid ones will never be in the map, and the lookup will fail.
+
+    int32_t number = uhash_geti(fNamedCaptureMap, &groupName);
+    if (number == 0) {   // a lookup result of 0 is treated as "name not present"
+        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+    }
+    return number;
+}
+
+int32_t RegexPattern::groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const {
+    if (U_FAILURE(status)) {   // a prior failure is left untouched; 0 is returned
+        return 0;
+    }
+    UnicodeString name(groupName, nameLength, US_INV);   // build a UnicodeString from the char* name (US_INV constructor)
+    return groupNumberFromName(name, status);            // delegate to the UnicodeString overload
+}
+
  
  //---------------------------------------------------------------------
  //
@@ -754,6 +806,7 @@ void   RegexPattern::dumpOp(int32_t index) const {
  
  void RegexPattern::dumpPattern() const {
  #if defined(REGEX_DEBUG)
+    // TODO: This function assumes an ASCII based charset.
      int      index;
      int      i;
  
@@ -805,6 +858,21 @@ void RegexPattern::dumpPattern() const {
              }
      }
  
+    printf("Named Capture Groups:\n");
+    if (uhash_count(fNamedCaptureMap) == 0) {
+        printf("   None\n");
+    } else {
+        int32_t pos = UHASH_FIRST;
+        const UHashElement *el = NULL;
+        while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) {
+            const UnicodeString *name = (const UnicodeString *)el->key.pointer;
+            char s[100];
+            name->extract(0, 99, s, sizeof(s), US_INV);  // capture group names are invariant.
+            int32_t number = el->value.integer;
+            printf("   %d\t%s\n", number, s);
+        }
+    }
+
      printf("\nIndex   Binary     Type             Operand\n" \
             "-------------------------------------------\n");
      for (index = 0; index<fCompiledPat->size(); index++) {
diff --git a/icu4c/source/i18n/unicode/regex.h b/icu4c/source/i18n/unicode/regex.h

index ca5f1a9732bcb4cec09b84b981688f0ea4f266d0..90478e461facafbc261f515dc0f4a47b0ac15cd8 100644 (file)
--- a/icu4c/source/i18n/unicode/regex.h
+++ b/icu4c/source/i18n/unicode/regex.h
@@ -55,6 +55,8 @@
  
  // Forward Declarations
  
+struct UHashtable;
+
  U_NAMESPACE_BEGIN
  
  struct Regex8BitSet;
@@ -136,7 +138,7 @@ public:
  
      /**
       * Create an exact copy of this RegexPattern object.  Since RegexPattern is not
-     * intended to be subclasses, <code>clone()</code> and the copy construction are
+     * intended to be subclassed, <code>clone()</code> and the copy construction are
       * equivalent operations.
       * @return the copy of this RegexPattern
       * @stable ICU 2.4
@@ -437,6 +439,41 @@ public:
      virtual UText *patternText(UErrorCode      &status) const;
  
  
+    /**
+     * Get the group number corresponding to a named capture group.
+     * The returned number can be used with any function that accesses
+     * capture groups by number.
+     *
+     * The function returns an error status if the specified name does not
+     * appear in the pattern.
+     *
+     * @param  groupName   The capture group name.
+     * @param  status      A UErrorCode to receive any errors.
+     *
+     * @draft ICU 55
+     */
+    virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const;
+
+
+    /**
+     * Get the group number corresponding to a named capture group.
+     * The returned number can be used with any function that accesses
+     * capture groups by number.
+     *
+     * The function returns an error status if the specified name does not
+     * appear in the pattern.
+     *
+     * @param  groupName   The capture group name,
+     *                     platform invariant characters only.
+     * @param  nameLength  The length of the name, or -1 if the name is
+     *                     nul-terminated.
+     * @param  status      A UErrorCode to receive any errors.
+     *
+     * @draft ICU 55
+     */
+    virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const;
+
+
      /**
       * Split a string into fields.  Somewhat like split() from Perl or Java.
       * Pattern matches identify delimiters that separate the input
@@ -573,8 +610,6 @@ private:
      UVector32       *fGroupMap;    // Map from capture group number to position of
                                     //   the group's variables in the matcher stack frame.
  
-    int32_t         fMaxCaptureDigits;
-
      UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
                                     //   regex character classes, e.g. Word.
  
@@ -589,6 +624,8 @@ private:
      Regex8BitSet   *fInitialChars8;
      UBool           fNeedsAltInput;
  
+    UHashtable     *fNamedCaptureMap;  // Map from capture group names to numbers.
+
      friend class RegexCompile;
      friend class RegexMatcher;
      friend class RegexCImpl;
@@ -854,7 +891,6 @@ public:
      */
      virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
  
-
     /**
      *   Returns the number of capturing groups in this matcher's pattern.
      *   @return the number of capture groups
@@ -945,7 +981,6 @@ public:
      */
      virtual int64_t start64(int32_t group, UErrorCode &status) const;
  
-
     /**
      *    Returns the index in the input string of the first character following the
      *    text matched during the previous match operation.
@@ -1015,7 +1050,6 @@ public:
      */
      virtual int64_t end64(int32_t group, UErrorCode &status) const;
  
-
     /**
      *   Resets this matcher.  The effect is to remove any memory of previous matches,
      *       and to cause subsequent find() operations to begin at the beginning of
diff --git a/icu4c/source/i18n/unicode/uregex.h b/icu4c/source/i18n/unicode/uregex.h

index cb7e08d82ad7257b7498c3af8b31f84656169a8c..be32ed94a0d53b2c4a6f2b892609d967480efd95 100644 (file)
--- a/icu4c/source/i18n/unicode/uregex.h
+++ b/icu4c/source/i18n/unicode/uregex.h
@@ -607,6 +607,53 @@ U_STABLE int32_t U_EXPORT2
  uregex_groupCount(URegularExpression *regexp,
                    UErrorCode         *status);
  
+/**
+  * Get the group number corresponding to a named capture group.
+  * The returned number can be used with any function that accesses
+  * capture groups by number.
+  *
+  * The function returns an error status if the specified name does not
+  * appear in the pattern.
+  *
+  * @param  regexp      The compiled regular expression.
+  * @param  groupName   The capture group name.
+  * @param  nameLength  The length of the name, or -1 if the name is a
+  *                     nul-terminated string.
+  * @param  status      A pointer to a UErrorCode to receive any errors.
+  *
+  * @draft ICU 55
+  */
+U_DRAFT int32_t U_EXPORT2
+uregex_groupNumberFromName(URegularExpression *regexp,
+                           const UChar        *groupName,
+                           int32_t             nameLength,
+                           UErrorCode          *status);
+
+
+/**
+  * Get the group number corresponding to a named capture group.
+  * The returned number can be used with any function that access
+  * capture groups by number.
+  *
+  * The function returns an error status if the specified name does not
+  * appear in the pattern.
+  *
+  * @param  regexp      The compiled regular expression.
+  * @param  groupName   The capture group name,
+  *                     platform invariant characters only.
+  * @param  nameLength  The length of the name, or -1 if the name is
+  *                     nul-terminated.
+  * @param  status      A pointer to a UErrorCode to receive any errors.
+  *
+  * @draft ICU 55
+  */
+U_DRAFT int32_t U_EXPORT2
+uregex_groupNumberFromCName(URegularExpression *regexp,
+                            const char         *groupName,
+                            int32_t             nameLength,
+                            UErrorCode          *status);
+
+
  /** Extract the string for the specified matching expression or subexpression.
    * Group #0 is the complete string of matched text.
    * Group #1 is the text matched by the first set of capturing parentheses.
@@ -630,8 +677,8 @@ uregex_group(URegularExpression *regexp,
               int32_t             destCapacity,
               UErrorCode          *status);
  
-/** Returns a shallow immutable clone of the entire input string.  The returned UText current native index
-  *   is set to the beginning of the requested capture group.  The capture group length is also
+/** Returns a shallow immutable clone of the entire input string with the current index set
+  *   to the beginning of the requested capture group.  The capture group length is also
    *   returned via groupLength.
    * Group #0 is the complete string of matched text.
    * Group #1 is the text matched by the first set of capturing parentheses.
@@ -644,7 +691,7 @@ uregex_group(URegularExpression *regexp,
    *   @param   dest         A mutable UText in which to store the current input.
    *                         If NULL, a new UText will be created as an immutable shallow clone
    *                         of the entire input string.
-  *   @param   groupLength  The group length of the desired capture group.
+  *   @param   groupLength  The group length of the desired capture group. Output parameter.
    *   @param   status       A reference to a UErrorCode to receive any errors.
    *   @return               The subject text currently associated with this regular expression.
    *                         If a pre-allocated UText was provided, it will always be used and returned.
diff --git a/icu4c/source/i18n/uregex.cpp b/icu4c/source/i18n/uregex.cpp

index 1f110f2c7b4cf6288d3be8aafd8f21a475e2a6b9..99e94283816cd2f6eec8848dea657fa4198ec52d 100644 (file)
--- a/icu4c/source/i18n/uregex.cpp
+++ b/icu4c/source/i18n/uregex.cpp
@@ -17,14 +17,14 @@
  #include "unicode/uchar.h"
  #include "unicode/uobject.h"
  #include "unicode/utf16.h"
-#include "umutex.h"
-#include "uassert.h"
  #include "cmemory.h"
+#include "uassert.h"
+#include "uhash.h"
+#include "umutex.h"
+#include "uvectr32.h"
  
  #include "regextxt.h"
  
-#include <stdio.h>
-
  U_NAMESPACE_BEGIN
  
  #define REMAINING_CAPACITY(idx,len) ((((len)-(idx))>0)?((len)-(idx)):0)
@@ -625,6 +625,36 @@ uregex_groupCount(URegularExpression *regexp2,
  }
  
  
+//------------------------------------------------------------------------------
+//
+//    uregex_groupNumberFromName    UChar-name lookup of a capture group number;
+//                                  forwards to fPat->groupNumberFromName().
+//------------------------------------------------------------------------------
+int32_t
+uregex_groupNumberFromName(URegularExpression *regexp2,
+                           const UChar        *groupName,
+                           int32_t             nameLength,
+                           UErrorCode          *status) {
+    RegularExpression *regexp = (RegularExpression*)regexp2;
+    if (validateRE(regexp, FALSE, status) == FALSE) {   // validation failed: return 0 without doing a lookup
+        return 0;
+    }
+    int32_t  result = regexp->fPat->groupNumberFromName(UnicodeString(groupName, nameLength), *status);  // name passed as a temporary UnicodeString
+    return result;
+}
+
+int32_t
+uregex_groupNumberFromCName(URegularExpression *regexp2,
+                            const char         *groupName,
+                            int32_t             nameLength,
+                            UErrorCode          *status) {
+    RegularExpression *regexp = (RegularExpression*)regexp2;
+    if (validateRE(regexp, FALSE, status) == FALSE) {   // validation failed: return 0 without doing a lookup
+        return 0;
+    }
+    return regexp->fPat->groupNumberFromName(groupName, nameLength, *status);   // char* overload of the pattern's lookup
+}
+
  //------------------------------------------------------------------------------
  //
  //    uregex_group
@@ -1285,6 +1315,8 @@ U_NAMESPACE_END
  
  static const UChar BACKSLASH  = 0x5c;
  static const UChar DOLLARSIGN = 0x24;
+static const UChar LEFTBRACKET = 0x7b;
+static const UChar RIGHTBRACKET = 0x7d;
  
  //
  //  Move a character to an output buffer, with bounds checking on the index.
@@ -1359,10 +1391,10 @@ int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
              matchStart = (int32_t)m->fMatchStart;
          } else {
              // !!!: Would like a better way to do this!
-            UErrorCode status = U_ZERO_ERROR;
-            lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &status);
-            status = U_ZERO_ERROR;
-            matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &status);
+            UErrorCode tempStatus = U_ZERO_ERROR;
+            lastMatchEnd = utext_extract(m->fInputText, 0, m->fLastMatchEnd, NULL, 0, &tempStatus);
+            tempStatus = U_ZERO_ERROR;
+            matchStart = lastMatchEnd + utext_extract(m->fInputText, m->fLastMatchEnd, m->fMatchStart, NULL, 0, &tempStatus);
          }
          for (i=lastMatchEnd; i<matchStart; i++) {
              appendToBuf(regexp->fText[i], &destIdx, dest, capacity);
@@ -1377,7 +1409,7 @@ int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
  
      // scan the replacement text, looking for substitutions ($n) and \escapes.
      int32_t  replIdx = 0;
-    while (replIdx < replacementLength) {
+    while (replIdx < replacementLength && U_SUCCESS(*status)) {
          UChar  c = replacementText[replIdx];
          replIdx++;
          if (c != DOLLARSIGN && c != BACKSLASH) {
@@ -1426,55 +1458,84 @@ int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
              continue;
          }
  
+        // We've got a $.  Pick up the following capture group name or number.
+        // For numbers, consume only digits that produce a valid capture group for the pattern.
  
-
-        // We've got a $.  Pick up a capture group number if one follows.
-        // Consume at most the number of digits necessary for the largest capture
-        // number that is valid for this pattern.
-
-        int32_t numDigits = 0;
          int32_t groupNum  = 0;
-        UChar32 digitC;
-        for (;;) {
-            if (replIdx >= replacementLength) {
-                break;
-            }
-            U16_GET(replacementText, 0, replIdx, replacementLength, digitC);
-            if (u_isdigit(digitC) == FALSE) {
-                break;
-            }
+        U_ASSERT(c == DOLLARSIGN);
+        UChar32 c32 = -1;   // Stays -1 (neither a digit nor '{') when '$' is the last code unit of the replacement.
+        if (replIdx < replacementLength) { U16_GET(replacementText, 0, replIdx, replacementLength, c32); }  // U16_GET reads [replIdx] unconditionally.
+        if (u_isdigit(c32)) {
+            int32_t numDigits = 0;
+            int32_t numCaptureGroups = m->fPattern->fGroupMap->size();
+            for (;;) {
+                if (replIdx >= replacementLength) {
+                    break;
+                }
+                U16_GET(replacementText, 0, replIdx, replacementLength, c32);
+                if (u_isdigit(c32) == FALSE) {
+                    break;
+                }
  
+                int32_t digitVal = u_charDigitValue(c32);
+                if (groupNum * 10 + digitVal <= numCaptureGroups) {
+                    groupNum = groupNum * 10 + digitVal;
+                    U16_FWD_1(replacementText, replIdx, replacementLength);
+                    numDigits++;
+                } else {
+                    if (numDigits == 0) {
+                        *status = U_INDEX_OUTOFBOUNDS_ERROR;
+                    }
+                    break;
+                }
+            }
+        } else if (c32 == LEFTBRACKET) {
+            // Scan for Named Capture Group, ${name}.
+            UnicodeString groupName;
              U16_FWD_1(replacementText, replIdx, replacementLength);
-            groupNum=groupNum*10 + u_charDigitValue(digitC);
-            numDigits++;
-            if (numDigits >= m->fPattern->fMaxCaptureDigits) {
-                break;
+            while (U_SUCCESS(*status) && c32 != RIGHTBRACKET) { 
+                if (replIdx >= replacementLength) {
+                    *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+                    break;
+                }
+                U16_NEXT(replacementText, replIdx, replacementLength, c32);
+                if ((c32 >= 0x41 && c32 <= 0x5a) ||           // A..Z
+                        (c32 >= 0x61 && c32 <= 0x7a) ||       // a..z
+                        (c32 >= 0x30 && c32 <= 0x39)) {       // 0..9
+                    groupName.append(c32);
+                } else if (c32 == RIGHTBRACKET) {
+                    groupNum = uhash_geti(regexp->fPat->fNamedCaptureMap, &groupName);
+                    if (groupNum == 0) {
+                        // Name not defined by pattern.
+                        *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+                    }
+                } else {
+                    // Character was something other than a name char or a closing '}'
+                    *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
+                }
              }
+        } else {
+            // $ not followed by {name} or digits.
+            *status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
          }
  
  
-        if (numDigits == 0) {
-            // The $ didn't introduce a group number at all.
-            // Treat it as just part of the substitution text.
-            appendToBuf(DOLLARSIGN, &destIdx, dest, capacity);
-            continue;
-        }
-
          // Finally, append the capture group data to the destination.
-        destIdx += uregex_group((URegularExpression*)regexp, groupNum,
-                                dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
-        if (*status == U_BUFFER_OVERFLOW_ERROR) {
-            // Ignore buffer overflow when extracting the group.  We need to
-            //   continue on to get full size of the untruncated result.  We will
-            //   raise our own buffer overflow error at the end.
-            *status = U_ZERO_ERROR;
+        if (U_SUCCESS(*status)) {
+            destIdx += uregex_group((URegularExpression*)regexp, groupNum,
+                                    dest==NULL?NULL:&dest[destIdx], REMAINING_CAPACITY(destIdx, capacity), status);
+            if (*status == U_BUFFER_OVERFLOW_ERROR) {
+                // Ignore buffer overflow when extracting the group.  We need to
+                //   continue on to get full size of the untruncated result.  We will
+                //   raise our own buffer overflow error at the end.
+                *status = U_ZERO_ERROR;
+            }
          }
  
          if (U_FAILURE(*status)) {
-            // Can fail if group number is out of range.
+            // bad group number or name.
              break;
          }
-
      }
  
      //
@@ -1483,10 +1544,12 @@ int32_t RegexCImpl::appendReplacement(RegularExpression    *regexp,
      //
      if (destIdx < capacity) {
          dest[destIdx] = 0;
-    } else if (destIdx == *destCapacity) {
-        *status = U_STRING_NOT_TERMINATED_WARNING;
-    } else {
-        *status = U_BUFFER_OVERFLOW_ERROR;
+    } else if (U_SUCCESS(*status)) {
+        if (destIdx == *destCapacity) {
+            *status = U_STRING_NOT_TERMINATED_WARNING;
+        } else {
+            *status = U_BUFFER_OVERFLOW_ERROR;
+        }
      }
  
      //
diff --git a/icu4c/source/test/cintltst/reapits.c b/icu4c/source/test/cintltst/reapits.c

index 32f7f6bd2116de20f3e3ba40f4377dd47b45a164..adeb75cabe7a09ea5300360cee9af0941c605cf2 100644 (file)
--- a/icu4c/source/test/cintltst/reapits.c
+++ b/icu4c/source/test/cintltst/reapits.c
@@ -1022,7 +1022,7 @@ static void TestRegexCAPI(void) {
          TEST_ASSERT_SUCCESS(status);
          bufPtr = buf;
          bufCap = UPRV_LENGTHOF(buf);
-        u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ $ \\abc", UPRV_LENGTHOF(repl));
+        u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ \\$ \\abc", UPRV_LENGTHOF(repl));
          uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
          TEST_ASSERT_SUCCESS(status);
          TEST_ASSERT_STRING("abcAB \\ $ abc", buf, TRUE); 
@@ -1817,7 +1817,8 @@ static void TestUTextAPI(void) {
          UText   *result;
          const char str_Replxxx[] = { 0x52, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x20, 0x3c, 0x61, 0x61, 0x3e, 0x20, 0x78, 0x31, 0x78, 0x20, 0x78, 0x2e, 0x2e, 0x2e, 0x78, 0x2e, 0x00 }; /* Replace <aa> x1x x...x. */
          const char str_Nomatchhere[] = { 0x4e, 0x6f, 0x20, 0x6d, 0x61, 0x74, 0x63, 0x68, 0x20, 0x68, 0x65, 0x72, 0x65, 0x2e, 0x00 }; /* No match here. */
-        const char str_u00411U00000042a[] =  { 0x5c, 0x5c, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x31, 0x24, 0x31, 0x5c, 0x55, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x34, 0x32, 0x24, 0x5c, 0x61, 0x00 }; /* \\\u0041$1\U00000042$\a */
+        const char str_u00411U00000042a[] =  { 0x5c, 0x5c, 0x5c, 0x75, 0x30, 0x30, 0x34, 0x31, 0x24, 0x31, 
+               0x5c, 0x55, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x34, 0x32, 0x5c, 0x24, 0x5c, 0x61, 0x00 }; /* \\\u0041$1\U00000042\$\a */
          const char str_1x[] = { 0x3c, 0x24, 0x31, 0x3e, 0x00 }; /* <$1> */
          const char str_ReplaceAaaBax1xxx[] = { 0x52, 0x65, 0x70, 0x6c, 0x61, 0x63, 0x65, 0x20, 0x5c, 0x41, 0x61, 0x61, 0x42, 0x24, 0x61, 0x20, 0x78, 0x31, 0x78, 0x20, 0x78, 0x2e, 0x2e, 0x2e, 0x78, 0x2e, 0x00 }; /* Replace \AaaB$a x1x x...x. */
          status = U_ZERO_ERROR;
@@ -1925,7 +1926,7 @@ static void TestUTextAPI(void) {
          TEST_ASSERT_SUCCESS(status);
          bufPtr = buf;
          bufCap = UPRV_LENGTHOF(buf);
-        u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ $ \\abc", UPRV_LENGTHOF(repl));
+        u_uastrncpy(repl, "abc\\u0041\\U00000042 \\\\ \\$ \\abc", UPRV_LENGTHOF(repl));
          uregex_appendReplacement(re, repl, -1, &bufPtr, &bufCap, &status);
          TEST_ASSERT_SUCCESS(status);
          TEST_ASSERT_STRING("abcAB \\ $ abc", buf, TRUE); 
diff --git a/icu4c/source/test/intltest/regextst.cpp b/icu4c/source/test/intltest/regextst.cpp

index ee287f2f32b039996ef12b9affdf1230856e0910..b3ad8ccdfbe84f504e094d844be996c7fa44728d 100644 (file)
--- a/icu4c/source/test/intltest/regextst.cpp
+++ b/icu4c/source/test/intltest/regextst.cpp
@@ -148,6 +148,15 @@ void RegexTest::runIndexedTest( int32_t index, UBool exec, const char* &name, ch
          case 25: name = "TestBug11371";
              if (exec) TestBug11371();
              break;
+        case 26: name = "TestBug11480";
+            if (exec) TestBug11480();
+            break;
+        case 27: name = "NamedCapture";
+            if (exec) NamedCapture();
+            break;
+        case 28: name = "NamedCaptureLimits";
+            if (exec) NamedCaptureLimits();
+            break;
          default: name = "";
              break; //needed to end loop
      }
@@ -1429,8 +1438,8 @@ void RegexTest::API_Replace() {
      REGEX_ASSERT(dest == "The value of $1 is bc.defg");
  
      dest = matcher2->replaceFirst("$ by itself, no group number $$$", status);
-    REGEX_CHECK_STATUS;
-    REGEX_ASSERT(dest == "$ by itself, no group number $$$defg");
+    REGEX_ASSERT(U_FAILURE(status));
+    status = U_ZERO_ERROR;
  
      UnicodeString replacement = UNICODE_STRING_SIMPLE("Supplemental Digit 1 $\\U0001D7CF.");
      replacement = replacement.unescape();
@@ -2633,7 +2642,9 @@ void RegexTest::API_Replace_UTF8() {
      REGEX_ASSERT(result == &destText);
      REGEX_ASSERT_UTEXT_UTF8(str_Thevalueof1isbcdefg, result);
  
-    const char str_byitselfnogroupnumber[] = { 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c, 0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62, 0x65, 0x72, 0x20, 0x24, 0x24, 0x24, 0x00 }; /* $ by itself, no group number $$$ */
+    const char str_byitselfnogroupnumber[] = { 0x5c, 0x24, 0x20, 0x62, 0x79, 0x20, 0x69, 0x74, 0x73, 0x65, 0x6c,
+               0x66, 0x2c, 0x20, 0x6e, 0x6f, 0x20, 0x67, 0x72, 0x6f, 0x75, 0x70, 0x20, 0x6e, 0x75, 0x6d, 0x62,
+               0x65, 0x72, 0x20, 0x5c, 0x24, 0x5c, 0x24, 0x5c, 0x24, 0x00 }; /* \$ by itself, no group number \$\$\$ */
      utext_openUTF8(&replText, str_byitselfnogroupnumber, -1, &status);
      result = matcher2->replaceFirst(&replText, NULL, status);
      REGEX_CHECK_STATUS;
@@ -3108,7 +3119,7 @@ void RegexTest::API_Pattern_UTF8() {
          UnicodeString stringToSplit("first:second:third");
          UText *textToSplit = utext_openUnicodeString(NULL, &stringToSplit, &status);
          REGEX_CHECK_STATUS;
-        
+
          UText *splits[10] = {NULL};
          int32_t numFields = matcher.split(textToSplit, splits, UPRV_LENGTHOF(splits), status);
          REGEX_CHECK_STATUS;
@@ -5137,7 +5148,7 @@ void RegexTest::PreAllocatedUTextCAPI () {
  
          /* Unicode escapes */
          uregex_setText(re, text1, -1, &status);
-        regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042$\\a", -1, &status);
+        regextst_openUTF8FromInvariant(&replText, "\\\\\\u0041$1\\U00000042\\$\\a", -1, &status);
          utext_replace(&bufferText, 0, utext_nativeLength(&bufferText), NULL, 0, &status);
          result = uregex_replaceFirstUText(re, &replText, &bufferText, &status);
          REGEX_CHECK_STATUS;
@@ -5196,6 +5207,276 @@ void RegexTest::PreAllocatedUTextCAPI () {
      utext_close(&patternText);
  }
  
+
+//--------------------------------------------------------------
+//
+//  NamedCapture   Check basic named capture group functionality
+//
+//--------------------------------------------------------------
+void RegexTest::NamedCapture() {
+    UErrorCode status = U_ZERO_ERROR;
+    RegexPattern *pat = RegexPattern::compile(UnicodeString(
+            "abc()()(?<three>xyz)(de)(?<five>hmm)(?<six>oh)f\\k<five>"), 0, status);
+    REGEX_CHECK_STATUS;
+    int32_t group = pat->groupNumberFromName("five", -1, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(5 == group);
+    group = pat->groupNumberFromName("three", -1, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(3 == group);
+
+    status = U_ZERO_ERROR;
+    group = pat->groupNumberFromName(UnicodeString("six"), status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(6 == group);
+
+    status = U_ZERO_ERROR;
+    group = pat->groupNumberFromName(UnicodeString("nosuch"), status);
+    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);   // Test-framework assert; U_ASSERT is a debug-build-only internal check.
+
+    status = U_ZERO_ERROR;
+
+    // After copying a pattern, named capture should still work in the copy.
+    RegexPattern *copiedPat = new RegexPattern(*pat);
+    REGEX_ASSERT(*copiedPat == *pat);
+    delete pat; pat = NULL;  // Delete original, copy should have no references back to it.
+
+    group = copiedPat->groupNumberFromName("five", -1, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(5 == group);
+    group = copiedPat->groupNumberFromName("three", -1, status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(3 == group);
+    delete copiedPat;
+
+    // ReplaceAll with named capture group.
+    status = U_ZERO_ERROR;
+    UnicodeString text("Substitution of <<quotes>> for <<double brackets>>");
+    RegexMatcher *m = new RegexMatcher(UnicodeString("<<(?<mid>.+?)>>"), text, 0, status);
+    REGEX_CHECK_STATUS;
+    // m.pattern().dumpPattern();
+    UnicodeString replacedText = m->replaceAll("'${mid}'", status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("Substitution of 'quotes' for 'double brackets'") == replacedText);
+    delete m;
+
+    // ReplaceAll, allowed capture group numbers.
+    text = UnicodeString("abcmxyz");
+    m = new RegexMatcher(UnicodeString("..(?<one>m)(.)(.)"), text, 0, status);
+    REGEX_CHECK_STATUS;
+
+    status = U_ZERO_ERROR;
+    replacedText  = m->replaceAll(UnicodeString("<$0>"), status);   // group 0, full match, is allowed.
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("a<bcmxy>z") == replacedText);
+
+    status = U_ZERO_ERROR;
+    replacedText  = m->replaceAll(UnicodeString("<$1>"), status);      // group 1 by number.
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
+
+    status = U_ZERO_ERROR;
+    replacedText  = m->replaceAll(UnicodeString("<${one}>"), status);   // group 1 by name.
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("a<m>z") == replacedText);
+
+    status = U_ZERO_ERROR;
+    replacedText  = m->replaceAll(UnicodeString("<$2>"), status);   // group 2.
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("a<x>z") == replacedText);
+
+    status = U_ZERO_ERROR;
+    replacedText  = m->replaceAll(UnicodeString("<$3>"), status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("a<y>z") == replacedText);
+
+    status = U_ZERO_ERROR;
+    replacedText  = m->replaceAll(UnicodeString("<$4>"), status);
+    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
+
+    status = U_ZERO_ERROR;
+    replacedText  = m->replaceAll(UnicodeString("<$04>"), status);      // group 0, leading 0,
+    REGEX_CHECK_STATUS;                                                 //    trailing out-of-range 4 passes through.
+    REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == replacedText);
+
+    status = U_ZERO_ERROR;
+    replacedText  = m->replaceAll(UnicodeString("<$000016>"), status);  // Consume leading zeroes. Don't consume digits
+    REGEX_CHECK_STATUS;                                                 //   that push group num out of range.
+    REGEX_ASSERT(UnicodeString("a<m6>z") == replacedText);              //   This is group 1.
+
+    status = U_ZERO_ERROR;
+    replacedText  = m->replaceAll(UnicodeString("<$3$2$1${one}>"), status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("a<yxmm>z") == replacedText);
+
+    status = U_ZERO_ERROR;
+    replacedText  = m->replaceAll(UnicodeString("$3$2$1${one}"), status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("ayxmmz") == replacedText);
+
+    status = U_ZERO_ERROR;
+    replacedText  = m->replaceAll(UnicodeString("<${noSuchName}>"), status);
+    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
+
+    status = U_ZERO_ERROR;
+    replacedText  = m->replaceAll(UnicodeString("<${invalid-name}>"), status);
+    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
+
+    status = U_ZERO_ERROR;
+    replacedText  = m->replaceAll(UnicodeString("<${one"), status);
+    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
+
+    status = U_ZERO_ERROR;
+    replacedText  = m->replaceAll(UnicodeString("$not a capture group"), status);
+    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
+
+    delete m;
+
+    // Repeat the above replaceAll() tests using the plain C API, which
+    //  has a separate implementation internally.
+    //  TODO: factor out the test data.
+
+    status = U_ZERO_ERROR;
+    URegularExpression *re = uregex_openC("..(?<one>m)(.)(.)", 0, NULL, &status);
+    REGEX_CHECK_STATUS;
+    text = UnicodeString("abcmxyz");
+    uregex_setText(re, text.getBuffer(), text.length(), &status);
+    REGEX_CHECK_STATUS;
+
+    UChar resultBuf[100];
+    int32_t resultLength;
+    UnicodeString repl;
+
+    status = U_ZERO_ERROR;
+    repl = UnicodeString("<$0>");
+    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("a<bcmxy>z") == UnicodeString(resultBuf, resultLength));
+
+    status = U_ZERO_ERROR;
+    repl = UnicodeString("<$1>");
+    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
+
+    status = U_ZERO_ERROR;
+    repl = UnicodeString("<${one}>");
+    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("a<m>z") == UnicodeString(resultBuf, resultLength));
+
+    status = U_ZERO_ERROR;
+    repl = UnicodeString("<$2>");
+    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("a<x>z") == UnicodeString(resultBuf, resultLength));
+
+    status = U_ZERO_ERROR;
+    repl = UnicodeString("<$3>");
+    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("a<y>z") == UnicodeString(resultBuf, resultLength));
+
+    status = U_ZERO_ERROR;
+    repl = UnicodeString("<$4>");
+    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
+    REGEX_ASSERT(status == U_INDEX_OUTOFBOUNDS_ERROR);
+
+    status = U_ZERO_ERROR;
+    repl = UnicodeString("<$04>");
+    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("a<bcmxy4>z") == UnicodeString(resultBuf, resultLength));
+
+    status = U_ZERO_ERROR;
+    repl = UnicodeString("<$000016>");
+    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("a<m6>z") == UnicodeString(resultBuf, resultLength));
+
+    status = U_ZERO_ERROR;
+    repl = UnicodeString("<$3$2$1${one}>");
+    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("a<yxmm>z") == UnicodeString(resultBuf, resultLength));
+
+    status = U_ZERO_ERROR;
+    repl = UnicodeString("$3$2$1${one}");
+    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(UnicodeString("ayxmmz") == UnicodeString(resultBuf, resultLength));
+
+    status = U_ZERO_ERROR;
+    repl = UnicodeString("<${noSuchName}>");
+    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
+    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
+
+    status = U_ZERO_ERROR;
+    repl = UnicodeString("<${invalid-name}>");
+    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
+    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
+
+    status = U_ZERO_ERROR;
+    repl = UnicodeString("<${one");
+    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
+    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
+
+    status = U_ZERO_ERROR;
+    repl = UnicodeString("$not a capture group");
+    resultLength = uregex_replaceAll(re, repl.getBuffer(), repl.length(), resultBuf, UPRV_LENGTHOF(resultBuf), &status);
+    REGEX_ASSERT(status == U_REGEX_INVALID_CAPTURE_GROUP_NAME);
+
+    uregex_close(re);
+}
+
+//--------------------------------------------------------------
+//
+//  NamedCaptureLimits   Patterns with huge numbers of named capture groups.
+//                       The point is not so much what the exact limit is,
+//                       but that a largish number doesn't hit bad non-linear performance,
+//                       and that exceeding the limit fails cleanly.
+//
+//--------------------------------------------------------------
+void RegexTest::NamedCaptureLimits() {
+    if (quick) {
+        logln("Skipping test. Runs in exhuastive mode only.");
+        return;
+    }
+    const int32_t goodLimit = 1000000;     // Pattern w this many groups builds successfully.
+    const int32_t failLimit = 10000000;    // Pattern exceeds internal limits, fails to compile.
+    char nnbuf[100];
+    UnicodeString pattern;
+    int32_t nn;
+
+    for (nn=1; nn<goodLimit; nn++) {
+        sprintf(nnbuf, "(?<nn%d>)", nn);
+        pattern.append(UnicodeString(nnbuf, -1, US_INV));
+    }
+    UErrorCode status = U_ZERO_ERROR;
+    RegexPattern *pat = RegexPattern::compile(pattern, 0, status);
+    REGEX_CHECK_STATUS;
+    for (nn=1; nn<goodLimit; nn++) {
+        sprintf(nnbuf, "nn%d", nn);
+        int32_t groupNum = pat->groupNumberFromName(nnbuf, -1, status);
+        REGEX_ASSERT(nn == groupNum);
+        if (nn != groupNum) {
+            break;
+        }
+    }
+    delete pat;
+
+    pattern.remove();
+    for (nn=1; nn<failLimit; nn++) {
+        sprintf(nnbuf, "(?<nn%d>)", nn);
+        pattern.append(UnicodeString(nnbuf, -1, US_INV));
+    }
+    status = U_ZERO_ERROR;
+    pat = RegexPattern::compile(pattern, 0, status);
+    REGEX_ASSERT(status == U_REGEX_PATTERN_TOO_BIG);
+    delete pat;
+}
+
+
  //--------------------------------------------------------------
  //
  //  Bug7651   Regex pattern that exceeds default operator stack depth in matcher.
@@ -5487,5 +5768,26 @@ void RegexTest::TestBug11371() {
      }
  }
  
-#endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
+void RegexTest::TestBug11480() {
+    // C API, get capture group of a group that does not participate in the match.
+    //        (Returns a zero length string, with nul termination,
+    //         indistinguishable from a group with a zero length match.)
  
+    UErrorCode status = U_ZERO_ERROR;
+    URegularExpression *re = uregex_openC("(A)|(B)", 0, NULL, &status);
+    REGEX_CHECK_STATUS;
+    UnicodeString text = UNICODE_STRING_SIMPLE("A");
+    uregex_setText(re, text.getBuffer(), text.length(), &status);
+    REGEX_CHECK_STATUS;
+    REGEX_ASSERT(uregex_lookingAt(re, 0, &status));
+    UChar buf[10] = {(UChar)13, (UChar)13, (UChar)13, (UChar)13};
+    int32_t length = uregex_group(re, 2, buf+1, UPRV_LENGTHOF(buf)-1, &status);
+    REGEX_ASSERT(length == 0);
+    REGEX_ASSERT(buf[0] == 13);
+    REGEX_ASSERT(buf[1] == 0);
+    REGEX_ASSERT(buf[2] == 13);
+    uregex_close(re);
+}
+
+
+#endif  /* !UCONFIG_NO_REGULAR_EXPRESSIONS  */
diff --git a/icu4c/source/test/intltest/regextst.h b/icu4c/source/test/intltest/regextst.h

index 38cc4ef19d66aecb2bb843cfcebbabcf9984d2e1..0461df1ad61d953eb5fe00fbc7a8145bc8658f76 100644 (file)
--- a/icu4c/source/test/intltest/regextst.h
+++ b/icu4c/source/test/intltest/regextst.h
@@ -1,6 +1,6 @@
  /********************************************************************
   * COPYRIGHT:
- * Copyright (c) 2002-2014, International Business Machines Corporation and
+ * Copyright (c) 2002-2015, International Business Machines Corporation and
   * others. All Rights Reserved.
   ********************************************************************/
  
@@ -41,6 +41,8 @@ public:
      virtual void API_Replace_UTF8();
      virtual void PerlTestsUTF8();
      virtual void PreAllocatedUTextCAPI();
+    virtual void NamedCapture();
+    virtual void NamedCaptureLimits();
      virtual void Bug7651();
      virtual void Bug7740();
      virtual void Bug8479();
@@ -51,6 +53,7 @@ public:
      virtual void TestCaseInsensitiveStarters();
      virtual void TestBug11049();
      virtual void TestBug11371();
+    virtual void TestBug11480();
      
      // The following functions are internal to the regexp tests.
      virtual void assertUText(const char *expected, UText *actual, const char *file, int line);
diff --git a/icu4c/source/test/testdata/regextst.txt b/icu4c/source/test/testdata/regextst.txt

index 51b5fa3bf3e06106e266a6a2b4385cb878d34c90..e0f8b27d758650be96117710e2919eda20f92d6c 100644 (file)
--- a/icu4c/source/test/testdata/regextst.txt
+++ b/icu4c/source/test/testdata/regextst.txt
@@ -1,4 +1,4 @@
-# Copyright (c) 2001-2014 International Business Machines
+# Copyright (c) 2001-2015 International Business Machines
  # Corporation and others. All Rights Reserved.
  #
  #  file:
@@ -513,6 +513,15 @@
  "ab(?:(c)|(d))\1"            i "abde"
  "ab(?:(c)|(d))\1"            i "<0>ab<1>c</1>c</0>e"
  
+# Named back references
+"(?<one>abcd)\k<one>"          "<0><1>abcd</1>abcd</0>"
+"(no)?(?<one>abcd)\k<one>"     "<0><2>abcd</2>abcd</0>"
+
+"(?<a_1>...)"               E  "  "   # backref names are ascii letters & numbers only
+"(?<1a>...)"                E  "  "   # backref names must begin with a letter
+"(?<a>.)(?<a>.)"            E  "  "   # Repeated names are illegal.
+
+
  # Case Insensitive
  "aBc"                    i      "<0>ABC</0>"
  "a[^bc]d"                i      "ABD"
author	Andy Heninger <andy.heninger@gmail.com>
	Wed, 18 Feb 2015 23:56:19 +0000 (23:56 +0000)
committer	Andy Heninger <andy.heninger@gmail.com>
	Wed, 18 Feb 2015 23:56:19 +0000 (23:56 +0000)
icu4c/source/common/unicode/utypes.h		patch \| blob \| history
icu4c/source/common/utypes.c		patch \| blob \| history
icu4c/source/i18n/regexcmp.cpp		patch \| blob \| history
icu4c/source/i18n/regexcmp.h		patch \| blob \| history
icu4c/source/i18n/regexcst.h		patch \| blob \| history
icu4c/source/i18n/regexcst.txt		patch \| blob \| history
icu4c/source/i18n/regexst.cpp		patch \| blob \| history
icu4c/source/i18n/rematch.cpp		patch \| blob \| history
icu4c/source/i18n/repattrn.cpp		patch \| blob \| history
icu4c/source/i18n/unicode/regex.h		patch \| blob \| history
icu4c/source/i18n/unicode/uregex.h		patch \| blob \| history
icu4c/source/i18n/uregex.cpp		patch \| blob \| history
icu4c/source/test/cintltst/reapits.c		patch \| blob \| history
icu4c/source/test/intltest/regextst.cpp		patch \| blob \| history
icu4c/source/test/intltest/regextst.h		patch \| blob \| history
icu4c/source/test/testdata/regextst.txt		patch \| blob \| history