granicus.if.org Git - onig/commitdiff
implement ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER and ONIG_OPTION_TEXT_SEGMENT_WORD
author K.Kosako <kkosako0@gmail.com>
Fri, 22 Mar 2019 07:46:13 +0000 (16:46 +0900)
committer K.Kosako <kkosako0@gmail.com>
Fri, 22 Mar 2019 07:46:13 +0000 (16:46 +0900)
src/regcomp.c
src/regparse.c

index dfba57b8562882cd7ecd2aeb2b77e009c1bcc10e..906136cd909a520fc88d13f81774e9a35b4362da 100644 (file)
@@ -1568,10 +1568,22 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env)
 
   case ANCR_TEXT_SEGMENT_BOUNDARY:
   case ANCR_NO_TEXT_SEGMENT_BOUNDARY:
-    r = add_op(reg, OP_TEXT_SEGMENT_BOUNDARY);
-    if (r != 0) return r;
-    COP(reg)->text_segment_boundary.not =
-      (node->type == ANCR_NO_TEXT_SEGMENT_BOUNDARY ? 1 : 0);
+    {
+      enum TextSegmentBoundaryType type;
+
+      r = add_op(reg, OP_TEXT_SEGMENT_BOUNDARY);
+      if (r != 0) return r;
+
+      type = EXTENDED_GRAPHEME_CLUSTER_BOUNDARY;
+#ifdef USE_UNICODE_WORD_BREAK
+      if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_TEXT_SEGMENT_WORD))
+        type = WORD_BOUNDARY;
+#endif
+
+      COP(reg)->text_segment_boundary.type = type;
+      COP(reg)->text_segment_boundary.not =
+        (node->type == ANCR_NO_TEXT_SEGMENT_BOUNDARY ? 1 : 0);
+    }
     break;
 
   case ANCR_PREC_READ:
index 29536edb2ce08ec39a8b583f63122e8cb9f9a1e5..4bc2d1136fd362031301457dc1595d5854dc17a7 100644 (file)
@@ -7437,6 +7437,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
 #endif
     case '-': case 'i': case 'm': case 's': case 'x':
     case 'W': case 'D': case 'S': case 'P':
+    case 'y':
       {
         int neg = 0;
 
@@ -7477,6 +7478,40 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
           case 'S': OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); break;
           case 'P': OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); break;
 
+          case 'y': /* y{g}, y{w} */
+            {
+              if (neg != 0) return ONIGERR_UNDEFINED_GROUP_OPTION;
+
+              if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
+              if (! PPEEK_IS('{')) return ONIGERR_UNDEFINED_GROUP_OPTION;
+              PFETCH(c);
+              if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
+              PFETCH(c);
+              switch (c) {
+              case 'g':
+                OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, 0);
+                OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, 1);
+                break;
+#ifdef USE_UNICODE_WORD_BREAK
+              case 'w':
+                if (! ONIGENC_IS_UNICODE_ENCODING(enc))
+                  return ONIGERR_UNDEFINED_GROUP_OPTION;
+
+                OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, 0);
+                OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, 1);
+                break;
+#endif
+              default:
+                return ONIGERR_UNDEFINED_GROUP_OPTION;
+                break;
+              }
+              if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
+              PFETCH(c);
+              if (c != '}')
+                return ONIGERR_UNDEFINED_GROUP_OPTION;
+              break;
+            } /* case 'y' */
+
           default:
             return ONIGERR_UNDEFINED_GROUP_OPTION;
           }
@@ -7508,7 +7543,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end,
 
           if (PEND) return ONIGERR_END_PATTERN_IN_GROUP;
           PFETCH(c);
-        }
+        } /* while (1) */
       }
       break;