From: K.Kosako Date: Fri, 22 Mar 2019 07:46:13 +0000 (+0900) Subject: implement ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER and ONIG_OPTION_TEXT_SEG... X-Git-Tag: v6.9.2_rc1~52 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e105f8e60fe75130693c48b12d7e40cd8a397905;p=onig implement ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER and ONIG_OPTION_TEXT_SEGMENT_WORD (y{g}, y{w}) --- diff --git a/src/regcomp.c b/src/regcomp.c index dfba57b..906136c 100644 --- a/src/regcomp.c +++ b/src/regcomp.c @@ -1568,10 +1568,22 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env) case ANCR_TEXT_SEGMENT_BOUNDARY: case ANCR_NO_TEXT_SEGMENT_BOUNDARY: - r = add_op(reg, OP_TEXT_SEGMENT_BOUNDARY); - if (r != 0) return r; - COP(reg)->text_segment_boundary.not = - (node->type == ANCR_NO_TEXT_SEGMENT_BOUNDARY ? 1 : 0); + { + enum TextSegmentBoundaryType type; + + r = add_op(reg, OP_TEXT_SEGMENT_BOUNDARY); + if (r != 0) return r; + + type = EXTENDED_GRAPHEME_CLUSTER_BOUNDARY; +#ifdef USE_UNICODE_WORD_BREAK + if (ONIG_IS_OPTION_ON(reg->options, ONIG_OPTION_TEXT_SEGMENT_WORD)) + type = WORD_BOUNDARY; +#endif + + COP(reg)->text_segment_boundary.type = type; + COP(reg)->text_segment_boundary.not = + (node->type == ANCR_NO_TEXT_SEGMENT_BOUNDARY ? 1 : 0); + } break; case ANCR_PREC_READ: diff --git a/src/regparse.c b/src/regparse.c index 29536ed..4bc2d11 100644 --- a/src/regparse.c +++ b/src/regparse.c @@ -7437,6 +7437,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, #endif case '-': case 'i': case 'm': case 's': case 'x': case 'W': case 'D': case 'S': case 'P': + case 'y': { int neg = 0; @@ -7477,6 +7478,40 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, case 'S': OPTION_NEGATE(option, ONIG_OPTION_SPACE_IS_ASCII, neg); break; case 'P': OPTION_NEGATE(option, ONIG_OPTION_POSIX_IS_ASCII, neg); break; + case 'y': /* y{g}, y{w} */ + { + if (neg != 0) return ONIGERR_UNDEFINED_GROUP_OPTION; + + if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + if (! PPEEK_IS('{')) return ONIGERR_UNDEFINED_GROUP_OPTION; + PFETCH(c); + if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + PFETCH(c); + switch (c) { + case 'g': + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, 0); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, 1); + break; +#ifdef USE_UNICODE_WORD_BREAK + case 'w': + if (! ONIGENC_IS_UNICODE_ENCODING(enc)) + return ONIGERR_UNDEFINED_GROUP_OPTION; + + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_WORD, 0); + OPTION_NEGATE(option, ONIG_OPTION_TEXT_SEGMENT_EXTENDED_GRAPHEME_CLUSTER, 1); + break; +#endif + default: + return ONIGERR_UNDEFINED_GROUP_OPTION; + break; + } + if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; + PFETCH(c); + if (c != '}') + return ONIGERR_UNDEFINED_GROUP_OPTION; + break; + } /* case 'y' */ + default: return ONIGERR_UNDEFINED_GROUP_OPTION; } @@ -7508,7 +7543,7 @@ parse_bag(Node** np, PToken* tok, int term, UChar** src, UChar* end, if (PEND) return ONIGERR_END_PATTERN_IN_GROUP; PFETCH(c); - } + } /* while (1) */ } break;