From: K.Kosako
Date: Fri, 22 Mar 2019 05:38:39 +0000 (+0900)
Subject: remove OP_EXTENDED_GRAPHEME_CLUSTER and add OP_TEXT_SEGMENT_BOUNDARY
X-Git-Tag: v6.9.2_rc1~59
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3a8303473bd973163ced0f84bb49e0e8368b278b;p=onig

remove OP_EXTENDED_GRAPHEME_CLUSTER and add OP_TEXT_SEGMENT_BOUNDARY
---

diff --git a/src/regcomp.c b/src/regcomp.c
index 5001de7..cc6eaa2 100644
--- a/src/regcomp.c
+++ b/src/regcomp.c
@@ -1567,11 +1567,11 @@ compile_anchor_node(AnchorNode* node, regex_t* reg, ScanEnv* env)
 #endif
 
   case ANCR_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY:
-    r = add_op(reg, OP_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY);
-    break;
-
   case ANCR_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY:
-    r = add_op(reg, OP_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY);
+    r = add_op(reg, OP_TEXT_SEGMENT_BOUNDARY);
+    if (r != 0) return r;
+    COP(reg)->text_segment_boundary.not =
+      (node->type == ANCR_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY ? 1 : 0);
     break;
 
   case ANCR_PREC_READ:
diff --git a/src/regexec.c b/src/regexec.c
index 21181f2..bad4567 100644
--- a/src/regexec.c
+++ b/src/regexec.c
@@ -185,8 +185,7 @@ static OpInfoType OpInfo[] = {
   { OP_NO_WORD_BOUNDARY, "not-word-boundary" },
   { OP_WORD_BEGIN, "word-begin" },
   { OP_WORD_END, "word-end" },
-  { OP_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY, "extended-grapheme-cluster-boundary" },
-  { OP_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY, "no-extended-grapheme-cluster-boundary" },
+  { OP_TEXT_SEGMENT_BOUNDARY, "text-segment-boundary" },
   { OP_BEGIN_BUF, "begin-buf" },
   { OP_END_BUF, "end-buf" },
   { OP_BEGIN_LINE, "begin-line" },
@@ -577,6 +576,11 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index,
       break;
 #endif
 
+    case OP_TEXT_SEGMENT_BOUNDARY:
+      if (p->text_segment_boundary.not != 0)
+        fprintf(f, ":not");
+      break;
+
     case OP_FINISH:
     case OP_END:
     case OP_ANYCHAR:
@@ -587,8 +591,6 @@ print_compiled_byte_code(FILE* f, regex_t* reg, int index,
     case OP_WORD_ASCII:
     case OP_NO_WORD:
     case OP_NO_WORD_ASCII:
-    case OP_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY:
-    case OP_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY:
     case OP_BEGIN_BUF:
     case OP_END_BUF:
     case OP_BEGIN_LINE:
@@ -2495,8 +2497,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
   &&L_NO_WORD_BOUNDARY,
   &&L_WORD_BEGIN,
   &&L_WORD_END,
-  &&L_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY,
-  &&L_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY,
+  &&L_TEXT_SEGMENT_BOUNDARY,
   &&L_BEGIN_BUF,
   &&L_END_BUF,
   &&L_BEGIN_LINE,
@@ -3250,19 +3251,30 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
       goto fail;
 #endif
 
-    CASE_OP(EXTENDED_GRAPHEME_CLUSTER_BOUNDARY)
-      if (onigenc_egcb_is_break_position(encode, s, sprev, str, end)) {
-        INC_OP;
-        JUMP_OUT;
-      }
-      goto fail;
+    CASE_OP(TEXT_SEGMENT_BOUNDARY)
+      {
+        int is_break;
 
-    CASE_OP(NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY)
-      if (onigenc_egcb_is_break_position(encode, s, sprev, str, end))
-        goto fail;
+        switch (p->text_segment_boundary.type) {
+        case EXTENDED_GRAPHEME_CLUSTER_BOUNDARY:
+          is_break = onigenc_egcb_is_break_position(encode, s, sprev, str, end);
+          break;
+        default:
+          goto bytecode_error;
+          break;
+        }
 
-      INC_OP;
-      JUMP_OUT;
+        if (p->text_segment_boundary.not != 0)
+          is_break = ! is_break;
+
+        if (is_break != 0) {
+          INC_OP;
+          JUMP_OUT;
+        }
+        else {
+          goto fail;
+        }
+      }
 
     CASE_OP(BEGIN_BUF)
       if (! ON_STR_BEGIN(s)) goto fail;
diff --git a/src/regint.h b/src/regint.h
index b6892e3..783e3b3 100644
--- a/src/regint.h
+++ b/src/regint.h
@@ -533,8 +533,7 @@ enum OpCode {
   OP_WORD_BEGIN,
   OP_WORD_END,
 
-  OP_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY,
-  OP_NO_EXTENDED_GRAPHEME_CLUSTER_BOUNDARY,
+  OP_TEXT_SEGMENT_BOUNDARY,
 
   OP_BEGIN_BUF,
   OP_END_BUF,
@@ -615,6 +614,11 @@ enum UpdateVarType {
   UPDATE_VAR_RIGHT_RANGE_INIT = 4,
 };
 
+enum TextSegmentBoundaryType {
+  EXTENDED_GRAPHEME_CLUSTER_BOUNDARY = 0,
+  WORD_BOUNDARY = 1,
+};
+
 typedef int RelAddrType;
 typedef int AbsAddrType;
 typedef int LengthType;
@@ -837,6 +841,10 @@ typedef struct {
     struct {
       ModeType mode;
     } word_boundary; /* OP_WORD_BOUNDARY, OP_NO_WORD_BOUNDARY, OP_WORD_BEGIN, OP_WORD_END */
+    struct {
+      enum TextSegmentBoundaryType type;
+      int not;
+    } text_segment_boundary;
     struct {
       union {
         MemNumType n1; /* num == 1 */