From: Seiji Masugata Date: Thu, 24 Aug 2006 17:03:47 +0000 (+0000) Subject: Updated bundled oniguruma library (used for multibyte regular expression) to 4.3.1. X-Git-Tag: php-5.2.0RC3~61 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=3ea21528397a5bd2d683c391dbced941ac631df5;p=php Updated bundled oniguruma library (used for multibyte regular expression) to 4.3.1. --- diff --git a/ext/mbstring/oniguruma/HISTORY b/ext/mbstring/oniguruma/HISTORY index 17b696f84c..f844b07658 100644 --- a/ext/mbstring/oniguruma/HISTORY +++ b/ext/mbstring/oniguruma/HISTORY @@ -1,5 +1,61 @@ History +2006/08/21: Version 4.3.1 + +2006/08/21: [test] success in ruby 1.9.0 (2006-07-28) [i686-linux]. +2006/08/21: [impl] change stack type values + and re-define STK_MASK_TO_VOID_TARGET etc... +2006/08/21: [impl] set repeat_range[].upper to 0x7fffffff as infinite. +2006/08/21: [impl] add STATE_CHECK_BUFF_MALLOC_THRESHOLD_SIZE. +2006/08/21: [impl] reduce (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} +2006/09/21: [impl] reduce (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} + if backreference is not used. +2006/08/17: [bug] should check scan_env.num_call > 0 for backrefed pattern + in combination explosion check. + +2006/08/17: Version 4.3.0 + +2006/08/17: [test] success in ruby 1.9.0 (2006-07-28) [i686-linux]. +2006/08/17: [new] add config USE_COMBINATION_EXPLOSION_CHECK. + check /(.+)*/, /(\s*foo\s*)*/ etc... + [API] add num_comb_exp_check member in regex_t. + [dist] change LTVERSION value to "1:0:0" in configure.in. +2006/08/15: [bug] OP_REPEAT_INC process in match_at(). + should check repeat-count >= range-upper and + range-upper may be infinite. + +2006/08/11: Version 4.2.3 + +2006/08/11: [test] success in ruby 1.9.0 (2006-07-28) [i686-linux]. +2006/08/10: [impl] remove double call in set_qualifier(). +2006/08/10: [impl] remove by_number member in QualifierNode. +2006/08/09: [impl] remove a comma at the end of enum ReduceType + for escape warning on Mac OS X. +2006/08/07: [impl] remove warning in regcomp.c. +2006/08/07: [spec] move definition of USE_BACKREF_AT_LEVEL into NOT_RUBY. + +2006/08/03: Version 4.2.2 + +2006/08/03: [test] success in ruby 1.9.0 (2006-07-28) [i686-linux]. +2006/08/03: [bug] (thanks Hiroyuki Yamamoto) + segmentation fault in regexec(). (POSIX API) +2006/08/02: [bug] combination of \G in look-ahead/look-behind and other + anchors(\A, \z, \Z) cause invalid result. + ex. /(?!\G)a\z/.match("ba") + start arg. of MATCH_ARG_INIT() should be original + arg. of onig_search(). + +2006/07/31: Version 4.2.1 + +2006/07/31: [test] success in ruby 1.9.0 (2006-07-28) [i686-linux]. +2006/07/31: [bug] (thanks Kimura Minoru) + re-implement bm_search_notrev(). +2006/07/31: [impl] bm_search_notrev() refactoring. +2006/07/31: [bug] (thanks Kimura Minoru) + fix incomplete multibyte string in exact info. +2006/07/31: [impl] (thanks Seiji Masugata) + remove cast in va_init_list() for Intel C Compiler. + 2006/07/18: Version 4.2.0 2006/07/18: [test] success in ruby 1.9.0 (2006-03-01) [i686-linux]. diff --git a/ext/mbstring/oniguruma/index.html b/ext/mbstring/oniguruma/index.html index a2d6c97b97..f72d9f2294 100755 --- a/ext/mbstring/oniguruma/index.html +++ b/ext/mbstring/oniguruma/index.html @@ -8,7 +8,7 @@

Oniguruma

-2006/07/18 (C) K.Kosako +2006/08/21 (C) K.Kosako

@@ -37,8 +37,8 @@ ISO-8859-11, ISO-8859-13, ISO-8859-14, ISO-8859-15, ISO-8859-16

What's new @@ -70,13 +70,13 @@ It follows the BSD license in the case of the one except for it.
Download:
@@ -87,7 +87,7 @@ It follows the BSD license in the case of the one except for it.

-
Documents: (version 4.2.0) +
Documents: (version 4.3.1)
@@ -158,7 +159,6 @@ It follows the BSD license in the case of the one except for it.
  • Regular expressions memo (Japanese page)
  • Regular expressions technique (Japanese page)
  • "Text Processing" Lecture documents (Tanaka Akira) (Japanese page) -
  • "Regex library in Ruby 1.9/2.0" Japan Ruby Conference 2006 (K.Kosako) (Japanese)
    diff --git a/ext/mbstring/oniguruma/oniguruma.h b/ext/mbstring/oniguruma/oniguruma.h index ef8a49f7da..92c67c858e 100644 --- a/ext/mbstring/oniguruma/oniguruma.h +++ b/ext/mbstring/oniguruma/oniguruma.h @@ -37,8 +37,8 @@ extern "C" { #define ONIGURUMA #define ONIGURUMA_VERSION_MAJOR 4 -#define ONIGURUMA_VERSION_MINOR 2 -#define ONIGURUMA_VERSION_TEENY 0 +#define ONIGURUMA_VERSION_MINOR 3 +#define ONIGURUMA_VERSION_TEENY 1 #ifdef __cplusplus # ifndef HAVE_PROTOTYPES @@ -744,6 +744,7 @@ typedef struct re_pattern_buffer { int num_mem; /* used memory(...) num counted from 1 */ int num_repeat; /* OP_REPEAT/OP_REPEAT_NG id-counter */ int num_null_check; /* OP_NULL_CHECK_START/END id counter */ + int num_comb_exp_check; /* combination explosion check */ int num_call; /* number of subexp call */ unsigned int capture_history; /* (?@...) flag (1-31) */ unsigned int bt_mem_start; /* need backtrack flag */ diff --git a/ext/mbstring/oniguruma/regcomp.c b/ext/mbstring/oniguruma/regcomp.c index db58be72f0..507faee8e2 100644 --- a/ext/mbstring/oniguruma/regcomp.c +++ b/ext/mbstring/oniguruma/regcomp.c @@ -186,6 +186,15 @@ add_opcode(regex_t* reg, int opcode) return 0; } +static int +add_state_check_num(regex_t* reg, int num) +{ + StateCheckNumType n = (StateCheckNumType )num; + + BBUF_ADD(reg, &n, SIZE_STATE_CHECK_NUM); + return 0; +} + static int add_rel_addr(regex_t* reg, int addr) { @@ -644,7 +653,7 @@ entry_repeat_range(regex_t* reg, int id, int lower, int upper) } p[id].lower = lower; - p[id].upper = upper; + p[id].upper = (IS_REPEAT_INFINITE(upper) ? 0x7fffffff : upper); return 0; } @@ -684,7 +693,258 @@ compile_range_repeat_node(QualifierNode* qn, int target_len, int empty_info, return r; } +static int +is_anychar_star_qualifier(QualifierNode* qn) +{ + if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && + NTYPE(qn->target) == N_ANYCHAR) + return 1; + else + return 0; +} + #define QUALIFIER_EXPAND_LIMIT_SIZE 50 +#define CKN_ON (ckn > 0) + +#ifdef USE_COMBINATION_EXPLOSION_CHECK + +static int +compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) +{ + int len, mod_tlen, cklen; + int ckn; + int infinite = IS_REPEAT_INFINITE(qn->upper); + int empty_info = qn->target_empty_info; + int tlen = compile_length_tree(qn->target, reg); + + if (tlen < 0) return tlen; + + ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0); + + cklen = (CKN_ON ? SIZE_STATE_CHECK_NUM: 0); + + /* anychar repeat */ + if (NTYPE(qn->target) == N_ANYCHAR) { + if (qn->greedy && infinite) { + if (IS_NOT_NULL(qn->next_head_exact)) + return SIZE_OP_ANYCHAR_STAR_PEEK_NEXT + tlen * qn->lower + cklen; + else + return SIZE_OP_ANYCHAR_STAR + tlen * qn->lower + cklen; + } + } + + if (empty_info != 0) + mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END); + else + mod_tlen = tlen; + + if (infinite && qn->lower <= 1) { + if (qn->greedy) { + if (qn->lower == 1) + len = SIZE_OP_JUMP; + else + len = 0; + + len += SIZE_OP_PUSH + cklen + mod_tlen + SIZE_OP_JUMP; + } + else { + if (qn->lower == 0) + len = SIZE_OP_JUMP; + else + len = 0; + + len += mod_tlen + SIZE_OP_PUSH + cklen; + } + } + else if (qn->upper == 0) { + if (qn->is_refered != 0) /* /(?..){0}/ */ + len = SIZE_OP_JUMP + tlen; + else + len = 0; + } + else if (qn->upper == 1 && qn->greedy) { + if (qn->lower == 0) { + if (CKN_ON) { + len = SIZE_OP_STATE_CHECK_PUSH + tlen; + } + else { + len = SIZE_OP_PUSH + tlen; + } + } + else { + len = tlen; + } + } + else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ + len = SIZE_OP_PUSH + cklen + SIZE_OP_JUMP + tlen; + } + else { + len = SIZE_OP_REPEAT_INC + + mod_tlen + SIZE_OPCODE + SIZE_RELADDR + SIZE_MEMNUM; + if (CKN_ON) + len += SIZE_OP_STATE_CHECK; + } + + return len; +} + +static int +compile_qualifier_node(QualifierNode* qn, regex_t* reg) +{ + int r, mod_tlen; + int ckn; + int infinite = IS_REPEAT_INFINITE(qn->upper); + int empty_info = qn->target_empty_info; + int tlen = compile_length_tree(qn->target, reg); + + if (tlen < 0) return tlen; + + ckn = ((reg->num_comb_exp_check > 0) ? qn->comb_exp_check_num : 0); + + if (is_anychar_star_qualifier(qn)) { + r = compile_tree_n_times(qn->target, qn->lower, reg); + if (r) return r; + if (IS_NOT_NULL(qn->next_head_exact)) { + if (IS_MULTILINE(reg->options)) + r = add_opcode(reg, (CKN_ON ? + OP_STATE_CHECK_ANYCHAR_ML_STAR_PEEK_NEXT + : OP_ANYCHAR_ML_STAR_PEEK_NEXT)); + else + r = add_opcode(reg, (CKN_ON ? + OP_STATE_CHECK_ANYCHAR_STAR_PEEK_NEXT + : OP_ANYCHAR_STAR_PEEK_NEXT)); + if (r) return r; + if (CKN_ON) { + r = add_state_check_num(reg, ckn); + if (r) return r; + } + + return add_bytes(reg, NSTRING(qn->next_head_exact).s, 1); + } + else { + if (IS_MULTILINE(reg->options)) { + r = add_opcode(reg, (CKN_ON ? + OP_STATE_CHECK_ANYCHAR_ML_STAR + : OP_ANYCHAR_ML_STAR)); + } + else { + r = add_opcode(reg, (CKN_ON ? + OP_STATE_CHECK_ANYCHAR_STAR + : OP_ANYCHAR_STAR)); + } + if (r) return r; + if (CKN_ON) + r = add_state_check_num(reg, ckn); + + return r; + } + } + + if (empty_info != 0) + mod_tlen = tlen + (SIZE_OP_NULL_CHECK_START + SIZE_OP_NULL_CHECK_END); + else + mod_tlen = tlen; + + if (infinite && qn->lower <= 1) { + if (qn->greedy) { + if (qn->lower == 1) { + r = add_opcode_rel_addr(reg, OP_JUMP, + (CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH)); + if (r) return r; + } + + if (CKN_ON) { + r = add_opcode(reg, OP_STATE_CHECK_PUSH); + if (r) return r; + r = add_state_check_num(reg, ckn); + if (r) return r; + r = add_rel_addr(reg, mod_tlen + SIZE_OP_JUMP); + } + else { + r = add_opcode_rel_addr(reg, OP_PUSH, mod_tlen + SIZE_OP_JUMP); + } + if (r) return r; + r = compile_tree_empty_check(qn->target, reg, empty_info); + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, + -(mod_tlen + (int )SIZE_OP_JUMP + + (int )(CKN_ON ? SIZE_OP_STATE_CHECK_PUSH : SIZE_OP_PUSH))); + } + else { + if (qn->lower == 0) { + r = add_opcode_rel_addr(reg, OP_JUMP, mod_tlen); + if (r) return r; + } + r = compile_tree_empty_check(qn->target, reg, empty_info); + if (r) return r; + if (CKN_ON) { + r = add_opcode(reg, OP_STATE_CHECK_PUSH_OR_JUMP); + if (r) return r; + r = add_state_check_num(reg, ckn); + if (r) return r; + r = add_rel_addr(reg, + -(mod_tlen + (int )SIZE_OP_STATE_CHECK_PUSH_OR_JUMP)); + } + else + r = add_opcode_rel_addr(reg, OP_PUSH, -(mod_tlen + (int )SIZE_OP_PUSH)); + } + } + else if (qn->upper == 0) { + if (qn->is_refered != 0) { /* /(?..){0}/ */ + r = add_opcode_rel_addr(reg, OP_JUMP, tlen); + if (r) return r; + r = compile_tree(qn->target, reg); + } + else + r = 0; + } + else if (qn->upper == 1 && qn->greedy) { + if (qn->lower == 0) { + if (CKN_ON) { + r = add_opcode(reg, OP_STATE_CHECK_PUSH); + if (r) return r; + r = add_state_check_num(reg, ckn); + if (r) return r; + r = add_rel_addr(reg, tlen); + } + else { + r = add_opcode_rel_addr(reg, OP_PUSH, tlen); + } + if (r) return r; + } + + r = compile_tree(qn->target, reg); + } + else if (!qn->greedy && qn->upper == 1 && qn->lower == 0) { /* '??' */ + if (CKN_ON) { + r = add_opcode(reg, OP_STATE_CHECK_PUSH); + if (r) return r; + r = add_state_check_num(reg, ckn); + if (r) return r; + r = add_rel_addr(reg, SIZE_OP_JUMP); + } + else { + r = add_opcode_rel_addr(reg, OP_PUSH, SIZE_OP_JUMP); + } + + if (r) return r; + r = add_opcode_rel_addr(reg, OP_JUMP, tlen); + if (r) return r; + r = compile_tree(qn->target, reg); + } + else { + r = compile_range_repeat_node(qn, mod_tlen, empty_info, reg); + if (CKN_ON) { + if (r) return r; + r = add_opcode(reg, OP_STATE_CHECK); + if (r) return r; + r = add_state_check_num(reg, ckn); + } + } + return r; +} + +#else /* USE_COMBINATION_EXPLOSION_CHECK */ static int compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) @@ -751,16 +1011,6 @@ compile_length_qualifier_node(QualifierNode* qn, regex_t* reg) return len; } -static int -is_anychar_star_qualifier(QualifierNode* qn) -{ - if (qn->greedy && IS_REPEAT_INFINITE(qn->upper) && - NTYPE(qn->target) == N_ANYCHAR) - return 1; - else - return 0; -} - static int compile_qualifier_node(QualifierNode* qn, regex_t* reg) { @@ -887,6 +1137,7 @@ compile_qualifier_node(QualifierNode* qn, regex_t* reg) } return r; } +#endif /* USE_COMBINATION_EXPLOSION_CHECK */ static int compile_length_option_node(EffectNode* node, regex_t* reg) @@ -1435,7 +1686,9 @@ compile_tree(Node* node, regex_t* reg) } if (r) return r; +#ifdef USE_BACKREF_AT_LEVEL add_bacref_mems: +#endif r = add_length(reg, br->back_num); if (r) return r; p = BACKREFS_P(br); @@ -3040,6 +3293,146 @@ divide_ambig_string_node(Node* node, regex_t* reg) return 0; } +#ifdef USE_COMBINATION_EXPLOSION_CHECK + +#define CEC_THRES_NUM_BIG_REPEAT 512 +#define CEC_INFINITE_NUM 0x7fffffff + +#define CEC_IN_INFINITE_REPEAT (1<<0) +#define CEC_IN_FINITE_REPEAT (1<<1) +#define CEC_CONT_BIG_REPEAT (1<<2) + +static int +setup_comb_exp_check(Node* node, int state, ScanEnv* env) +{ + int type; + int r = state; + + type = NTYPE(node); + switch (type) { + case N_LIST: + { + Node* prev = NULL_NODE; + do { + r = setup_comb_exp_check(NCONS(node).left, r, env); + prev = NCONS(node).left; + } while (r >= 0 && IS_NOT_NULL(node = NCONS(node).right)); + } + break; + + case N_ALT: + { + int ret; + do { + ret = setup_comb_exp_check(NCONS(node).left, state, env); + r |= ret; + } while (ret >= 0 && IS_NOT_NULL(node = NCONS(node).right)); + } + break; + + case N_QUALIFIER: + { + int child_state = state; + int add_state = 0; + QualifierNode* qn = &(NQUALIFIER(node)); + Node* target = qn->target; + int var_num; + + if (! IS_REPEAT_INFINITE(qn->upper)) { + if (qn->upper > 1) { + /* {0,1}, {1,1} are allowed */ + child_state |= CEC_IN_FINITE_REPEAT; + + /* check (a*){n,m}, (a+){n,m} => (a*){n,n}, (a+){n,n} */ + if (env->backrefed_mem == 0) { + if (NTYPE(qn->target) == N_EFFECT) { + EffectNode* en = &(NEFFECT(qn->target)); + if (en->type == EFFECT_MEMORY) { + if (NTYPE(en->target) == N_QUALIFIER) { + QualifierNode* q = &(NQUALIFIER(en->target)); + if (IS_REPEAT_INFINITE(q->upper) + && q->greedy == qn->greedy) { + qn->upper = (qn->lower == 0 ? 1 : qn->lower); + if (qn->upper == 1) + child_state = state; + } + } + } + } + } + } + } + + if (state & CEC_IN_FINITE_REPEAT) { + qn->comb_exp_check_num = -1; + } + else { + if (IS_REPEAT_INFINITE(qn->upper)) { + var_num = CEC_INFINITE_NUM; + child_state |= CEC_IN_INFINITE_REPEAT; + } + else { + var_num = qn->upper - qn->lower; + } + + if (var_num >= CEC_THRES_NUM_BIG_REPEAT) + add_state |= CEC_CONT_BIG_REPEAT; + + if (((state & CEC_IN_INFINITE_REPEAT) != 0 && var_num != 0) || + ((state & CEC_CONT_BIG_REPEAT) != 0 && + var_num >= CEC_THRES_NUM_BIG_REPEAT)) { + if (qn->comb_exp_check_num == 0) { + env->num_comb_exp_check++; + qn->comb_exp_check_num = env->num_comb_exp_check; + if (env->curr_max_regnum > env->comb_exp_max_regnum) + env->comb_exp_max_regnum = env->curr_max_regnum; + } + } + } + + r = setup_comb_exp_check(target, child_state, env); + r |= add_state; + } + break; + + case N_EFFECT: + { + EffectNode* en = &(NEFFECT(node)); + + switch (en->type) { + case EFFECT_MEMORY: + { + if (env->curr_max_regnum < en->regnum) + env->curr_max_regnum = en->regnum; + + r = setup_comb_exp_check(en->target, state, env); + } + break; + + default: + r = setup_comb_exp_check(en->target, state, env); + break; + } + } + break; + +#ifdef USE_SUBEXP_CALL + case N_CALL: + if (IS_CALL_RECURSION(&(NCALL(node)))) + env->has_recursion = 1; + else + r = setup_comb_exp_check(NCALL(node).target, state, env); + break; +#endif + + default: + break; + } + + return r; +} +#endif + #define IN_ALT (1<<0) #define IN_NOT (1<<1) #define IN_REPEAT (1<<2) @@ -3600,9 +3993,10 @@ copy_opt_exact_info(OptExactInfo* to, OptExactInfo* from) } static void -concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add) +concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add, OnigEncoding enc) { - int i, n; + int i, j, len; + UChar *p, *end; OptAncInfo tanc; if (! to->ignore_case && add->ignore_case) { @@ -3611,11 +4005,17 @@ concat_opt_exact_info(OptExactInfo* to, OptExactInfo* add) to->ignore_case = 1; } - for (i = to->len, n = 0; n < add->len && i < OPT_EXACT_MAXLEN; i++, n++) - to->s[i] = add->s[n]; + p = add->s; + end = p + add->len; + for (i = to->len; p < end; ) { + len = enc_len(enc, p); + if (i + len > OPT_EXACT_MAXLEN) break; + for (j = 0; j < len && p < end; j++) + to->s[i++] = *p++; + } to->len = i; - to->reach_end = (n == add->len ? add->reach_end : 0); + to->reach_end = (p == end ? add->reach_end : 0); concat_opt_anc_info(&tanc, &to->anc, &add->anc, 1, 1); if (! to->reach_end) tanc.right_anchor = 0; @@ -3630,15 +4030,10 @@ concat_opt_exact_info_str(OptExactInfo* to, UChar *p; for (i = to->len, p = s; p < end && i < OPT_EXACT_MAXLEN; ) { - if (raw) { + len = enc_len(enc, p); + if (i + len > OPT_EXACT_MAXLEN) break; + for (j = 0; j < len && p < end; j++) to->s[i++] = *p++; - } - else { - len = enc_len(enc, p); - if (i + len > OPT_EXACT_MAXLEN) break; - for (j = 0; j < len; j++) - to->s[i++] = *p++; - } } to->len = i; @@ -3903,11 +4298,11 @@ concat_left_node_opt_info(OnigEncoding enc, NodeOptInfo* to, NodeOptInfo* add) if (add->exb.len > 0) { if (exb_reach) { - concat_opt_exact_info(&to->exb, &add->exb); + concat_opt_exact_info(&to->exb, &add->exb, enc); clear_opt_exact_info(&add->exb); } else if (exm_reach) { - concat_opt_exact_info(&to->exm, &add->exb); + concat_opt_exact_info(&to->exm, &add->exb, enc); clear_opt_exact_info(&add->exb); } } @@ -4206,7 +4601,7 @@ optimize_node_left(Node* node, NodeOptInfo* opt, OptEnv* env) if (nopt.exb.reach_end) { for (i = 2; i < qn->lower && ! is_full_opt_exact_info(&opt->exb); i++) { - concat_opt_exact_info(&opt->exb, &nopt.exb); + concat_opt_exact_info(&opt->exb, &nopt.exb, env->enc); } if (i < qn->lower) { opt->exb.reach_end = 0; @@ -4744,6 +5139,9 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, reg->num_null_check = 0; reg->repeat_range_alloc = 0; reg->repeat_range = (OnigRepeatRange* )NULL; +#ifdef USE_COMBINATION_EXPLOSION_CHECK + reg->num_comb_exp_check = 0; +#endif r = onig_parse_make_tree(&root, pattern, pattern_end, reg, &scan_env); if (r != 0) goto err; @@ -4797,6 +5195,33 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end, reg->bt_mem_end |= reg->capture_history; } +#ifdef USE_COMBINATION_EXPLOSION_CHECK + if (scan_env.backrefed_mem == 0 +#ifdef USE_SUBEXP_CALL + || scan_env.num_call == 0 +#endif + ) { + setup_comb_exp_check(root, 0, &scan_env); +#ifdef USE_SUBEXP_CALL + if (scan_env.has_recursion != 0) { + scan_env.num_comb_exp_check = 0; + } + else +#endif + if (scan_env.comb_exp_max_regnum > 0) { + int i; + for (i = 1; i <= scan_env.comb_exp_max_regnum; i++) { + if (BIT_STATUS_AT(scan_env.backrefed_mem, i) != 0) { + scan_env.num_comb_exp_check = 0; + break; + } + } + } + } + + reg->num_comb_exp_check = scan_env.num_comb_exp_check; +#endif + clear_optimize_info(reg); #ifndef ONIG_DONT_OPTIMIZE r = set_optimize_info_from_tree(root, reg, &scan_env); @@ -5006,6 +5431,16 @@ onig_end() #ifdef ONIG_DEBUG +/* arguments type */ +#define ARG_SPECIAL -1 +#define ARG_NON 0 +#define ARG_RELADDR 1 +#define ARG_ABSADDR 2 +#define ARG_LENGTH 3 +#define ARG_MEMNUM 4 +#define ARG_OPTION 5 +#define ARG_STATE_CHECK 6 + OnigOpInfoType OnigOpInfo[] = { { OP_FINISH, "finish", ARG_NON }, { OP_END, "end", ARG_NON }, @@ -5036,63 +5471,73 @@ OnigOpInfoType OnigOpInfo[] = { { OP_ANYCHAR_ML_STAR, "anychar-ml*", ARG_NON }, { OP_ANYCHAR_STAR_PEEK_NEXT, "anychar*-peek-next", ARG_SPECIAL }, { OP_ANYCHAR_ML_STAR_PEEK_NEXT, "anychar-ml*-peek-next", ARG_SPECIAL }, - { OP_WORD, "word", ARG_NON }, - { OP_NOT_WORD, "not-word", ARG_NON }, - { OP_WORD_SB, "word-sb", ARG_NON }, - { OP_WORD_MB, "word-mb", ARG_NON }, - { OP_WORD_BOUND, "word-bound", ARG_NON }, - { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON }, - { OP_WORD_BEGIN, "word-begin", ARG_NON }, - { OP_WORD_END, "word-end", ARG_NON }, - { OP_BEGIN_BUF, "begin-buf", ARG_NON }, - { OP_END_BUF, "end-buf", ARG_NON }, - { OP_BEGIN_LINE, "begin-line", ARG_NON }, - { OP_END_LINE, "end-line", ARG_NON }, - { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON }, - { OP_BEGIN_POSITION, "begin-position", ARG_NON }, - { OP_BACKREF1, "backref1", ARG_NON }, - { OP_BACKREF2, "backref2", ARG_NON }, - { OP_BACKREF3, "backref3", ARG_NON }, - { OP_BACKREFN, "backrefn", ARG_MEMNUM }, - { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL }, - { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL }, - { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL }, - { OP_BACKREF_AT_LEVEL, "backref_at_level", ARG_SPECIAL }, - { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM }, - { OP_MEMORY_START, "mem-start", ARG_MEMNUM }, - { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM }, - { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM }, - { OP_MEMORY_END, "mem-end", ARG_MEMNUM }, - { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM }, - { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION }, - { OP_SET_OPTION, "set-option", ARG_OPTION }, - { OP_FAIL, "fail", ARG_NON }, - { OP_JUMP, "jump", ARG_RELADDR }, - { OP_PUSH, "push", ARG_RELADDR }, - { OP_POP, "pop", ARG_NON }, - { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL }, - { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL }, - { OP_REPEAT, "repeat", ARG_SPECIAL }, - { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL }, - { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM }, - { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM }, - { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM }, - { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM }, - { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM }, - { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM }, - { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM }, - { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM }, - { OP_PUSH_POS, "push-pos", ARG_NON }, - { OP_POP_POS, "pop-pos", ARG_NON }, - { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR }, - { OP_FAIL_POS, "fail-pos", ARG_NON }, - { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON }, - { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON }, - { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL }, + { OP_WORD, "word", ARG_NON }, + { OP_NOT_WORD, "not-word", ARG_NON }, + { OP_WORD_SB, "word-sb", ARG_NON }, + { OP_WORD_MB, "word-mb", ARG_NON }, + { OP_WORD_BOUND, "word-bound", ARG_NON }, + { OP_NOT_WORD_BOUND, "not-word-bound", ARG_NON }, + { OP_WORD_BEGIN, "word-begin", ARG_NON }, + { OP_WORD_END, "word-end", ARG_NON }, + { OP_BEGIN_BUF, "begin-buf", ARG_NON }, + { OP_END_BUF, "end-buf", ARG_NON }, + { OP_BEGIN_LINE, "begin-line", ARG_NON }, + { OP_END_LINE, "end-line", ARG_NON }, + { OP_SEMI_END_BUF, "semi-end-buf", ARG_NON }, + { OP_BEGIN_POSITION, "begin-position", ARG_NON }, + { OP_BACKREF1, "backref1", ARG_NON }, + { OP_BACKREF2, "backref2", ARG_NON }, + { OP_BACKREF3, "backref3", ARG_NON }, + { OP_BACKREFN, "backrefn", ARG_MEMNUM }, + { OP_BACKREFN_IC, "backrefn-ic", ARG_SPECIAL }, + { OP_BACKREF_MULTI, "backref_multi", ARG_SPECIAL }, + { OP_BACKREF_MULTI_IC, "backref_multi-ic", ARG_SPECIAL }, + { OP_BACKREF_AT_LEVEL, "backref_at_level", ARG_SPECIAL }, + { OP_MEMORY_START_PUSH, "mem-start-push", ARG_MEMNUM }, + { OP_MEMORY_START, "mem-start", ARG_MEMNUM }, + { OP_MEMORY_END_PUSH, "mem-end-push", ARG_MEMNUM }, + { OP_MEMORY_END_PUSH_REC, "mem-end-push-rec", ARG_MEMNUM }, + { OP_MEMORY_END, "mem-end", ARG_MEMNUM }, + { OP_MEMORY_END_REC, "mem-end-rec", ARG_MEMNUM }, + { OP_SET_OPTION_PUSH, "set-option-push", ARG_OPTION }, + { OP_SET_OPTION, "set-option", ARG_OPTION }, + { OP_FAIL, "fail", ARG_NON }, + { OP_JUMP, "jump", ARG_RELADDR }, + { OP_PUSH, "push", ARG_RELADDR }, + { OP_POP, "pop", ARG_NON }, + { OP_PUSH_OR_JUMP_EXACT1, "push-or-jump-e1", ARG_SPECIAL }, + { OP_PUSH_IF_PEEK_NEXT, "push-if-peek-next", ARG_SPECIAL }, + { OP_REPEAT, "repeat", ARG_SPECIAL }, + { OP_REPEAT_NG, "repeat-ng", ARG_SPECIAL }, + { OP_REPEAT_INC, "repeat-inc", ARG_MEMNUM }, + { OP_REPEAT_INC_NG, "repeat-inc-ng", ARG_MEMNUM }, + { OP_REPEAT_INC_SG, "repeat-inc-sg", ARG_MEMNUM }, + { OP_REPEAT_INC_NG_SG, "repeat-inc-ng-sg", ARG_MEMNUM }, + { OP_NULL_CHECK_START, "null-check-start", ARG_MEMNUM }, + { OP_NULL_CHECK_END, "null-check-end", ARG_MEMNUM }, + { OP_NULL_CHECK_END_MEMST,"null-check-end-memst", ARG_MEMNUM }, + { OP_NULL_CHECK_END_MEMST_PUSH,"null-check-end-memst-push", ARG_MEMNUM }, + { OP_PUSH_POS, "push-pos", ARG_NON }, + { OP_POP_POS, "pop-pos", ARG_NON }, + { OP_PUSH_POS_NOT, "push-pos-not", ARG_RELADDR }, + { OP_FAIL_POS, "fail-pos", ARG_NON }, + { OP_PUSH_STOP_BT, "push-stop-bt", ARG_NON }, + { OP_POP_STOP_BT, "pop-stop-bt", ARG_NON }, + { OP_LOOK_BEHIND, "look-behind", ARG_SPECIAL }, { OP_PUSH_LOOK_BEHIND_NOT, "push-look-behind-not", ARG_SPECIAL }, { OP_FAIL_LOOK_BEHIND_NOT, "fail-look-behind-not", ARG_NON }, - { OP_CALL, "call", ARG_ABSADDR }, - { OP_RETURN, "return", ARG_NON }, + { OP_CALL, "call", ARG_ABSADDR }, + { OP_RETURN, "return", ARG_NON }, + { OP_STATE_CHECK_PUSH, "state-check-push", ARG_SPECIAL }, + { OP_STATE_CHECK_PUSH_OR_JUMP, "state-check-push-or-jump", ARG_SPECIAL }, + { OP_STATE_CHECK, "state-check", ARG_STATE_CHECK }, + { OP_STATE_CHECK_ANYCHAR_STAR, "state-check-anychar*", ARG_STATE_CHECK }, + { OP_STATE_CHECK_ANYCHAR_ML_STAR, + "state-check-anychar-ml*", ARG_STATE_CHECK }, + { OP_STATE_CHECK_ANYCHAR_STAR_PEEK_NEXT, + "state-check-anychar*-peek-next", ARG_SPECIAL }, + { OP_STATE_CHECK_ANYCHAR_ML_STAR_PEEK_NEXT, + "state-check-anychar-ml*-peek-next", ARG_SPECIAL }, { -1, "", ARG_NON } }; @@ -5151,6 +5596,7 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, RelAddrType addr; LengthType len; MemNumType mem; + StateCheckNumType scn; OnigCodePoint code; UChar *q; @@ -5185,6 +5631,12 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, fprintf(f, ":%d", option); } break; + + case ARG_STATE_CHECK: + scn = *((StateCheckNumType* )bp); + bp += SIZE_STATE_CHECK_NUM; + fprintf(f, ":%d", scn); + break; } } else { @@ -5362,6 +5814,24 @@ onig_print_compiled_byte_code(FILE* f, UChar* bp, UChar** nextp, fprintf(f, ":%d:(%d)", len, addr); break; + case OP_STATE_CHECK_PUSH: + case OP_STATE_CHECK_PUSH_OR_JUMP: + scn = *((StateCheckNumType* )bp); + bp += SIZE_STATE_CHECK_NUM; + addr = *((RelAddrType* )bp); + bp += SIZE_RELADDR; + fprintf(f, ":%d:(%d)", scn, addr); + break; + + case OP_STATE_CHECK_ANYCHAR_STAR_PEEK_NEXT: + case OP_STATE_CHECK_ANYCHAR_ML_STAR_PEEK_NEXT: + scn = *((StateCheckNumType* )bp); + bp += SIZE_STATE_CHECK_NUM; + fprintf(f, ":%d", scn); + p_string(f, 1, bp); + bp += 1; + break; + default: fprintf(stderr, "onig_print_compiled_byte_code: undefined code %d\n", *--bp); diff --git a/ext/mbstring/oniguruma/regexec.c b/ext/mbstring/oniguruma/regexec.c index 7a1a35a0e0..93662fea7f 100644 --- a/ext/mbstring/oniguruma/regexec.c +++ b/ext/mbstring/oniguruma/regexec.c @@ -306,6 +306,9 @@ typedef struct _StackType { UChar *pcode; /* byte code position */ UChar *pstr; /* string position */ UChar *pstr_prev; /* previous char position of pstr */ +#ifdef USE_COMBINATION_EXPLOSION_CHECK + unsigned int state_check; +#endif } state; struct { int count; /* for OP_REPEAT_INC, OP_REPEAT_INC_NG */ @@ -339,29 +342,28 @@ typedef struct _StackType { /* stack type */ /* used by normal-POP */ #define STK_ALT 0x0001 -#define STK_LOOK_BEHIND_NOT 0x0003 -#define STK_POS_NOT 0x0005 -/* avoided by normal-POP, but value should be small */ -#define STK_NULL_CHECK_START 0x0100 +#define STK_LOOK_BEHIND_NOT 0x0002 +#define STK_POS_NOT 0x0003 /* handled by normal-POP */ -#define STK_MEM_START 0x0200 -#define STK_MEM_END 0x0300 -#define STK_REPEAT_INC 0x0400 +#define STK_MEM_START 0x0100 +#define STK_MEM_END 0x8200 +#define STK_REPEAT_INC 0x0300 +#define STK_STATE_CHECK_MARK 0x1000 /* avoided by normal-POP */ +#define STK_NULL_CHECK_START 0x3000 +#define STK_NULL_CHECK_END 0x5000 /* for recursive call */ +#define STK_MEM_END_MARK 0x8400 #define STK_POS 0x0500 /* used when POP-POS */ #define STK_STOP_BT 0x0600 /* mark for "(?>...)" */ #define STK_REPEAT 0x0700 #define STK_CALL_FRAME 0x0800 #define STK_RETURN 0x0900 -#define STK_MEM_END_MARK 0x0a00 -#define STK_VOID 0x0b00 /* for fill a blank */ -#define STK_NULL_CHECK_END 0x0c00 /* for recursive call */ +#define STK_VOID 0x0a00 /* for fill a blank */ /* stack type check mask */ -#define STK_MASK_POP_USED 0x00ff -#define IS_TO_VOID_TARGET(stk) \ - (((stk)->type & STK_MASK_POP_USED) || \ - (stk)->type == STK_NULL_CHECK_START || (stk)->type == STK_NULL_CHECK_END) +#define STK_MASK_POP_USED 0x00ff +#define STK_MASK_TO_VOID_TARGET 0x10ff +#define STK_MASK_MEM_END_OR_MARK 0x8000 /* MEM_END or MEM_END_MARK */ typedef struct { void* stack_p; @@ -369,6 +371,10 @@ typedef struct { OnigOptionType options; OnigRegion* region; const UChar* start; /* search start position (for \G: BEGIN_POSITION) */ +#ifdef USE_COMBINATION_EXPLOSION_CHECK + void* state_check_buff; + int state_check_buff_size; +#endif } MatchArg; #define MATCH_ARG_INIT(msa, arg_option, arg_region, arg_start) do {\ @@ -378,7 +384,36 @@ typedef struct { (msa).start = (arg_start);\ } while (0) -#define MATCH_ARG_FREE(msa) if ((msa).stack_p) xfree((msa).stack_p) +#ifdef USE_COMBINATION_EXPLOSION_CHECK + +#define STATE_CHECK_BUFF_MALLOC_THRESHOLD_SIZE 16 + +#define STATE_CHECK_BUFF_INIT(msa, str_len, state_num) do { \ + (msa).state_check_buff = (void* )0;\ + if ((state_num) > 0 && str_len >= STATE_CHECK_STRING_THRESHOLD_LEN) {\ + int size = ((int )((str_len) + 1) * (state_num) + 7) / 8;\ + (msa).state_check_buff_size = size; \ + if (size > 0 && size < STATE_CHECK_BUFF_MAX_SIZE) {\ + if (size >= STATE_CHECK_BUFF_MALLOC_THRESHOLD_SIZE) \ + (msa).state_check_buff = (void* )xmalloc(size);\ + else \ + (msa).state_check_buff = (void* )xalloca(size);\ + xmemset((msa).state_check_buff, 0, (size_t )size);\ + }\ + }\ +} while (0) + +#define MATCH_ARG_FREE(msa) do {\ + if ((msa).stack_p) xfree((msa).stack_p);\ + if ((msa).state_check_buff_size >= STATE_CHECK_BUFF_MALLOC_THRESHOLD_SIZE) { \ + if ((msa).state_check_buff) xfree((msa).state_check_buff);\ + }\ +} while (0); +#else +#define STATE_CHECK_BUFF_INIT(msa, str_len, state_num) +#define MATCH_ARG_FREE(msa) if ((msa).stack_p) xfree((msa).stack_p) +#endif + #define STACK_INIT(alloc_addr, ptr_num, stack_num) do {\ @@ -472,26 +507,88 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, #define STACK_AT(index) (stk_base + (index)) #define GET_STACK_INDEX(stk) ((stk) - stk_base) +#define STACK_PUSH_TYPE(stack_type) do {\ + STACK_ENSURE(1);\ + stk->type = (stack_type);\ + STACK_INC;\ +} while(0) + +#define IS_TO_VOID_TARGET(stk) (((stk)->type & STK_MASK_TO_VOID_TARGET) != 0) + +#ifdef USE_COMBINATION_EXPLOSION_CHECK +#define STATE_CHECK_POS(s,snum) \ + (((s) - str) * num_comb_exp_check + ((snum) - 1)) +#define STATE_CHECK_VAL(v,snum) do {\ + if (state_check_buff != NULL) {\ + int x = STATE_CHECK_POS(s,snum);\ + (v) = state_check_buff[x/8] & (1<<(x%8));\ + }\ + else (v) = 0;\ +} while(0) + + +#define ELSE_IF_STATE_CHECK_MARK(stk) \ + else if ((stk)->type == STK_STATE_CHECK_MARK) { \ + int x = STATE_CHECK_POS(stk->u.state.pstr, stk->u.state.state_check);\ + state_check_buff[x/8] |= (1<<(x%8)); \ + } + #define STACK_PUSH(stack_type,pat,s,sprev) do {\ STACK_ENSURE(1);\ stk->type = (stack_type);\ stk->u.state.pcode = (pat);\ stk->u.state.pstr = (s);\ stk->u.state.pstr_prev = (sprev);\ + stk->u.state.state_check = 0;\ STACK_INC;\ } while(0) #define STACK_PUSH_ENSURED(stack_type,pat) do {\ stk->type = (stack_type);\ stk->u.state.pcode = (pat);\ + stk->u.state.state_check = 0;\ STACK_INC;\ } while(0) -#define STACK_PUSH_TYPE(stack_type) do {\ +#define STACK_PUSH_ALT_WITH_STATE_CHECK(pat,s,sprev,snum) do {\ STACK_ENSURE(1);\ + stk->type = STK_ALT;\ + stk->u.state.pcode = (pat);\ + stk->u.state.pstr = (s);\ + stk->u.state.pstr_prev = (sprev);\ + stk->u.state.state_check = ((state_check_buff != NULL) ? (snum) : 0);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_STATE_CHECK(s,snum) do {\ + if (state_check_buff != NULL) {\ + STACK_ENSURE(1);\ + stk->type = STK_STATE_CHECK_MARK;\ + stk->u.state.pstr = (s);\ + stk->u.state.state_check = (snum);\ + STACK_INC;\ + }\ +} while(0) + +#else /* USE_COMBINATION_EXPLOSION_CHECK */ + +#define ELSE_IF_STATE_CHECK_MARK(stk) + +#define STACK_PUSH(stack_type,pat,s,sprev) do {\ + STACK_ENSURE(1);\ + stk->type = (stack_type);\ + stk->u.state.pcode = (pat);\ + stk->u.state.pstr = (s);\ + stk->u.state.pstr_prev = (sprev);\ + STACK_INC;\ +} while(0) + +#define STACK_PUSH_ENSURED(stack_type,pat) do {\ stk->type = (stack_type);\ + stk->u.state.pcode = (pat);\ STACK_INC;\ } while(0) +#endif /* USE_COMBINATION_EXPLOSION_CHECK */ #define STACK_PUSH_ALT(pat,s,sprev) STACK_PUSH(STK_ALT,pat,s,sprev) #define STACK_PUSH_POS(s,sprev) STACK_PUSH(STK_POS,NULL_UCHARP,s,sprev) @@ -551,7 +648,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, k = stk;\ while (k > stk_base) {\ k--;\ - if ((k->type == STK_MEM_END_MARK || k->type == STK_MEM_END) \ + if ((k->type & STK_MASK_MEM_END_OR_MARK) != 0 \ && k->u.mem.num == (mnum)) {\ level++;\ }\ @@ -631,6 +728,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, stk--;\ STACK_BASE_CHECK(stk, "STACK_POP"); \ if ((stk->type & STK_MASK_POP_USED) != 0) break;\ + ELSE_IF_STATE_CHECK_MARK(stk);\ }\ break;\ case STACK_POP_LEVEL_MEM_START:\ @@ -642,6 +740,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ }\ + ELSE_IF_STATE_CHECK_MARK(stk);\ }\ break;\ default:\ @@ -660,6 +759,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ }\ + ELSE_IF_STATE_CHECK_MARK(stk);\ }\ break;\ }\ @@ -681,6 +781,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ }\ + ELSE_IF_STATE_CHECK_MARK(stk);\ }\ } while(0) @@ -700,6 +801,7 @@ stack_double(StackType** arg_stk_base, StackType** arg_stk_end, mem_start_stk[stk->u.mem.num] = stk->u.mem.start;\ mem_end_stk[stk->u.mem.num] = stk->u.mem.end;\ }\ + ELSE_IF_STATE_CHECK_MARK(stk);\ }\ } while(0) @@ -947,6 +1049,7 @@ static int string_cmp_ic(OnigEncoding enc, int ambig_flag, is_fail = 0; \ } while(0) + #define ON_STR_BEGIN(s) ((s) == str) #define ON_STR_END(s) ((s) == end) #define IS_EMPTY_STR (str == end) @@ -1314,6 +1417,11 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, StackIndex si; StackIndex *repeat_stk; StackIndex *mem_start_stk, *mem_end_stk; +#ifdef USE_COMBINATION_EXPLOSION_CHECK + int scv; + unsigned char* state_check_buff = msa->state_check_buff; + int num_comb_exp_check = reg->num_comb_exp_check; +#endif n = reg->num_repeat + reg->num_mem * 2; STACK_INIT(alloca_base, n, INIT_MATCH_STACK_SIZE); @@ -1924,6 +2032,94 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, STAT_OP_OUT; break; +#ifdef USE_COMBINATION_EXPLOSION_CHECK + case OP_STATE_CHECK_ANYCHAR_STAR: STAT_OP_IN(OP_STATE_CHECK_ANYCHAR_STAR); + GET_STATE_CHECK_NUM_INC(mem, p); + while (s < end) { + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; + + STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem); + n = enc_len(encode, s); + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + sprev = s; + s += n; + } + STAT_OP_OUT; + break; + + case OP_STATE_CHECK_ANYCHAR_ML_STAR: + STAT_OP_IN(OP_STATE_CHECK_ANYCHAR_ML_STAR); + + GET_STATE_CHECK_NUM_INC(mem, p); + while (s < end) { + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; + + STACK_PUSH_ALT_WITH_STATE_CHECK(p, s, sprev, mem); + n = enc_len(encode, s); + if (n > 1) { + DATA_ENSURE(n); + sprev = s; + s += n; + } + else { + sprev = s; + s++; + } + } + STAT_OP_OUT; + break; + + case OP_STATE_CHECK_ANYCHAR_STAR_PEEK_NEXT: + STAT_OP_IN(OP_STATE_CHECK_ANYCHAR_STAR_PEEK_NEXT); + + GET_STATE_CHECK_NUM_INC(mem, p); + while (s < end) { + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; + + if (*p == *s) { + STACK_PUSH_ALT_WITH_STATE_CHECK(p + 1, s, sprev, mem); + } + n = enc_len(encode, s); + DATA_ENSURE(n); + if (ONIGENC_IS_MBC_NEWLINE(encode, s, end)) goto fail; + sprev = s; + s += n; + } + p++; + STAT_OP_OUT; + break; + + case OP_STATE_CHECK_ANYCHAR_ML_STAR_PEEK_NEXT: + STAT_OP_IN(OP_STATE_CHECK_ANYCHAR_ML_STAR_PEEK_NEXT); + + GET_STATE_CHECK_NUM_INC(mem, p); + while (s < end) { + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; + + if (*p == *s) { + STACK_PUSH_ALT_WITH_STATE_CHECK(p + 1, s, sprev, mem); + } + n = enc_len(encode, s); + if (n >1) { + DATA_ENSURE(n); + sprev = s; + s += n; + } + else { + sprev = s; + s++; + } + } + p++; + STAT_OP_OUT; + break; +#endif /* USE_COMBINATION_EXPLOSION_CHECK */ + case OP_WORD: STAT_OP_IN(OP_WORD); DATA_ENSURE(1); if (! ONIGENC_IS_MBC_WORD(encode, s, end)) @@ -2451,6 +2647,43 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, continue; break; +#ifdef USE_COMBINATION_EXPLOSION_CHECK + case OP_STATE_CHECK_PUSH: STAT_OP_IN(OP_STATE_CHECK_PUSH); + GET_STATE_CHECK_NUM_INC(mem, p); + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; + + GET_RELADDR_INC(addr, p); + STACK_PUSH_ALT_WITH_STATE_CHECK(p + addr, s, sprev, mem); + STAT_OP_OUT; + continue; + break; + + case OP_STATE_CHECK_PUSH_OR_JUMP: STAT_OP_IN(OP_STATE_CHECK_PUSH_OR_JUMP); + GET_STATE_CHECK_NUM_INC(mem, p); + GET_RELADDR_INC(addr, p); + STATE_CHECK_VAL(scv, mem); + if (scv) { + p += addr; + } + else { + STACK_PUSH_ALT_WITH_STATE_CHECK(p + addr, s, sprev, mem); + } + STAT_OP_OUT; + continue; + break; + + case OP_STATE_CHECK: STAT_OP_IN(OP_STATE_CHECK); + GET_STATE_CHECK_NUM_INC(mem, p); + STATE_CHECK_VAL(scv, mem); + if (scv) goto fail; + + STACK_PUSH_STATE_CHECK(s, mem); + STAT_OP_OUT; + continue; + break; +#endif /* USE_COMBINATION_EXPLOSION_CHECK */ + case OP_POP: STAT_OP_IN(OP_POP); STACK_POP_ONE; STAT_OP_OUT; @@ -2525,7 +2758,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, repeat_inc: stkp->u.repeat.count++; - if (stkp->u.repeat.count == reg->repeat_range[mem].upper) { + if (stkp->u.repeat.count >= reg->repeat_range[mem].upper) { /* end of repeat. Nothing to do. */ } else if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { @@ -2555,8 +2788,7 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, repeat_inc_ng: stkp->u.repeat.count++; - if (stkp->u.repeat.count < reg->repeat_range[mem].upper || - IS_REPEAT_INFINITE(reg->repeat_range[mem].upper)) { + if (stkp->u.repeat.count < reg->repeat_range[mem].upper) { if (stkp->u.repeat.count >= reg->repeat_range[mem].lower) { UChar* pcode = stkp->u.repeat.pcode; @@ -2685,6 +2917,14 @@ match_at(regex_t* reg, const UChar* str, const UChar* end, const UChar* sstart, p = stk->u.state.pcode; s = stk->u.state.pstr; sprev = stk->u.state.pstr_prev; + +#ifdef USE_COMBINATION_EXPLOSION_CHECK + if (stk->u.state.state_check != 0) { + stk->type = STK_STATE_CHECK_MARK; + stk++; + } +#endif + STAT_OP_OUT; continue; break; @@ -2869,66 +3109,56 @@ bm_search_notrev(regex_t* reg, const UChar* target, const UChar* target_end, const UChar* text, const UChar* text_end, const UChar* text_range) { - const UChar *s, *t, *p, *end; + const UChar *s, *se, *t, *p, *end; const UChar *tail; - int skip; + int skip, tlen1; #ifdef ONIG_DEBUG_SEARCH fprintf(stderr, "bm_search_notrev: text: %d, text_end: %d, text_range: %d\n", (int )text, (int )text_end, (int )text_range); #endif - end = text_range + (target_end - target) - 1; + tlen1 = (target_end - target) - 1; + end = text_range + tlen1; if (end > text_end) end = text_end; tail = target_end - 1; s = text; - while ((s - text) < target_end - target) { - s += enc_len(reg->enc, s); - } - s--; /* set to text check tail position. */ if (IS_NULL(reg->int_map)) { while (s < end) { - p = s; + p = se = s + tlen1; t = tail; - while (t >= target && *p == *t) { - p--; t--; + while (*p == *t && t >= target) { + p--; t--; } - if (t < target) return (UChar* )(p + 1); + if (t < target) return (UChar* )s; - skip = reg->map[*s]; - p = s + 1; - if (p >= text_end) return (UChar* )NULL; - t = p; + skip = reg->map[*se]; + t = s; do { - p += enc_len(reg->enc, p); - } while ((p - t) < skip && p < text_end); - - s += (p - t); + s += enc_len(reg->enc, s); + } while ((s - t) < skip && s < end); } } else { while (s < end) { - p = s; + p = se = s + tlen1; t = tail; - while (t >= target && *p == *t) { - p--; t--; + while (*p == *t && t >= target) { + p--; t--; } - if (t < target) return (UChar* )(p + 1); + if (t < target) return (UChar* )s; - skip = reg->int_map[*s]; - p = s + 1; - if (p >= text_end) return (UChar* )NULL; - t = p; + skip = reg->int_map[*se]; + t = s; do { - p += enc_len(reg->enc, p); - } while ((p - t) < skip && p < text_end); - - s += (p - t); + s += enc_len(reg->enc, s); + } while ((s - t) < skip && s < end); } } + return (UChar* )NULL; } @@ -3083,6 +3313,7 @@ onig_match(regex_t* reg, const UChar* str, const UChar* end, const UChar* at, On #endif /* USE_RECOMPILE_API && USE_MULTI_THREAD_SYSTEM */ MATCH_ARG_INIT(msa, option, region, at); + STATE_CHECK_BUFF_INIT(msa, end - str, reg->num_comb_exp_check); if (region #ifdef USE_POSIX_REGION_OPTION @@ -3343,6 +3574,7 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, int r; UChar *s, *prev; MatchArg msa; + const UChar *orig_start = start; #if defined(USE_RECOMPILE_API) && defined(USE_MULTI_THREAD_SYSTEM) start: @@ -3484,6 +3716,9 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, prev = (UChar* )NULL; MATCH_ARG_INIT(msa, option, region, start); +#ifdef USE_COMBINATION_EXPLOSION_CHECK + msa.state_check_buff = (void* )0; +#endif MATCH_AND_RETURN_CHECK; goto mismatch; } @@ -3495,7 +3730,8 @@ onig_search(regex_t* reg, const UChar* str, const UChar* end, (int )(end - str), (int )(start - str), (int )(range - str)); #endif - MATCH_ARG_INIT(msa, option, region, start); + MATCH_ARG_INIT(msa, option, region, orig_start); + STATE_CHECK_BUFF_INIT(msa, end - str, reg->num_comb_exp_check); s = (UChar* )start; if (range > start) { /* forward search */ diff --git a/ext/mbstring/oniguruma/regint.h b/ext/mbstring/oniguruma/regint.h index 4c4341c616..c9f494e44e 100644 --- a/ext/mbstring/oniguruma/regint.h +++ b/ext/mbstring/oniguruma/regint.h @@ -59,7 +59,7 @@ /* #define USE_UNICODE_FULL_RANGE_CTYPE */ /* --> move to regenc.h */ #define USE_NAMED_GROUP #define USE_SUBEXP_CALL -#define USE_BACKREF_AT_LEVEL +#define USE_COMBINATION_EXPLOSION_CHECK /* (X*)* */ #define USE_INFINITE_REPEAT_MONOMANIAC_MEM_STATUS_CHECK /* /(?:()|())*\2/ */ #define USE_NEWLINE_AT_END_OF_STRING_HAS_EMPTY_LINE /* /\n$/ =~ "\n" */ #define USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR @@ -82,6 +82,7 @@ /* interface to external system */ #ifdef NOT_RUBY /* given from Makefile */ #include "config.h" +#define USE_BACKREF_AT_LEVEL #define USE_CAPTURE_HISTORY #define USE_VARIABLE_META_CHARS #define USE_WORD_BEGIN_END /* "\<": word-begin, "\>": word-end */ @@ -117,6 +118,9 @@ #endif /* else NOT_RUBY */ +#define STATE_CHECK_STRING_THRESHOLD_LEN 7 +#define STATE_CHECK_BUFF_MAX_SIZE 0x08000000 + #define THREAD_PASS_LIMIT_COUNT 8 #define xmemset memset #define xmemcpy memcpy @@ -639,34 +643,35 @@ enum OpCode { OP_FAIL_LOOK_BEHIND_NOT, /* (? */ - OP_RETURN + OP_RETURN, + + OP_STATE_CHECK_PUSH, /* combination explosion check and push */ + OP_STATE_CHECK_PUSH_OR_JUMP, /* check ok -> push, else jump */ + OP_STATE_CHECK, /* check only */ + OP_STATE_CHECK_ANYCHAR_STAR, + OP_STATE_CHECK_ANYCHAR_ML_STAR, + OP_STATE_CHECK_ANYCHAR_STAR_PEEK_NEXT, + OP_STATE_CHECK_ANYCHAR_ML_STAR_PEEK_NEXT }; -/* arguments type */ -#define ARG_SPECIAL -1 -#define ARG_NON 0 -#define ARG_RELADDR 1 -#define ARG_ABSADDR 2 -#define ARG_LENGTH 3 -#define ARG_MEMNUM 4 -#define ARG_OPTION 5 - typedef int RelAddrType; typedef int AbsAddrType; typedef int LengthType; typedef int RepeatNumType; typedef short int MemNumType; +typedef short int StateCheckNumType; typedef void* PointerType; -#define SIZE_OPCODE 1 -#define SIZE_RELADDR sizeof(RelAddrType) -#define SIZE_ABSADDR sizeof(AbsAddrType) -#define SIZE_LENGTH sizeof(LengthType) -#define SIZE_MEMNUM sizeof(MemNumType) -#define SIZE_REPEATNUM sizeof(RepeatNumType) -#define SIZE_OPTION sizeof(OnigOptionType) -#define SIZE_CODE_POINT sizeof(OnigCodePoint) -#define SIZE_POINTER sizeof(PointerType) +#define SIZE_OPCODE 1 +#define SIZE_RELADDR sizeof(RelAddrType) +#define SIZE_ABSADDR sizeof(AbsAddrType) +#define SIZE_LENGTH sizeof(LengthType) +#define SIZE_MEMNUM sizeof(MemNumType) +#define SIZE_STATE_CHECK_NUM sizeof(StateCheckNumType) +#define SIZE_REPEATNUM sizeof(RepeatNumType) +#define SIZE_OPTION sizeof(OnigOptionType) +#define SIZE_CODE_POINT sizeof(OnigCodePoint) +#define SIZE_POINTER sizeof(PointerType) #ifdef PLATFORM_UNALIGNED_WORD_ACCESS @@ -692,6 +697,7 @@ typedef void* PointerType; #define GET_REPEATNUM_INC(num,p) PLATFORM_GET_INC(num, p, RepeatNumType) #define GET_OPTION_INC(option,p) PLATFORM_GET_INC(option, p, OnigOptionType) #define GET_POINTER_INC(ptr,p) PLATFORM_GET_INC(ptr, p, PointerType) +#define GET_STATE_CHECK_NUM_INC(num,p) PLATFORM_GET_INC(num, p, StateCheckNumType) /* code point's address must be aligned address. */ #define GET_CODE_POINT(code,p) code = *((OnigCodePoint* )(p)) @@ -734,6 +740,13 @@ typedef void* PointerType; #define SIZE_OP_CALL (SIZE_OPCODE + SIZE_ABSADDR) #define SIZE_OP_RETURN SIZE_OPCODE +#ifdef USE_COMBINATION_EXPLOSION_CHECK +#define SIZE_OP_STATE_CHECK (SIZE_OPCODE + SIZE_STATE_CHECK_NUM) +#define SIZE_OP_STATE_CHECK_PUSH (SIZE_OPCODE + SIZE_STATE_CHECK_NUM + SIZE_RELADDR) +#define SIZE_OP_STATE_CHECK_PUSH_OR_JUMP (SIZE_OPCODE + SIZE_STATE_CHECK_NUM + SIZE_RELADDR) +#define SIZE_OP_STATE_CHECK_ANYCHAR_STAR (SIZE_OPCODE + SIZE_STATE_CHECK_NUM) +#define SIZE_OP_STATE_CHECK_ANYCHAR_STAR_PEEK_NEXT (SIZE_OPCODE + 1 + SIZE_STATE_CHECK_NUM) +#endif #define MC_ESC(enc) (enc)->meta_char_table.esc #define MC_ANYCHAR(enc) (enc)->meta_char_table.anychar diff --git a/ext/mbstring/oniguruma/regparse.c b/ext/mbstring/oniguruma/regparse.c index d70dbb6c3b..407b73fc44 100644 --- a/ext/mbstring/oniguruma/regparse.c +++ b/ext/mbstring/oniguruma/regparse.c @@ -940,6 +940,13 @@ scan_env_clear(ScanEnv* env) for (i = 0; i < SCANENV_MEMNODES_SIZE; i++) env->mem_nodes_static[i] = NULL_NODE; + +#ifdef USE_COMBINATION_EXPLOSION_CHECK + env->num_comb_exp_check = 0; + env->comb_exp_max_regnum = 0; + env->curr_max_regnum = 0; + env->has_recursion = 0; +#endif } static int @@ -1321,11 +1328,17 @@ node_new_qualifier(int lower, int upper, int by_number) NQUALIFIER(node).lower = lower; NQUALIFIER(node).upper = upper; NQUALIFIER(node).greedy = 1; - NQUALIFIER(node).by_number = by_number; NQUALIFIER(node).target_empty_info = NQ_TARGET_ISNOT_EMPTY; NQUALIFIER(node).head_exact = NULL_NODE; NQUALIFIER(node).next_head_exact = NULL_NODE; NQUALIFIER(node).is_refered = 0; + if (by_number != 0) + NQUALIFIER(node).state |= NST_BY_NUMBER; + +#ifdef USE_COMBINATION_EXPLOSION_CHECK + NQUALIFIER(node).comb_exp_check_num = 0; +#endif + return node; } @@ -2140,7 +2153,7 @@ enum ReduceType { RQ_AQ, /* to '*?' */ RQ_QQ, /* to '??' */ RQ_P_QQ, /* to '+)??' */ - RQ_PQ_Q, /* to '+?)?' */ + RQ_PQ_Q /* to '+?)?' */ }; static enum ReduceType ReduceTypeTable[6][6] = { @@ -4633,16 +4646,14 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) { /* check redundant double repeat. */ /* verbose warn (?:.?)? etc... but not warn (.?)? etc... */ QualifierNode* qnt = &(NQUALIFIER(target)); + int nestq_num = popular_qualifier_num(qn); + int targetq_num = popular_qualifier_num(qnt); #ifdef USE_WARNING_REDUNDANT_NESTED_REPEAT_OPERATOR - if (qn->by_number == 0 && qnt->by_number == 0 && + if (!IS_QUALIFIER_BY_NUMBER(qn) && !IS_QUALIFIER_BY_NUMBER(qnt) && IS_SYNTAX_BV(env->syntax, ONIG_SYN_WARN_REDUNDANT_NESTED_REPEAT)) { - int nestq_num, targetq_num; UChar buf[WARN_BUFSIZE]; - nestq_num = popular_qualifier_num(qn); - targetq_num = popular_qualifier_num(qnt); - switch(ReduceTypeTable[targetq_num][nestq_num]) { case RQ_ASIS: break; @@ -4673,9 +4684,17 @@ set_qualifier(Node* qnode, Node* target, int group, ScanEnv* env) warn_exit: #endif - if (popular_qualifier_num(qnt) >= 0 && popular_qualifier_num(qn) >= 0) { - onig_reduce_nested_qualifier(qnode, target); - goto q_exit; + if (targetq_num >= 0) { + if (nestq_num >= 0) { + onig_reduce_nested_qualifier(qnode, target); + goto q_exit; + } + else if (targetq_num == 1 || targetq_num == 2) { /* * or + */ + /* (?:a*){n,m}, (?:a+){n,m} => (?:a*){n,n}, (?:a+){n,n} */ + if (! IS_REPEAT_INFINITE(qn->upper) && qn->upper > 1 && qn->greedy) { + qn->upper = (qn->lower == 0 ? 1 : qn->lower); + } + } } } break; diff --git a/ext/mbstring/oniguruma/regparse.h b/ext/mbstring/oniguruma/regparse.h index 0958c909bf..ca62dddf7e 100644 --- a/ext/mbstring/oniguruma/regparse.h +++ b/ext/mbstring/oniguruma/regparse.h @@ -124,11 +124,13 @@ typedef struct { int lower; int upper; int greedy; - int by_number; /* {n,m} */ int target_empty_info; struct _Node* head_exact; struct _Node* next_head_exact; int is_refered; /* include called node. don't eliminate even if {0} */ +#ifdef USE_COMBINATION_EXPLOSION_CHECK + int comb_exp_check_num; /* 1,2,3...: check, 0: no check */ +#endif } QualifierNode; /* status bits */ @@ -146,6 +148,7 @@ typedef struct { #define NST_NAME_REF (1<<11) #define NST_IN_REPEAT (1<<12) /* STK_REPEAT is nested in stack. */ #define NST_NEST_LEVEL (1<<13) +#define NST_BY_NUMBER (1<<14) /* {n,m} */ #define SET_EFFECT_STATUS(node,f) (node)->u.effect.state |= (f) #define CLEAR_EFFECT_STATUS(node,f) (node)->u.effect.state &= ~(f) @@ -168,6 +171,7 @@ typedef struct { #define IS_BACKREF_NAME_REF(bn) (((bn)->state & NST_NAME_REF) != 0) #define IS_BACKREF_NEST_LEVEL(bn) (((bn)->state & NST_NEST_LEVEL) != 0) #define IS_QUALIFIER_IN_REPEAT(qn) (((qn)->state & NST_IN_REPEAT) != 0) +#define IS_QUALIFIER_BY_NUMBER(qn) (((qn)->state & NST_BY_NUMBER) != 0) typedef struct { int state; @@ -277,6 +281,12 @@ typedef struct { int mem_alloc; Node* mem_nodes_static[SCANENV_MEMNODES_SIZE]; Node** mem_nodes_dynamic; +#ifdef USE_COMBINATION_EXPLOSION_CHECK + int num_comb_exp_check; + int comb_exp_max_regnum; + int curr_max_regnum; + int has_recursion; +#endif } ScanEnv; diff --git a/ext/mbstring/oniguruma/regposix.c b/ext/mbstring/oniguruma/regposix.c index fa7b5c4b24..a3bacf722e 100644 --- a/ext/mbstring/oniguruma/regposix.c +++ b/ext/mbstring/oniguruma/regposix.c @@ -2,7 +2,7 @@ regposix.c - Oniguruma (regular expression library) **********************************************************************/ /*- - * Copyright (c) 2002-2005 K.Kosako + * Copyright (c) 2002-2006 K.Kosako * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -192,7 +192,7 @@ regexec(regex_t* reg, const char* str, size_t nmatch, ENC_STRING_LEN(ONIG_C(reg)->enc, str, len); end = (UChar* )(str + len); r = onig_search(ONIG_C(reg), (UChar* )str, end, (UChar* )str, end, - (OnigRegion* )pmatch, options); + (OnigRegion* )pm, options); if (r >= 0) { r = 0; /* Match */ @@ -212,6 +212,11 @@ regexec(regex_t* reg, const char* str, size_t nmatch, if (pm != pmatch && pm != NULL) xfree(pm); +#if 0 + if (reg->re_nsub > nmatch - 1) + reg->re_nsub = (nmatch <= 1 ? 0 : nmatch - 1); +#endif + return r; }