From: Justin Erenkrantz Date: Thu, 6 Sep 2001 23:58:29 +0000 (+0000) Subject: Make find_start_sequence use the BNDM search algorithm. We handle X-Git-Tag: 2.0.26~303 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=e0b1ef661777382ef8344729bce10fa4b4b0c142;p=apache Make find_start_sequence use the BNDM search algorithm. We handle edge cases via the old slow mechanism. Previously, find_start_sequence would be responsible for ~25% of the usr CPU time in tests (as performed by Ian). No more. git-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@90940 13f79535-47bb-0310-9956-ffa450edef68 --- diff --git a/CHANGES b/CHANGES index 685a96dc56..7180816b91 100644 --- a/CHANGES +++ b/CHANGES @@ -1,5 +1,8 @@ Changes with Apache 2.0.26-dev + *) Rewrite find_start_sequence to use a better search algorithm + to find the start tag. [Justin Erenkrantz] + *) Fix a seg fault in mod_include. When we are generating an internal redirect, we must set r->uri to "", not a bogus string, and not NULL. [Ryan Bloom] diff --git a/modules/filters/mod_include.c b/modules/filters/mod_include.c index be44b12f99..3b67a9332b 100644 --- a/modules/filters/mod_include.c +++ b/modules/filters/mod_include.c @@ -206,6 +206,113 @@ static const char *get_include_var(request_rec *r, const char *var) /* --------------------------- Parser functions --------------------------- */ +/* This is an implementation of the BNDM search algorithm. + * + * Fast and Flexible String Matching by Combining Bit-parallelism and + * Suffix Automata (2001) + * Gonzalo Navarro, Mathieu Raffinot + * + * http://www-igm.univ-mlv.fr/~raffinot/ftp/jea2001.ps.gz + * + * Initial code submitted by Sascha Schumann. + */ + +typedef struct { + unsigned int T[256]; + unsigned int x; +} bndm_t; + +/* This is the pattern matcher that holds the STARTING_SEQUENCE bndm_t + * structure. + */ +static bndm_t start_seq_pat; + +/* Precompile the bndm_t data structure. */ +static void bndm_compile(bndm_t *t, const char *n, apr_size_t nl) +{ + unsigned int x; + const char *ne = n + nl; + + memset(t->T, 0, sizeof(unsigned int) * 256); + + for (x = 1; n < ne; x <<= 1) + t->T[(unsigned char) *n++] |= x; + + t->x = x - 1; +} + +/* Implements the BNDM search algorithm (as described above). + * + * n - the pattern to search for + * nl - length of the pattern to search for + * h - the string to look in + * hl - length of the string to look for + * t - precompiled bndm structure against the pattern + * + * Returns the count of character that is the first match or hl if no + * match is found. + */ +static int bndm(const char *n, apr_size_t nl, const char *h, apr_size_t hl, + bndm_t *t) +{ + apr_size_t skip; + const char *he, *p, *pi; + unsigned int *T, x, d; + + he = h + hl; + + T = t->T; + x = t->x << 1; + + pi = h - 1; /* pi: p initial */ + p = pi + nl; /* compare window right to left. point to the first char */ + + while (p < he) { + skip = nl; + d = x; + do { + d = (d >> 1) & T[(unsigned char) *p--]; + if ((d & 1)) { + if (p != pi) + skip = p - pi; + else + return p - h + 1; + } + } while (d > 1); + p = (pi += skip) + nl; + } + + return hl; +} + +/* We've now found a start sequence tag... */ +static apr_bucket* found_start_sequence(apr_bucket *dptr, + include_ctx_t *ctx, + int tagStart) +{ + /* We want to split the bucket at the '<'. */ + ctx->state = PARSE_DIRECTIVE; + ctx->tag_length = 0; + ctx->parse_pos = 0; + ctx->tag_start_bucket = dptr; + ctx->tag_start_index = tagStart; + if (ctx->head_start_index > 0) { + apr_bucket *tmp_bkt; + + /* Split the bucket with the start of the tag in it */ + apr_bucket_split(ctx->head_start_bucket, ctx->head_start_index); + tmp_bkt = APR_BUCKET_NEXT(ctx->head_start_bucket); + /* If it was a one bucket match */ + if (dptr == ctx->head_start_bucket) { + ctx->tag_start_bucket = tmp_bkt; + ctx->tag_start_index = tagStart - ctx->head_start_index; + } + ctx->head_start_bucket = tmp_bkt; + ctx->head_start_index = 0; + } + return ctx->head_start_bucket; +} + /* This function returns either a pointer to the split bucket containing the * first byte of the BEGINNING_SEQUENCE (after finding a complete match) or it * returns NULL if no match found. @@ -217,8 +324,8 @@ static apr_bucket *find_start_sequence(apr_bucket *dptr, include_ctx_t *ctx, const char *c; const char *buf; const char *str = STARTING_SEQUENCE; - apr_bucket *tmp_bkt; - apr_size_t start_index; + int slen = sizeof(STARTING_SEQUENCE) - 1; + int pos; *do_cleanup = 0; @@ -269,8 +376,53 @@ static apr_bucket *find_start_sequence(apr_bucket *dptr, include_ctx_t *ctx, if (len == 0) { /* end of pipe? */ break; } + + /* Set our buffer to use. */ c = buf; - while (c < buf + len) { + + /* The last bucket had a left over partial match that we need to + * complete. + */ + if (ctx->state == PARSE_HEAD) + { + apr_size_t tmpLen; + tmpLen = (len > (slen - 1)) ? len : (slen - 1); + + while (c < buf + tmpLen && *c == str[ctx->parse_pos]) + { + c++; + ctx->parse_pos++; + } + + if (str[ctx->parse_pos] == '\0') + { + ctx->bytes_parsed += c - buf; + return found_start_sequence(dptr, ctx, c - buf); + } + + /* False alarm... */ + ctx->state = PRE_HEAD; + } + + if (len) + { + pos = bndm(str, slen, c, len, &start_seq_pat); + if (pos != len) + { + ctx->head_start_bucket = dptr; + ctx->head_start_index = pos; + ctx->bytes_parsed += pos + slen; + return found_start_sequence(dptr, ctx, pos + slen); + } + } + + /* Consider the case where we have