]> granicus.if.org Git - php/commitdiff
Reimplement strtr()
authorDmitry Stogov <dmitry@zend.com>
Thu, 24 Apr 2014 15:14:29 +0000 (19:14 +0400)
committerDmitry Stogov <dmitry@zend.com>
Thu, 24 Apr 2014 15:14:29 +0000 (19:14 +0400)
Zend/zend_hash.c
Zend/zend_hash.h
ext/standard/string.c

index ffe71a679125ee66a2b2458e82cec440953fa23e..bc882dd8dbfa26b07d16841bec21fddbea35798e 100644 (file)
@@ -313,6 +313,15 @@ ZEND_API zval *_zend_hash_str_add_or_update(HashTable *ht, const char *str, int
        return ret;
 }
 
+ZEND_API zval *zend_hash_index_add_empty_element(HashTable *ht, ulong h)
+{
+       
+       zval dummy;
+
+       ZVAL_NULL(&dummy);
+       return zend_hash_index_add(ht, h, &dummy);
+}
+
 ZEND_API zval *zend_hash_add_empty_element(HashTable *ht, zend_string *key)
 {
        
index 829348d388d4284558c222a3a0fd0e891ff772cc..54575c23f1f7240306b16e7b5cbfc2f8ea87cfec 100644 (file)
@@ -96,6 +96,7 @@ ZEND_API zval *_zend_hash_index_update_or_next_insert(HashTable *ht, ulong h, zv
 #define zend_hash_next_index_insert(ht, pData) \
                _zend_hash_index_update_or_next_insert(ht, 0, pData, HASH_NEXT_INSERT ZEND_FILE_LINE_CC)
 
+ZEND_API zval *zend_hash_index_add_empty_element(HashTable *ht, ulong h);
 ZEND_API zval *zend_hash_add_empty_element(HashTable *ht, zend_string *key);
 ZEND_API zval *zend_hash_str_add_empty_element(HashTable *ht, const char *key, int len);
 
@@ -569,6 +570,10 @@ static inline void *zend_hash_get_current_data_ptr_ex(HashTable *ht, HashPositio
        ZEND_HASH_FOREACH(ht, 0); \
        _ptr = Z_PTR_P(_z);
 
+#define ZEND_HASH_FOREACH_NUM_KEY(ht, _h) \
+       ZEND_HASH_FOREACH(ht, 0); \
+       _h = _p->h;
+
 #define ZEND_HASH_FOREACH_STR_KEY(ht, _key) \
        ZEND_HASH_FOREACH(ht, 0); \
        _key = _p->key;
index a960cd1a5c81ff4c2be76a0e3c02359ac8e04ad6..3315b98ee703030d5deea419db51f33a84a114c8 100644 (file)
@@ -2760,320 +2760,174 @@ PHPAPI char *php_strtr(char *str, int len, char *str_from, char *str_to, int trl
 }
 /* }}} */
 
-/* {{{ Definitions for php_strtr_array */
-typedef size_t STRLEN; /* STRLEN should be unsigned */
-typedef uint16_t HASH;
-typedef struct {
-       HASH                    table_mask;
-       STRLEN                  entries[1];
-} SHIFT_TAB;
-typedef struct {
-       HASH                    table_mask;
-       int                             entries[1];
-} HASH_TAB;
-typedef struct {
-       const char      *s;
-       STRLEN          l;
-} STR;
-typedef struct _pat_and_repl {
-       STR                     pat;
-       STR                     repl;
-} PATNREPL;
-
-#define S(a) ((a)->s)
-#define L(a) ((a)->l)
-
-#define SHIFT_TAB_BITS 13
-#define HASH_TAB_BITS  10 /* should be less than sizeof(HASH) * 8 */
-#define SHIFT_TAB_SIZE (1U << SHIFT_TAB_BITS)
-#define HASH_TAB_SIZE  (1U << HASH_TAB_BITS)
-
-typedef struct {
-       int                             B;                      /* size of suffixes */
-       int                             Bp;                     /* size of prefixes */
-       STRLEN                  m;                      /* minimum pattern length */
-       int                             patnum;         /* number of patterns */
-       SHIFT_TAB               *shift;         /* table mapping hash to allowed shift */
-       HASH_TAB                *hash;          /* table mapping hash to int (pair of pointers) */
-       HASH                    *prefix;        /* array of hashes of prefixes by pattern suffix hash order */
-       PATNREPL                *patterns;      /* array of prefixes by pattern suffix hash order */
-} PPRES;
-/* }}} */
-
-/* {{{ php_strtr_hash */
-static inline HASH php_strtr_hash(const char *str, int len)
-{
-       HASH    res = 0;
-       int             i;
-       for (i = 0; i < len; i++) {
-               res = res * 33 + (unsigned char)str[i];
-       }
-
-       return res;
-}
-/* }}} */
-/* {{{ php_strtr_populate_shift */
-static inline void php_strtr_populate_shift(PATNREPL *patterns, int patnum, int B, STRLEN m, SHIFT_TAB *shift)
+static int php_strtr_key_compare(const void *a, const void *b TSRMLS_DC) /* {{{ */
 {
-       int             i;
-       STRLEN  j,
-                       max_shift;
-
-       max_shift = m - B + 1;
-       for (i = 0; i < SHIFT_TAB_SIZE; i++) {
-               shift->entries[i] = max_shift;
-       }
-       for (i = 0; i < patnum; i++) {
-               for (j = 0; j < m - B + 1; j++) {
-                       HASH h = php_strtr_hash(&S(&patterns[i].pat)[j], B) & shift->table_mask;
-                       assert((long long) m - (long long) j - B >= 0);
-                       shift->entries[h] = MIN(shift->entries[h], m - j - B);
-               }
-       }
-}
-/* }}} */
-/* {{{ php_strtr_compare_hash_suffix */
-static int php_strtr_compare_hash_suffix(const void *a, const void *b TSRMLS_DC, void *ctx_g)
-{
-       const PPRES             *res = ctx_g;
-       const PATNREPL  *pnr_a = a,
-                                       *pnr_b = b;
-       HASH                    hash_a = php_strtr_hash(&S(&pnr_a->pat)[res->m - res->B], res->B)
-                                                               & res->hash->table_mask,
-                                       hash_b = php_strtr_hash(&S(&pnr_b->pat)[res->m - res->B], res->B)
-                                                               & res->hash->table_mask;
-       /* TODO: don't recalculate the hashes all the time */
-       if (hash_a > hash_b) {
-               return 1;
-       } else if (hash_a < hash_b) {
-               return -1;
-       } else {
-               /* longer patterns must be sorted first */
-               if (L(&pnr_a->pat) > L(&pnr_b->pat)) {
-                       return -1;
-               } else if (L(&pnr_a->pat) < L(&pnr_b->pat)) {
-                       return 1;
-               } else {
-                       return 0;
-               }
-       }
-}
-/* }}} */
-/* {{{ php_strtr_free_strp */
-static void php_strtr_free_strp(void *strp)
-{
-       STR_RELEASE(*(zend_string **)strp);
-}
-/* }}} */
-/* {{{ php_strtr_array_prepare_repls */
-static PATNREPL *php_strtr_array_prepare_repls(int slen, HashTable *pats, zend_llist **allocs, int *outsize)
-{
-       PATNREPL                *patterns;
-       zval                    *entry;
-       int                             num_pats = zend_hash_num_elements(pats),
-                                       i = 0;
-       zend_string        *string_key;
-       ulong           num_key;
-
-       patterns = safe_emalloc(num_pats, sizeof(*patterns), 0);
-       *allocs = emalloc(sizeof **allocs);
-       zend_llist_init(*allocs, sizeof(zend_string*), &php_strtr_free_strp, 0);
-
-       ZEND_HASH_FOREACH_KEY_VAL(pats, num_key, string_key, entry) {
-               zval    tzv;
-
-               if (!string_key) {
-                       char buf[MAX_LENGTH_OF_LONG];
-                       int len = snprintf(buf, sizeof(buf), "%ld", num_key);
-                       string_key = STR_INIT(buf, len, 0);
-                       zend_llist_add_element(*allocs, &string_key);
-               }
-               if (string_key->len == 0) { /* empty string given as pattern */
-                       efree(patterns);
-                       zend_llist_destroy(*allocs);
-                       efree(*allocs);
-                       *allocs = NULL;
-                       return NULL;
-               }
-               if (string_key->len > slen) { /* this pattern can never match */
-                       continue;
-               }
-
-               if (Z_TYPE_P(entry) != IS_STRING) {
-                       ZVAL_DUP(&tzv, entry);
-                       convert_to_string(&tzv);
-                       entry = &tzv;
-                       zend_llist_add_element(*allocs, &Z_STR_P(entry));
-               }
+       Bucket *f = (Bucket *) a;
+       Bucket *s = (Bucket *) b;
 
-               S(&patterns[i].pat) = string_key->val;
-               L(&patterns[i].pat) = string_key->len;
-               S(&patterns[i].repl) = Z_STRVAL_P(entry);
-               L(&patterns[i].repl) = Z_STRLEN_P(entry);
-               i++;
-       } ZEND_HASH_FOREACH_END();
-
-       *outsize = i;
-       return patterns;
+       return f->h > s->h ? -1 : 1;
 }
 /* }}} */
 
-/* {{{ PPRES *php_strtr_array_prepare(STR *text, PATNREPL *patterns, int patnum, int B, int Bp) */
-static PPRES *php_strtr_array_prepare(STR *text, PATNREPL *patterns, int patnum, int B, int Bp)
+/* {{{ php_strtr_array */
+static void php_strtr_array(zval *return_value, char *str, int slen, HashTable *pats TSRMLS_DC)
 {
-       int             i;
-       PPRES   *res = emalloc(sizeof *res);
-
-       res->m = (STRLEN)-1;
-       for (i = 0; i < patnum; i++) {
-               if (L(&patterns[i].pat) < res->m) {
-                       res->m = L(&patterns[i].pat);
-               }
-       }
-       assert(res->m > 0);
-       res->B  = B             = MIN(B, res->m);
-       res->Bp = Bp    = MIN(Bp, res->m);
-
-       res->shift = safe_emalloc(SHIFT_TAB_SIZE, sizeof(*res->shift->entries), sizeof(*res->shift));
-       res->shift->table_mask = SHIFT_TAB_SIZE - 1;
-       php_strtr_populate_shift(patterns, patnum, B, res->m, res->shift);
-
-       res->hash = safe_emalloc(HASH_TAB_SIZE, sizeof(*res->hash->entries), sizeof(*res->hash));
-       res->hash->table_mask = HASH_TAB_SIZE - 1;
-
-       res->patterns = safe_emalloc(patnum, sizeof(*res->patterns), 0);
-       memcpy(res->patterns, patterns, sizeof(*patterns) * patnum);
-#ifdef ZTS
-       zend_qsort_r(res->patterns, patnum, sizeof(*res->patterns),
-                       php_strtr_compare_hash_suffix, res, NULL); /* tsrmls not needed */
-#else
-       zend_qsort_r(res->patterns, patnum, sizeof(*res->patterns),
-                       php_strtr_compare_hash_suffix, res);
-#endif
-
-       res->prefix = safe_emalloc(patnum, sizeof(*res->prefix), 0);
-       for (i = 0; i < patnum; i++) {
-               res->prefix[i] = php_strtr_hash(S(&res->patterns[i].pat), Bp);
-       }
-
-       /* Initialize the rest of ->hash */
-       for (i = 0; i < HASH_TAB_SIZE; i++) {
-               res->hash->entries[i] = -1;
-       }
-       {
-               HASH last_h = -1; /* assumes not all bits are used in res->hash */
-               /* res->patterns is already ordered by hash.
-                * Make res->hash->entries[h] de index of the first pattern in
-                * res->patterns that has hash h */
-               for (i = 0; i < patnum; i++) {
-                       HASH h = php_strtr_hash(&S(&res->patterns[i].pat)[res->m - res->B], res->B)
-                                               & res->hash->table_mask;
-                       if (h != last_h) {
-                               res->hash->entries[h] = i;
-                               last_h = h;
+       ulong num_key;
+       zend_string *str_key;
+       int len, pos, found;
+       int num_keys = 0;
+       int minlen = 128*1024;
+       int maxlen = 0;
+       HashTable str_hash, num_hash;
+       zval *entry, tmp, dummy;
+       char *key;
+       smart_str result = {0};
+
+       /* we will collect all possible key lenghts */
+       ZVAL_NULL(&dummy);
+       zend_hash_init(&num_hash, 8, NULL, NULL, 0);
+
+       /* check if original array has numeric keys */
+       ZEND_HASH_FOREACH_KEY(pats, num_key, str_key) {
+               if (UNEXPECTED(!str_key)) {
+                       num_keys = 1;
+               } else {
+                       len = str_key->len;
+                       if (UNEXPECTED(len < 1)) {
+                               RETURN_FALSE;
+                       } else if (UNEXPECTED(len > slen)) {
+                               /* skip long patterns */
+                               continue;
                        }
+                       if (len > maxlen) {
+                               maxlen = len;
+                       }
+                       if (len < minlen) {
+                               minlen = len;
+                       }
+                       /* remember possible key lenght */
+                       zend_hash_index_add(&num_hash, len, &dummy);
                }
-       }
-       res->hash->entries[HASH_TAB_SIZE] = patnum; /* OK, we effectively allocated SIZE+1 */
-       for (i = HASH_TAB_SIZE - 1; i >= 0; i--) {
-               if (res->hash->entries[i] == -1) {
-                       res->hash->entries[i] = res->hash->entries[i + 1];
-               }
-       }
-
-       res->patnum     = patnum;
-
-       return res;
-}
-/* }}} */
-/* {{{ php_strtr_array_destroy_ppres(PPRES *d) */
-static void php_strtr_array_destroy_ppres(PPRES *d)
-{
-       efree(d->shift);
-       efree(d->hash);
-       efree(d->prefix);
-       efree(d->patterns);
-       efree(d);
-}
-/* }}} */
-
-/* {{{ php_strtr_array_do_repl(STR *text, PPRES *d, zval *return_value) */
-static void php_strtr_array_do_repl(STR *text, PPRES *d, zval *return_value TSRMLS_DC)
-{
-       STRLEN          pos = 0,
-                               nextwpos = 0,
-                               lastpos = L(text) - d->m;
-       smart_str       result = {0};
-
-       while (pos <= lastpos) {
-               HASH    h               = php_strtr_hash(&S(text)[pos + d->m - d->B], d->B) & d->shift->table_mask;
-               STRLEN  shift   = d->shift->entries[h];
-
-               if (shift > 0) {
-                       pos += shift;
-               } else {
-                       HASH    h2                              = h & d->hash->table_mask,
-                                       prefix_h                = php_strtr_hash(&S(text)[pos], d->Bp);
-
-                       int             offset_start    = d->hash->entries[h2],
-                                       offset_end              = d->hash->entries[h2 + 1], /* exclusive */
-                                       i                               = 0;
+       } ZEND_HASH_FOREACH_END();
 
-                       for (i = offset_start; i < offset_end; i++) {
-                               PATNREPL *pnr;
-                               if (d->prefix[i] != prefix_h)
+       if (num_keys) {
+               /* we have to rebuild HashTable with numeric keys */
+               zend_hash_init(&str_hash, zend_hash_num_elements(pats), NULL, NULL, 0);
+               ZEND_HASH_FOREACH_KEY_VAL(pats, num_key, str_key, entry) {
+                       if (UNEXPECTED(!str_key)) {
+                               ZVAL_LONG(&tmp, num_key);
+                               convert_to_string(&tmp);
+                               str_key = Z_STR(tmp);
+                               len = str_key->len;
+                               if (UNEXPECTED(len > slen)) {
+                                       /* skip long patterns */
+                                       zval_dtor(&tmp);
                                        continue;
-
-                               pnr = &d->patterns[i];
-                               if (L(&pnr->pat) > L(text) - pos ||
-                                               memcmp(S(&pnr->pat), &S(text)[pos], L(&pnr->pat)) != 0)
+                               }
+                               if (len > maxlen) {
+                                       maxlen = len;
+                               }
+                               if (len < minlen) {
+                                       minlen = len;
+                               }
+                               /* remember possible key lenght */
+                               zend_hash_index_add(&num_hash, len, &dummy);
+                       } else {
+                               len = str_key->len;
+                               if (UNEXPECTED(len > slen)) {
+                                       /* skip long patterns */
                                        continue;
-
-                               smart_str_appendl(&result, &S(text)[nextwpos], pos - nextwpos);
-                               smart_str_appendl(&result, S(&pnr->repl), L(&pnr->repl));
-                               pos += L(&pnr->pat);
-                               nextwpos = pos;
-                               goto end_outer_loop;
+                               }
                        }
-
-                       pos++;
-end_outer_loop: ;
-               }
+                       zend_hash_add(&str_hash, str_key, entry);
+                       if (str_key == Z_STR(tmp)) {
+                               zval_dtor(&tmp);
+                       }
+               } ZEND_HASH_FOREACH_END();
+               pats = &str_hash;
        }
 
-       smart_str_appendl(&result, &S(text)[nextwpos], L(text) - nextwpos);
-
-       if (result.s) {
-               smart_str_0(&result);
-               RETURN_STR(result.s);
+       if (UNEXPECTED(minlen > maxlen)) {
+               /* return the original string */
+               if (pats == &str_hash) {
+                       zend_hash_destroy(&str_hash);
+               }
+               zend_hash_destroy(&num_hash);
+               RETURN_STRINGL(str, slen);
+       }
+       /* select smart or simple algorithm */
+       // TODO: tune the condition ???
+       len = zend_hash_num_elements(&num_hash);
+       if ((maxlen - (minlen - 1) - len > 0) &&
+               /* smart algorithm, sort key lengths first */
+               zend_hash_sort(&num_hash, zend_qsort, php_strtr_key_compare, 0 TSRMLS_CC) == SUCCESS) {
+
+               pos = 0;
+               while (pos <= slen - minlen) {
+                       found = 0;
+                       key = str + pos;
+                       ZEND_HASH_FOREACH_NUM_KEY(&num_hash, len) {
+                               if (len > slen - pos) continue;
+                               entry = zend_hash_str_find(pats, key, len);
+                               if (entry != NULL) {
+                                       if (UNEXPECTED(Z_TYPE_P(entry) != IS_STRING)) {
+                                               ZVAL_DUP(&tmp, entry);
+                                               convert_to_string(&tmp);
+                                               entry = &tmp;
+                                       }
+                                       smart_str_appendl(&result, Z_STRVAL_P(entry), Z_STRLEN_P(entry));
+                                       pos += len;
+                                       if (entry == &tmp) {
+                                               zval_dtor(&tmp);
+                                       }
+                                       found = 1;
+                                       break;
+                               } 
+                       } ZEND_HASH_FOREACH_END();
+                       if (!found) {
+                               smart_str_appendc(&result, str[pos++]);
+                       }
+               }
+               smart_str_appendl(&result, str + pos, slen - pos);
        } else {
-               RETURN_EMPTY_STRING();
+               /* use simple algorithm */
+               pos = 0;
+               while (pos <= slen - minlen) {
+                       if (maxlen > slen - pos) {
+                               maxlen = slen - pos;
+                       }
+                       found = 0;
+                       key = str + pos;
+                       for (len = maxlen; len >= minlen; len--) {
+                               entry = zend_hash_str_find(pats, key, len);
+                               if (entry != NULL) {
+                                       if (UNEXPECTED(Z_TYPE_P(entry) != IS_STRING)) {
+                                               ZVAL_DUP(&tmp, entry);
+                                               convert_to_string(&tmp);
+                                               entry = &tmp;
+                                       }
+                                       smart_str_appendl(&result, Z_STRVAL_P(entry), Z_STRLEN_P(entry));
+                                       pos += len;
+                                       if (entry == &tmp) {
+                                               zval_dtor(&tmp);
+                                       }
+                                       found = 1;
+                                       break;
+                               } 
+                       }
+                       if (!found) {
+                               smart_str_appendc(&result, str[pos++]);
+                       }
+               }
+               smart_str_appendl(&result, str + pos, slen - pos);
        }
-}
-/* }}} */
 
-/* {{{ php_strtr_array */
-static void php_strtr_array(zval *return_value, char *str, int slen, HashTable *pats TSRMLS_DC)
-{
-       PPRES           *data;
-       STR                     text;
-       PATNREPL        *patterns;
-       int                     patterns_len;
-       zend_llist      *allocs;
-
-       S(&text) = str;
-       L(&text) = slen;
-
-       patterns = php_strtr_array_prepare_repls(slen, pats, &allocs, &patterns_len);
-       if (patterns == NULL) {
-               RETURN_FALSE;
+       if (pats == &str_hash) {
+               zend_hash_destroy(&str_hash);
        }
-       data = php_strtr_array_prepare(&text, patterns, patterns_len, 2, 2);
-       efree(patterns);
-       php_strtr_array_do_repl(&text, data, return_value TSRMLS_CC);
-       php_strtr_array_destroy_ppres(data);
-       zend_llist_destroy(allocs);
-       efree(allocs);
+       zend_hash_destroy(&num_hash);
+       smart_str_0(&result);
+       RETURN_STR(result.s);
 }
 /* }}} */