Flatten strip_tags state machine (GCC overoptimized this function, producing enormous...
author Dmitry Stogov <dmitry@zend.com>
Mon, 16 Jul 2018 10:22:17 +0000 (13:22 +0300)
committer Dmitry Stogov <dmitry@zend.com>
Mon, 16 Jul 2018 10:22:17 +0000 (13:22 +0300)
ext/standard/string.c

index 91f60219f750f07ee8e4b4aa47e98a56769fb8e4..792bd914e4dac2a5248af68b057dff7daa4b1106 100644 (file)
@@ -5002,19 +5002,16 @@ PHPAPI size_t php_strip_tags(char *rbuf, size_t len, uint8_t *stateptr, const ch
 PHPAPI size_t php_strip_tags_ex(char *rbuf, size_t len, uint8_t *stateptr, const char *allow, size_t allow_len, zend_bool allow_tag_spaces)
 {
        char *tbuf, *tp, *rp, c, lc;
-       const char *buf, *p;
+       const char *buf, *p, *end;
        int br, depth=0, in_q = 0;
        uint8_t state = 0;
-       size_t pos, i = 0;
+       size_t pos;
        char *allow_free = NULL;
        const char *allow_actual;
        char is_xml = 0;
 
-       if (stateptr)
-               state = *stateptr;
-
        buf = estrndup(rbuf, len);
-       c = *buf;
+       end = buf + len;
        lc = '\0';
        p = buf;
        rp = rbuf;
@@ -5028,237 +5025,294 @@ PHPAPI size_t php_strip_tags_ex(char *rbuf, size_t len, uint8_t *stateptr, const
                tbuf = tp = NULL;
        }
 
-       while (i < len) {
-               switch (c) {
-                       case '\0':
+       if (stateptr) {
+               state = *stateptr;
+               switch (state) {
+                       case 1: goto state_1;
+                       case 2: goto state_2;
+                       case 3: goto state_3;
+                       case 4: goto state_4;
+                       default:
                                break;
-                       case '<':
-                               if (in_q) {
-                                       break;
-                               }
-                               if (isspace(*(p + 1)) && !allow_tag_spaces) {
-                                       goto reg_char;
-                               }
-                               if (state == 0) {
-                                       lc = '<';
-                                       state = 1;
-                                       if (allow) {
-                                               if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
-                                                       pos = tp - tbuf;
-                                                       tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
-                                                       tp = tbuf + pos;
-                                               }
-                                               *(tp++) = '<';
-                                       }
-                               } else if (state == 1) {
-                                       depth++;
+               }
+       }
+
+state_0:
+       if (p >= end) {
+               goto finish;
+       }
+       c = *p;
+       switch (c) {
+               case '\0':
+                       break;
+               case '<':
+                       if (in_q) {
+                               break;
+                       }
+                       if (isspace(*(p + 1)) && !allow_tag_spaces) {
+                               *(rp++) = c;
+                               break;
+                       }
+                       lc = '<';
+                       state = 1;
+                       if (allow) {
+                               if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
+                                       pos = tp - tbuf;
+                                       tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
+                                       tp = tbuf + pos;
                                }
+                               *(tp++) = '<';
+                       }
+                       p++;
+                       goto state_1;
+               case '>':
+                       if (depth) {
+                               depth--;
                                break;
+                       }
 
-                       case '(':
-                               if (state == 2) {
-                                       if (lc != '"' && lc != '\'') {
-                                               lc = '(';
-                                               br++;
-                                       }
-                               } else if (allow && state == 1) {
-                                       if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
-                                               pos = tp - tbuf;
-                                               tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
-                                               tp = tbuf + pos;
-                                       }
-                                       *(tp++) = c;
-                               } else if (state == 0) {
-                                       *(rp++) = c;
-                               }
+                       if (in_q) {
                                break;
+                       }
 
-                       case ')':
-                               if (state == 2) {
-                                       if (lc != '"' && lc != '\'') {
-                                               lc = ')';
-                                               br--;
-                                       }
-                               } else if (allow && state == 1) {
-                                       if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
-                                               pos = tp - tbuf;
-                                               tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
-                                               tp = tbuf + pos;
-                                       }
-                                       *(tp++) = c;
-                               } else if (state == 0) {
-                                       *(rp++) = c;
-                               }
+                       *(rp++) = c;
+                       break;
+               default:
+                       *(rp++) = c;
+                       break;
+       }
+       p++;
+       goto state_0;
+
+state_1:
+       if (p >= end) {
+               goto finish;
+       }
+       c = *p;
+       switch (c) {
+               case '\0':
+                       break;
+               case '<':
+                       if (in_q) {
+                               break;
+                       }
+                       if (isspace(*(p + 1)) && !allow_tag_spaces) {
+                               goto reg_char_1;
+                       }
+                       depth++;
+                       break;
+               case '>':
+                       if (depth) {
+                               depth--;
+                               break;
+                       }
+                       if (in_q) {
                                break;
+                       }
 
-                       case '>':
-                               if (depth) {
-                                       depth--;
-                                       break;
+                       lc = '>';
+                       if (is_xml && *(p -1) == '-') {
+                               break;
+                       }
+                       in_q = state = is_xml = 0;
+                       if (allow) {
+                               if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
+                                       pos = tp - tbuf;
+                                       tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
+                                       tp = tbuf + pos;
                                }
-
+                               *(tp++) = '>';
+                               *tp='\0';
+                               if (php_tag_find(tbuf, tp-tbuf, allow_actual)) {
+                                       memcpy(rp, tbuf, tp-tbuf);
+                                       rp += tp-tbuf;
+                               }
+                               tp = tbuf;
+                       }
+                       p++;
+                       goto state_0;
+               case '"':
+               case '\'':
+                       if (p != buf && (!in_q || *p == in_q)) {
                                if (in_q) {
-                                       break;
+                                       in_q = 0;
+                               } else {
+                                       in_q = *p;
                                }
-
-                               switch (state) {
-                                       case 1: /* HTML/XML */
-                                               lc = '>';
-                                               if (is_xml && *(p -1) == '-') {
-                                                       break;
-                                               }
-                                               in_q = state = is_xml = 0;
-                                               if (allow) {
-                                                       if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
-                                                               pos = tp - tbuf;
-                                                               tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
-                                                               tp = tbuf + pos;
-                                                       }
-                                                       *(tp++) = '>';
-                                                       *tp='\0';
-                                                       if (php_tag_find(tbuf, tp-tbuf, allow_actual)) {
-                                                               memcpy(rp, tbuf, tp-tbuf);
-                                                               rp += tp-tbuf;
-                                                       }
-                                                       tp = tbuf;
-                                               }
-                                               break;
-
-                                       case 2: /* PHP */
-                                               if (!br && lc != '\"' && *(p-1) == '?') {
-                                                       in_q = state = 0;
-                                                       tp = tbuf;
-                                               }
-                                               break;
-
-                                       case 3:
-                                               in_q = state = 0;
-                                               tp = tbuf;
-                                               break;
-
-                                       case 4: /* JavaScript/CSS/etc... */
-                                               if (p >= buf + 2 && *(p-1) == '-' && *(p-2) == '-') {
-                                                       in_q = state = 0;
-                                                       tp = tbuf;
-                                               }
-                                               break;
-
-                                       default:
-                                               *(rp++) = c;
-                                               break;
+                       }
+                       goto reg_char_1;
+               case '!':
+                       /* JavaScript & Other HTML scripting languages */
+                       if (*(p-1) == '<') {
+                               state = 3;
+                               lc = c;
+                               p++;
+                               goto state_3;
+                       } else {
+                               goto reg_char_1;
+                       }
+                       break;
+               case '?':
+                       if (*(p-1) == '<') {
+                               br=0;
+                               state = 2;
+                               p++;
+                               goto state_2;
+                       } else {
+                               goto reg_char_1;
+                       }
+                       break;
+               default:
+reg_char_1:
+                       if (allow) {
+                               if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
+                                       pos = tp - tbuf;
+                                       tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
+                                       tp = tbuf + pos;
                                }
+                               *(tp++) = c;
+                       }
+                       break;
+       }
+       p++;
+       goto state_1;
+
+state_2:
+       if (p >= end) {
+               goto finish;
+       }
+       c = *p;
+       switch (c) {
+               case '(':
+                       if (lc != '"' && lc != '\'') {
+                               lc = '(';
+                               br++;
+                       }
+                       break;
+               case ')':
+                       if (lc != '"' && lc != '\'') {
+                               lc = ')';
+                               br--;
+                       }
+                       break;
+               case '>':
+                       if (in_q) {
                                break;
+                       }
 
-                       case '"':
-                       case '\'':
-                               if (state == 4) {
-                                       /* Inside <!-- comment --> */
-                                       break;
-                               } else if (state == 2 && *(p-1) != '\\') {
-                                       if (lc == c) {
-                                               lc = '\0';
-                                       } else if (lc != '\\') {
-                                               lc = c;
-                                       }
-                               } else if (state == 0) {
-                                       *(rp++) = c;
-                               } else if (allow && state == 1) {
-                                       if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
-                                               pos = tp - tbuf;
-                                               tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
-                                               tp = tbuf + pos;
-                                       }
-                                       *(tp++) = c;
+                       if (!br && lc != '\"' && *(p-1) == '?') {
+                               in_q = state = 0;
+                               tp = tbuf;
+                               p++;
+                               goto state_0;
+                       }
+                       break;
+               case '"':
+               case '\'':
+                       if (*(p-1) != '\\') {
+                               if (lc == c) {
+                                       lc = '\0';
+                               } else if (lc != '\\') {
+                                       lc = c;
                                }
-                               if (state && p != buf && (state == 1 || *(p-1) != '\\') && (!in_q || *p == in_q)) {
+                       } else {
+                               if (p != buf && *(p-1) != '\\' && (!in_q || *p == in_q)) {
                                        if (in_q) {
                                                in_q = 0;
                                        } else {
                                                in_q = *p;
                                        }
                                }
-                               break;
+                       }
+                       break;
+               case 'l':
+               case 'L':
+                       /* swm: If we encounter '<?xml' then we shouldn't be in
+                        * state == 2 (PHP). Switch back to HTML.
+                        */
+                       if (state == 2 && p > buf+4
+                                    && (*(p-1) == 'm' || *(p-1) == 'M')
+                                    && (*(p-2) == 'x' || *(p-2) == 'X')
+                                    && *(p-3) == '?'
+                                    && *(p-4) == '<') {
+                               state = 1; is_xml=1;
+                               p++;
+                               goto state_1;
+                       }
+                       break;
+               default:
+                       break;
+       }
+       p++;
+       goto state_2;
 
-                       case '!':
-                               /* JavaScript & Other HTML scripting languages */
-                               if (state == 1 && *(p-1) == '<') {
-                                       state = 3;
-                                       lc = c;
-                               } else {
-                                       if (state == 0) {
-                                               *(rp++) = c;
-                                       } else if (allow && state == 1) {
-                                               if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
-                                                       pos = tp - tbuf;
-                                                       tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
-                                                       tp = tbuf + pos;
-                                               }
-                                               *(tp++) = c;
-                                       }
-                               }
+state_3:
+       if (p >= end) {
+               goto finish;
+       }
+       c = *p;
+       switch (c) {
+               case '>':
+                       if (in_q) {
                                break;
-
-                       case '-':
-                               if (state == 3 && p >= buf + 2 && *(p-1) == '-' && *(p-2) == '!') {
-                                       state = 4;
+                       }
+                       in_q = state = 0;
+                       tp = tbuf;
+                       p++;
+                       goto state_0;
+               case '"':
+               case '\'':
+                       if (p != buf && *(p-1) != '\\' && (!in_q || *p == in_q)) {
+                               if (in_q) {
+                                       in_q = 0;
                                } else {
-                                       goto reg_char;
-                               }
-                               break;
-
-                       case '?':
-
-                               if (state == 1 && *(p-1) == '<') {
-                                       br=0;
-                                       state=2;
-                                       break;
-                               }
-
-                       case 'E':
-                       case 'e':
-                               /* !DOCTYPE exception */
-                               if (state==3 && p > buf+6
-                                                    && tolower(*(p-1)) == 'p'
-                                                && tolower(*(p-2)) == 'y'
-                                                    && tolower(*(p-3)) == 't'
-                                                    && tolower(*(p-4)) == 'c'
-                                                    && tolower(*(p-5)) == 'o'
-                                                    && tolower(*(p-6)) == 'd') {
-                                       state = 1;
-                                       break;
-                               }
-                               /* fall-through */
-
-                       case 'l':
-                       case 'L':
-
-                               /* swm: If we encounter '<?xml' then we shouldn't be in
-                                * state == 2 (PHP). Switch back to HTML.
-                                */
-
-                               if (state == 2 && p > buf+4 && strncasecmp(p-4, "<?xm", 4) == 0) {
-                                       state = 1; is_xml=1;
-                                       break;
+                                       in_q = *p;
                                }
+                       }
+                       break;
+               case '-':
+                       if (p >= buf + 2 && *(p-1) == '-' && *(p-2) == '!') {
+                               state = 4;
+                               p++;
+                               goto state_4;
+                       }
+                       break;
+               case 'E':
+               case 'e':
+                       /* !DOCTYPE exception */
+                       if (p > buf+6
+                            && (*(p-1) == 'p' || *(p-1) == 'P')
+                            && (*(p-2) == 'y' || *(p-2) == 'Y')
+                            && (*(p-3) == 't' || *(p-3) == 'T')
+                            && (*(p-4) == 'c' || *(p-4) == 'C')
+                            && (*(p-5) == 'o' || *(p-5) == 'O')
+                            && (*(p-6) == 'd' || *(p-6) == 'D')) {
+                               state = 1;
+                               p++;
+                               goto state_1;
+                       }
+                       break;
+               default:
+                       break;
+       }
+       p++;
+       goto state_3;
 
-                               /* fall-through */
-                       default:
-reg_char:
-                               if (state == 0) {
-                                       *(rp++) = c;
-                               } else if (allow && state == 1) {
-                                       if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
-                                               pos = tp - tbuf;
-                                               tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
-                                               tp = tbuf + pos;
-                                       }
-                                       *(tp++) = c;
-                               }
-                               break;
+state_4:
+       while (p < end) {
+               c = *p;
+               if (c == '>' && !in_q) {
+                       if (p >= buf + 2 && *(p-1) == '-' && *(p-2) == '-') {
+                               in_q = state = 0;
+                               tp = tbuf;
+                               p++;
+                               goto state_0;
+                       }
                }
-               c = *(++p);
-               i++;
+               p++;
        }
+
+finish:
        if (rp < rbuf + len) {
                *rp = '\0';
        }