PHPAPI size_t php_strip_tags_ex(char *rbuf, size_t len, uint8_t *stateptr, const char *allow, size_t allow_len, zend_bool allow_tag_spaces)
{
char *tbuf, *tp, *rp, c, lc;
- const char *buf, *p;
+ const char *buf, *p, *end;
int br, depth=0, in_q = 0;
uint8_t state = 0;
- size_t pos, i = 0;
+ size_t pos;
char *allow_free = NULL;
const char *allow_actual;
char is_xml = 0;
- if (stateptr)
- state = *stateptr;
-
buf = estrndup(rbuf, len);
- c = *buf;
+ end = buf + len;
lc = '\0';
p = buf;
rp = rbuf;
tbuf = tp = NULL;
}
- while (i < len) {
- switch (c) {
- case '\0':
+ if (stateptr) {
+ state = *stateptr;
+ switch (state) {
+ case 1: goto state_1;
+ case 2: goto state_2;
+ case 3: goto state_3;
+ case 4: goto state_4;
+ default:
break;
- case '<':
- if (in_q) {
- break;
- }
- if (isspace(*(p + 1)) && !allow_tag_spaces) {
- goto reg_char;
- }
- if (state == 0) {
- lc = '<';
- state = 1;
- if (allow) {
- if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
- pos = tp - tbuf;
- tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
- tp = tbuf + pos;
- }
- *(tp++) = '<';
- }
- } else if (state == 1) {
- depth++;
+ }
+ }
+
+state_0:
+ if (p >= end) {
+ goto finish;
+ }
+ c = *p;
+ switch (c) {
+ case '\0':
+ break;
+ case '<':
+ if (in_q) {
+ break;
+ }
+ if (isspace(*(p + 1)) && !allow_tag_spaces) {
+ *(rp++) = c;
+ break;
+ }
+ lc = '<';
+ state = 1;
+ if (allow) {
+ if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
+ pos = tp - tbuf;
+ tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
+ tp = tbuf + pos;
}
+ *(tp++) = '<';
+ }
+ p++;
+ goto state_1;
+ case '>':
+ if (depth) {
+ depth--;
break;
+ }
- case '(':
- if (state == 2) {
- if (lc != '"' && lc != '\'') {
- lc = '(';
- br++;
- }
- } else if (allow && state == 1) {
- if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
- pos = tp - tbuf;
- tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
- tp = tbuf + pos;
- }
- *(tp++) = c;
- } else if (state == 0) {
- *(rp++) = c;
- }
+ if (in_q) {
break;
+ }
- case ')':
- if (state == 2) {
- if (lc != '"' && lc != '\'') {
- lc = ')';
- br--;
- }
- } else if (allow && state == 1) {
- if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
- pos = tp - tbuf;
- tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
- tp = tbuf + pos;
- }
- *(tp++) = c;
- } else if (state == 0) {
- *(rp++) = c;
- }
+ *(rp++) = c;
+ break;
+ default:
+ *(rp++) = c;
+ break;
+ }
+ p++;
+ goto state_0;
+
+state_1:
+ if (p >= end) {
+ goto finish;
+ }
+ c = *p;
+ switch (c) {
+ case '\0':
+ break;
+ case '<':
+ if (in_q) {
+ break;
+ }
+ if (isspace(*(p + 1)) && !allow_tag_spaces) {
+ goto reg_char_1;
+ }
+ depth++;
+ break;
+ case '>':
+ if (depth) {
+ depth--;
+ break;
+ }
+ if (in_q) {
break;
+ }
- case '>':
- if (depth) {
- depth--;
- break;
+ lc = '>';
+ if (is_xml && *(p -1) == '-') {
+ break;
+ }
+ in_q = state = is_xml = 0;
+ if (allow) {
+ if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
+ pos = tp - tbuf;
+ tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
+ tp = tbuf + pos;
}
-
+ *(tp++) = '>';
+ *tp='\0';
+ if (php_tag_find(tbuf, tp-tbuf, allow_actual)) {
+ memcpy(rp, tbuf, tp-tbuf);
+ rp += tp-tbuf;
+ }
+ tp = tbuf;
+ }
+ p++;
+ goto state_0;
+ case '"':
+ case '\'':
+ if (p != buf && (!in_q || *p == in_q)) {
if (in_q) {
- break;
+ in_q = 0;
+ } else {
+ in_q = *p;
}
-
- switch (state) {
- case 1: /* HTML/XML */
- lc = '>';
- if (is_xml && *(p -1) == '-') {
- break;
- }
- in_q = state = is_xml = 0;
- if (allow) {
- if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
- pos = tp - tbuf;
- tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
- tp = tbuf + pos;
- }
- *(tp++) = '>';
- *tp='\0';
- if (php_tag_find(tbuf, tp-tbuf, allow_actual)) {
- memcpy(rp, tbuf, tp-tbuf);
- rp += tp-tbuf;
- }
- tp = tbuf;
- }
- break;
-
- case 2: /* PHP */
- if (!br && lc != '\"' && *(p-1) == '?') {
- in_q = state = 0;
- tp = tbuf;
- }
- break;
-
- case 3:
- in_q = state = 0;
- tp = tbuf;
- break;
-
- case 4: /* JavaScript/CSS/etc... */
- if (p >= buf + 2 && *(p-1) == '-' && *(p-2) == '-') {
- in_q = state = 0;
- tp = tbuf;
- }
- break;
-
- default:
- *(rp++) = c;
- break;
+ }
+ goto reg_char_1;
+ case '!':
+ /* JavaScript & Other HTML scripting languages */
+ if (*(p-1) == '<') {
+ state = 3;
+ lc = c;
+ p++;
+ goto state_3;
+ } else {
+ goto reg_char_1;
+ }
+ break;
+ case '?':
+ if (*(p-1) == '<') {
+ br=0;
+ state = 2;
+ p++;
+ goto state_2;
+ } else {
+ goto reg_char_1;
+ }
+ break;
+ default:
+reg_char_1:
+ if (allow) {
+ if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
+ pos = tp - tbuf;
+ tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
+ tp = tbuf + pos;
}
+ *(tp++) = c;
+ }
+ break;
+ }
+ p++;
+ goto state_1;
+
+state_2:
+ if (p >= end) {
+ goto finish;
+ }
+ c = *p;
+ switch (c) {
+ case '(':
+ if (lc != '"' && lc != '\'') {
+ lc = '(';
+ br++;
+ }
+ break;
+ case ')':
+ if (lc != '"' && lc != '\'') {
+ lc = ')';
+ br--;
+ }
+ break;
+ case '>':
+ if (in_q) {
break;
+ }
- case '"':
- case '\'':
- if (state == 4) {
- /* Inside <!-- comment --> */
- break;
- } else if (state == 2 && *(p-1) != '\\') {
- if (lc == c) {
- lc = '\0';
- } else if (lc != '\\') {
- lc = c;
- }
- } else if (state == 0) {
- *(rp++) = c;
- } else if (allow && state == 1) {
- if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
- pos = tp - tbuf;
- tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
- tp = tbuf + pos;
- }
- *(tp++) = c;
+ if (!br && lc != '\"' && *(p-1) == '?') {
+ in_q = state = 0;
+ tp = tbuf;
+ p++;
+ goto state_0;
+ }
+ break;
+ case '"':
+ case '\'':
+ if (*(p-1) != '\\') {
+ if (lc == c) {
+ lc = '\0';
+ } else if (lc != '\\') {
+ lc = c;
}
- if (state && p != buf && (state == 1 || *(p-1) != '\\') && (!in_q || *p == in_q)) {
+ } else {
+ if (p != buf && *(p-1) != '\\' && (!in_q || *p == in_q)) {
if (in_q) {
in_q = 0;
} else {
in_q = *p;
}
}
- break;
+ }
+ break;
+ case 'l':
+ case 'L':
+ /* swm: If we encounter '<?xml' then we shouldn't be in
+ * state == 2 (PHP). Switch back to HTML.
+ */
+ if (state == 2 && p > buf+4
+ && (*(p-1) == 'm' || *(p-1) == 'M')
+ && (*(p-2) == 'x' || *(p-2) == 'X')
+ && *(p-3) == '?'
+ && *(p-4) == '<') {
+ state = 1; is_xml=1;
+ p++;
+ goto state_1;
+ }
+ break;
+ default:
+ break;
+ }
+ p++;
+ goto state_2;
- case '!':
- /* JavaScript & Other HTML scripting languages */
- if (state == 1 && *(p-1) == '<') {
- state = 3;
- lc = c;
- } else {
- if (state == 0) {
- *(rp++) = c;
- } else if (allow && state == 1) {
- if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
- pos = tp - tbuf;
- tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
- tp = tbuf + pos;
- }
- *(tp++) = c;
- }
- }
+state_3:
+ if (p >= end) {
+ goto finish;
+ }
+ c = *p;
+ switch (c) {
+ case '>':
+ if (in_q) {
break;
-
- case '-':
- if (state == 3 && p >= buf + 2 && *(p-1) == '-' && *(p-2) == '!') {
- state = 4;
+ }
+ in_q = state = 0;
+ tp = tbuf;
+ p++;
+ goto state_0;
+ case '"':
+ case '\'':
+ if (p != buf && *(p-1) != '\\' && (!in_q || *p == in_q)) {
+ if (in_q) {
+ in_q = 0;
} else {
- goto reg_char;
- }
- break;
-
- case '?':
-
- if (state == 1 && *(p-1) == '<') {
- br=0;
- state=2;
- break;
- }
-
- case 'E':
- case 'e':
- /* !DOCTYPE exception */
- if (state==3 && p > buf+6
- && tolower(*(p-1)) == 'p'
- && tolower(*(p-2)) == 'y'
- && tolower(*(p-3)) == 't'
- && tolower(*(p-4)) == 'c'
- && tolower(*(p-5)) == 'o'
- && tolower(*(p-6)) == 'd') {
- state = 1;
- break;
- }
- /* fall-through */
-
- case 'l':
- case 'L':
-
- /* swm: If we encounter '<?xml' then we shouldn't be in
- * state == 2 (PHP). Switch back to HTML.
- */
-
- if (state == 2 && p > buf+4 && strncasecmp(p-4, "<?xm", 4) == 0) {
- state = 1; is_xml=1;
- break;
+ in_q = *p;
}
+ }
+ break;
+ case '-':
+ if (p >= buf + 2 && *(p-1) == '-' && *(p-2) == '!') {
+ state = 4;
+ p++;
+ goto state_4;
+ }
+ break;
+ case 'E':
+ case 'e':
+ /* !DOCTYPE exception */
+ if (p > buf+6
+ && (*(p-1) == 'p' || *(p-1) == 'P')
+ && (*(p-2) == 'y' || *(p-2) == 'Y')
+ && (*(p-3) == 't' || *(p-3) == 'T')
+ && (*(p-4) == 'c' || *(p-4) == 'C')
+ && (*(p-5) == 'o' || *(p-5) == 'O')
+ && (*(p-6) == 'd' || *(p-6) == 'D')) {
+ state = 1;
+ p++;
+ goto state_1;
+ }
+ break;
+ default:
+ break;
+ }
+ p++;
+ goto state_3;
- /* fall-through */
- default:
-reg_char:
- if (state == 0) {
- *(rp++) = c;
- } else if (allow && state == 1) {
- if (tp - tbuf >= PHP_TAG_BUF_SIZE) {
- pos = tp - tbuf;
- tbuf = erealloc(tbuf, (tp - tbuf) + PHP_TAG_BUF_SIZE + 1);
- tp = tbuf + pos;
- }
- *(tp++) = c;
- }
- break;
+state_4:
+ while (p < end) {
+ c = *p;
+ if (c == '>' && !in_q) {
+ if (p >= buf + 2 && *(p-1) == '-' && *(p-2) == '-') {
+ in_q = state = 0;
+ tp = tbuf;
+ p++;
+ goto state_0;
+ }
}
- c = *(++p);
- i++;
+ p++;
}
+
+finish:
if (rp < rbuf + len) {
*rp = '\0';
}