From 11d157421184db8de73e60e936014fcd49dd598d Mon Sep 17 00:00:00 2001 From: Rolland Santimano Date: Mon, 3 Oct 2005 17:00:00 +0000 Subject: [PATCH] - Unicode impl of strip_tags() --- ext/standard/string.c | 336 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 307 insertions(+), 29 deletions(-) diff --git a/ext/standard/string.c b/ext/standard/string.c index fc48769dc2..114ec7f866 100644 --- a/ext/standard/string.c +++ b/ext/standard/string.c @@ -4803,34 +4803,30 @@ PHP_FUNCTION(nl2br) Strips HTML and PHP tags from a string */ PHP_FUNCTION(strip_tags) { - char *buf; - zval **str, **allow=NULL; - char *allowed_tags=NULL; - int allowed_tags_len=0; - size_t retval_len; + void *str, *allow=NULL; + int32_t str_len, allow_len; + zend_uchar str_type, allow_type; + void *buf; + int32_t retval_len; - switch (ZEND_NUM_ARGS()) { - case 1: - if (zend_get_parameters_ex(1, &str) == FAILURE) { - RETURN_FALSE; - } - break; - case 2: - if (zend_get_parameters_ex(2, &str, &allow) == FAILURE) { - RETURN_FALSE; - } - convert_to_string_ex(allow); - allowed_tags = Z_STRVAL_PP(allow); - allowed_tags_len = Z_STRLEN_PP(allow); - break; - default: - WRONG_PARAM_COUNT; - break; + if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "T|T", &str, &str_len, &str_type, + &allow, &allow_len, &allow_type) == FAILURE) { + return; + } + + if (str_type == IS_UNICODE) { + buf = eustrndup(str, str_len); + retval_len = php_u_strip_tags((UChar *)buf, str_len, NULL, (UChar *)allow, allow_len TSRMLS_CC); + RETURN_UNICODEL((UChar *)buf, retval_len, 0); + } else { + buf = estrndup(str, str_len); + retval_len = php_strip_tags((char *)buf, str_len, NULL, (char *)allow, allow_len); + if (str_type == IS_BINARY) { + RETURN_BINARYL((char *)buf, retval_len, 0); + } else { + RETURN_STRINGL((char *)buf, retval_len, 0); + } } - convert_to_string_ex(str); - buf = estrndup(Z_STRVAL_PP(str), Z_STRLEN_PP(str)); - retval_len = php_strip_tags(buf, Z_STRLEN_PP(str), NULL, allowed_tags, allowed_tags_len); - RETURN_STRINGL(buf, retval_len, 0); } /* }}} */ @@ -4971,15 +4967,72 @@ PHP_FUNCTION(parse_str) #define PHP_TAG_BUF_SIZE 1023 -/* {{{ php_tag_find +/* php_u_tag_find / php_tag_find * * Check if tag is in a set of tags * * states: - * + * * 0 start tag * 1 first non-whitespace char seen */ + +/* {{{ php_u_tag_find + */ +int php_u_tag_find(UChar *tag, int32_t len, UChar *set, int32_t set_len) +{ + int32_t idx = 0; + UChar32 ch, *norm, *n; + int state = 0, done = 0; + + if (!len) { + return 0; + } + + norm = eumalloc(len+1); + n = norm; + + while (!done) { + U16_NEXT(tag, idx, len, ch); + switch (u_tolower(ch)) { + case '<': + *(n++) = ch; + break; + case '>': + done = 1; + break; + default: + if (u_isWhitespace(ch) == FALSE) { + if (state == 0) { + state = 1; + if (ch != '/') + *(n++) = ch; + } else { + *(n++) = ch; + } + } else { + if (state == 1) + done = 1; + } + break; + } + } + *(n++) = '>'; + *n = '\0'; + + if (u_strFindFirst(tag, len, set, set_len) != NULL) { + done = 1; + } else { + done = 0; + } + + efree(norm); + return done; +} +/* }}} */ + +/* {{{ php_tag_find + */ int php_tag_find(char *tag, int len, char *set) { char c, *n, *t; int state=0, done=0; @@ -5033,7 +5086,7 @@ int php_tag_find(char *tag, int len, char *set) { } /* }}} */ -/* {{{ php_strip_tags +/* php_u_strip_tags / php_strip_tags A simple little state-machine to strip out html and php tags @@ -5053,6 +5106,231 @@ int php_tag_find(char *tag, int len, char *set) { swm: Added ability to strip = UBYTES(PHP_TAG_BUF_SIZE) ? tbuf: tp); + *(tp++) = ch; + } + } else if (state == 1) { + depth++; + } + break; + + case 0x28: /* '(' */ + if (state == 2) { + if (last != 0x22 && last != 0x27) { /* '"' & '\'' */ + last = 0x28; + br++; + } + } else if (allow && state == 1) { + tp = ((tp-tbuf) >= UBYTES(PHP_TAG_BUF_SIZE) ? tbuf: tp); + *(tp++) = ch; + } else if (state == 0) { + *(rp++) = ch; + } + break; + + case 0x29: /* ')' */ + if (state == 2) { + if (last != 0x22 && last != 0x27) { /* '"' & '\'' */ + last = ch; + br--; + } + } else if (allow && state == 1) { + tp = ((tp-tbuf) >= UBYTES(PHP_TAG_BUF_SIZE) ? tbuf: tp); + *(tp++) = ch; + } else if (state == 0) { + *(rp++) = ch; + } + break; + + case 0x3E: /* '>' */ + if (depth) { + depth--; + break; + } + + switch (state) { + case 1: /* HTML/XML */ + last = ch; + state = 0; + if (allow) { + tp = ((tp-tbuf) >= UBYTES(PHP_TAG_BUF_SIZE) ? tbuf: tp); + *(tp++) = ch; + *(tp) = 0; + if (php_u_tag_find(tbuf, tp-tbuf, allow, allow_len)) { + u_memcpy(rp, tbuf, tp-tbuf); + rp += tp-tbuf; + } + tp = tbuf; + } + break; + + case 2: /* PHP */ + if (!br && last != 0x22 && prev1 == 0x3F) { /* '"' & '?' */ + state = 0; + tp = tbuf; + } + break; + + case 3: + state = 0; + tp = tbuf; + break; + + case 4: /* JavaScript/CSS/etc... */ + if (codepts >= 2 && prev1 == 0x2D && prev2 == 0x2D) { /* '-' */ + state = 0; + tp = tbuf; + } + break; + + default: + *(rp++) = ch; + break; + } + break; + + case 0x22: /* '"' */ + case 0x27: /* '\'' */ + if (state == 2 && prev1 != 0x5C) { /* '\\' */ + if (last == ch) { + last = 0x00; + } else if (last != 0x5C) { + last = ch; + } + } else if (state == 0) { + *(rp++) = ch; + } else if (allow && state == 1) { + tp = ((tp-tbuf) >= UBYTES(PHP_TAG_BUF_SIZE) ? tbuf: tp); + *(tp++) = ch; + } + break; + + case 0x21: /* '!' */ + /* JavaScript & Other HTML scripting languages */ + if (state == 1 && prev1 == 0x3C) { /* '<' */ + state = 3; + last = ch; + } else { + if (state == 0) { + (*rp++) = 0x21; + } else if (allow && state == 1) { + tp = ((tp-tbuf) >= PHP_TAG_BUF_SIZE ? tbuf: tp); + *(tp++) = 0x21; + } + } + break; + + case 0x2D: /* '-' */ + if (state == 3 && codepts >= 2 && prev1 == 0x2D && prev2 == 0x21) { /* '-' & '!' */ + state = 4; + } else { + goto reg_u_char; + } + break; + + case 0x3F: /* '?' */ + if (state == 1 && prev1 == 0x3C) { /* '<' */ + br = 0; + state = 2; + break; + } + + case 'E': + case 'e': + /* !DOCTYPE exception */ + if (state==3 && codepts > 6) { + tmp = idx; + for (i = 0 ; i < 6 ; i++) { + U16_PREV(buf, 0, tmp, doctype[i]); + } + if (u_tolower(doctype[0])=='p' && u_tolower(doctype[1])=='y' && + u_tolower(doctype[2])=='t' && u_tolower(doctype[3])=='c' && + u_tolower(doctype[4])=='o' && u_tolower(doctype[5])=='d') { + state = 1; + break; + } + } + /* fall-through */ + + case 'l': + + /* swm: If we encounter ' 2 && prev1 == 'm' && prev2 == 'x') { + state = 1; + break; + } + /* fall-through */ + + default: +reg_u_char: + if (state == 0) { + rp += zend_codepoint_to_uchar(ch, rp); + } else if (allow && state == 1) { + tp = ((tp-tbuf) >= UBYTES(PHP_TAG_BUF_SIZE) ? tbuf: tp); + tp += zend_codepoint_to_uchar(ch, tp); + } + break; + } + } + + *rp = 0; + efree(buf); + if (allow) + efree(tbuf); + if (stateptr) + *stateptr = state; + + return (size_t)(rp-rbuf); +} +/* }}} */ + +/* {{{ php_strip_tags + */ PHPAPI size_t php_strip_tags(char *rbuf, int len, int *stateptr, char *allow, int allow_len) { char *tbuf, *buf, *p, *tp, *rp, c, lc; -- 2.40.0