From 1bbb55ec116308f893b0dc327916da492f886e77 Mon Sep 17 00:00:00 2001 From: Ilia Alshanetsky Date: Sun, 6 Oct 2002 16:14:42 +0000 Subject: [PATCH] Added a much faster parse_url() implementation. This also fixes a number of bugs in the old parse_url() implementation. --- ext/standard/url.c | 202 ++++++++++++++++++++++++++------------------- 1 file changed, 119 insertions(+), 83 deletions(-) diff --git a/ext/standard/url.c b/ext/standard/url.c index 6312d338c6..6a7153ce3c 100644 --- a/ext/standard/url.c +++ b/ext/standard/url.c @@ -85,107 +85,143 @@ PHPAPI char *php_replace_controlchars(char *str) */ PHPAPI php_url *php_url_parse(char *str) { - regex_t re; - regmatch_t subs[11]; - int err; int length = strlen(str); - char *result; + char port_buf[5]; php_url *ret = ecalloc(1, sizeof(php_url)); + char *s, *e, *p, *pp, *ue; + + s = str; + ue = s + length; - /* from Appendix B of draft-fielding-url-syntax-09, - http://www.ics.uci.edu/~fielding/url/url.txt */ - err = regcomp(&re, "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\\?([^#]*))?(#(.*))?", REG_EXTENDED); - if (err) { - /*php_error(E_WARNING, "Unable to compile regex: %d\n", err);*/ - efree(ret); - return NULL; - } - err = regexec(&re, str, 10, subs, 0); - if (err) { - /*php_error(E_WARNING, "Error with regex\n");*/ - efree(ret); - regfree(&re); - return NULL; - } - /* no processing necessary on the scheme */ - if (subs[2].rm_so != -1 && subs[2].rm_so <= length) { - ret->scheme = estrndup(str + subs[2].rm_so, subs[2].rm_eo - subs[2].rm_so); + /* parse scheme */ + if ((e = strchr(s, ':')) && *(e+1) == '/' && *(e+2) == '/' && (e-s)) { + ret->scheme = estrndup(s, (e-s)); php_replace_controlchars(ret->scheme); - } - - /* the path to the resource */ - if (subs[5].rm_so != -1 && subs[5].rm_so <= length) { - ret->path = estrndup(str + subs[5].rm_so, subs[5].rm_eo - subs[5].rm_so); + s = e + 3; + } else if (e) { /* no scheme, look for port */ + p = e + 1; + pp = p; + + while (pp-p < 6 && isdigit(*pp)) { + pp++; + } + + if (pp-p < 6 && (*pp == '/' || *pp == '\0')) { + memcpy(port_buf, p, (pp-p)); + port_buf[pp-p] = '\0'; + ret->port = atoi(port_buf); + } else { + goto just_path; + } + } else { + just_path: + ret->path = estrndup(str, length); php_replace_controlchars(ret->path); + return ret; } - - /* the query part */ - if (subs[7].rm_so != -1 && subs[7].rm_so <= length) { - ret->query = estrndup(str + subs[7].rm_so, subs[7].rm_eo - subs[7].rm_so); - php_replace_controlchars(ret->query); - } - - /* the fragment */ - if (subs[9].rm_so != -1 && subs[9].rm_so <= length) { - ret->fragment = estrndup(str + subs[9].rm_so, subs[9].rm_eo - subs[9].rm_so); - php_replace_controlchars(ret->fragment); + + if (!(e = strchr(s, '/'))) { + e = ue; } - /* extract the username, pass, and port from the hostname */ - if (subs[4].rm_so != -1 && subs[4].rm_so <= length) { - - int cerr; - /* extract username:pass@host:port from regex results */ - result = estrndup(str + subs[4].rm_so, subs[4].rm_eo - subs[4].rm_so); - length = strlen(result); - - regfree(&re); /* free the old regex */ + /* check for login and password */ + if ((p = memchr(s, '@', (e-s)))) { + if ((pp = memchr(s, ':', (p-s)))) { + if ((pp-s) > 0) { + ret->user = estrndup(s, (pp-s)); + php_replace_controlchars(ret->user); + } - if (length) { - if ((cerr=regcomp(&re, "^(([^@:]+)(:([^@:]+))?@)?((\\[([^]]+)\\])|([^:@]+))(:([^:@]+))?", REG_EXTENDED)) - || (err=regexec(&re, result, 11, subs, 0))) { + if (p-pp > 1) { + ret->pass = estrndup(++pp, (p-pp-1)); + php_replace_controlchars(ret->pass); + } + } + + s = p + 1; + } + + /* check for port */ + if ((p = memchr(s, ':', (e-s)))) { + if (!ret->port) { + p++; + if ( e-p > 5 || e-p < 1 ) { /* port cannot be longer then 5 characters */ STR_FREE(ret->scheme); - STR_FREE(ret->path); - STR_FREE(ret->query); - STR_FREE(ret->fragment); + STR_FREE(ret->user); + STR_FREE(ret->pass); efree(ret); - efree(result); - /*php_error(E_WARNING, "Unable to compile regex: %d\n", err);*/ - if (!cerr) regfree(&re); return NULL; } - /* now deal with all of the results */ - if (subs[2].rm_so != -1 && subs[2].rm_so < length) { - ret->user = estrndup(result + subs[2].rm_so, subs[2].rm_eo - subs[2].rm_so); - php_replace_controlchars(ret->user); - } - if (subs[4].rm_so != -1 && subs[4].rm_so < length) { - ret->pass = estrndup(result + subs[4].rm_so, subs[4].rm_eo - subs[4].rm_so); - php_replace_controlchars(ret->pass); - } - if (subs[7].rm_so != -1 && subs[7].rm_so < length) { - ret->host = estrndup(result + subs[7].rm_so, subs[7].rm_eo - subs[7].rm_so); - php_replace_controlchars(ret->host); - } else if (subs[8].rm_so != -1 && subs[8].rm_so < length) { - ret->host = estrndup(result + subs[8].rm_so, subs[8].rm_eo - subs[8].rm_so); - php_replace_controlchars(ret->host); - } - if (subs[10].rm_so != -1 && subs[10].rm_so < length) { - ret->port = (unsigned short) strtol(result + subs[10].rm_so, NULL, 10); - } - } - efree(result); + + memcpy(port_buf, p, (e-p)); + port_buf[e-p] = '\0'; + ret->port = atoi(port_buf); + p--; + } + } else { + p = e; } - else if (ret->scheme && !strcmp(ret->scheme, "http")) { + + /* check if we have a valid host, if we don't reject the string as url */ + if ((p-s) < 1) { STR_FREE(ret->scheme); - STR_FREE(ret->path); - STR_FREE(ret->query); - STR_FREE(ret->fragment); + STR_FREE(ret->user); + STR_FREE(ret->pass); efree(ret); - regfree(&re); return NULL; } - regfree(&re); + + ret->host = estrndup(s, (p-s)); + php_replace_controlchars(ret->host); + + if (e == ue) { + return ret; + } + + s = e; + + if ((p = strchr(s, '?'))) { + pp = strchr(s, '#'); + + if (pp && pp < p) { + p = pp; + pp = strchr(pp+2, '#'); + } + + if (p - s) { + ret->path = estrndup(s, (p-s)); + php_replace_controlchars(ret->path); + } + + if (pp) { + if (pp - ++p) { + ret->query = estrndup(p, (pp-p)); + php_replace_controlchars(ret->query); + } + p = pp; + goto label_parse; + } else if (++p - ue) { + ret->query = estrndup(p, (ue-p)); + php_replace_controlchars(ret->query); + } + } else if ((p = strchr(s, '#'))) { + if (p - s) { + ret->path = estrndup(s, (p-s)); + php_replace_controlchars(ret->path); + } + + label_parse: + p++; + + if (ue - p) { + ret->fragment = estrndup(p, (ue-p)); + php_replace_controlchars(ret->fragment); + } + } else { + ret->path = estrndup(s, (ue-s)); + php_replace_controlchars(ret->path); + } + return ret; } /* }}} */ -- 2.40.0