From eaa5f1d9edd8ae2a5f637a3a07928dfee113accc Mon Sep 17 00:00:00 2001 From: Nick Mathewson <nickm@torproject.org> Date: Tue, 19 Oct 2010 11:26:59 -0400 Subject: [PATCH] Revise evhttp_uri_parse implementation to handle more of RFC3986 --- http.c | 354 ++++++++++++++++++++++++++++------- include/event2/http_struct.h | 8 +- test/regress_http.c | 143 ++++++++++++-- 3 files changed, 414 insertions(+), 91 deletions(-) diff --git a/http.c b/http.c index 87dc1f51..883c675f 100644 --- a/http.c +++ b/http.c @@ -2317,6 +2317,9 @@ static const char uri_chars[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }; +#define CHAR_IS_UNRESERVED(c) \ + (uri_chars[(unsigned char)(c)]) + /* * Helper functions to encode/decode a string for inclusion in a URI. * The returned string must be freed by the caller. @@ -2337,7 +2340,7 @@ evhttp_uriencode(const char *uri, ev_ssize_t len, int space_as_plus) end = uri+strlen(uri); for (p = uri; p < end; p++) { - if (uri_chars[(unsigned char)(*p)]) { + if (CHAR_IS_UNRESERVED(*p)) { evbuffer_add(buf, p, 1); } else if (*p == ' ' && space_as_plus) { evbuffer_add(buf, "+", 1); @@ -3334,16 +3337,208 @@ bind_socket(const char *address, ev_uint16_t port, int reuse) return (fd); } +/* Return true of the string starting at s and ending immediately before eos + * is a valid URI scheme according to RFC3986 + */ +static int +scheme_ok(const char *s, const char *eos) +{ + /* scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) */ + EVUTIL_ASSERT(eos >= s); + if (s == eos) + return 0; + if (!EVUTIL_ISALPHA(*s)) + return 0; + while (++s < eos) { + if (! EVUTIL_ISALNUM(*s) && + *s != '+' && *s != '-' && *s != '.') + return 0; + } + return 1; +} + +#define SUBDELIMS "!$&'()*+,;=" + +/* Return true iff [s..eos) is a valid userinfo */ +static int +userinfo_ok(const char *s, const char *eos) +{ + while (s < eos) { + if (CHAR_IS_UNRESERVED(*s) || + strchr(SUBDELIMS, *s) || + *s == ':') + ++s; + else if (*s == '%' && s+2 < eos && + EVUTIL_ISXDIGIT(s[1]) && + EVUTIL_ISXDIGIT(s[2])) + s += 3; + else + return 0; + } + return 1; +} + +static int +regname_ok(const char *s, const char *eos) +{ + while (s && s<eos) { + if (CHAR_IS_UNRESERVED(*s) || + strchr(SUBDELIMS, *s)) + ++s; + else if (*s == '%' && + EVUTIL_ISXDIGIT(s[1]) && + EVUTIL_ISXDIGIT(s[2])) + s += 3; + else + return 0; + } + return 1; +} + +static int +parse_port(const char *s, const char *eos) +{ + int portnum = 0; + if (s == eos) + return 0; /* The RFC allows an empty port. */ + while (s < eos) { + if (! EVUTIL_ISDIGIT(*s)) + return -1; + portnum = (portnum * 10) + (*s - '0'); + ++s; + } + return portnum; +} + +/* returns 0 for bad, 1 for ipv6, 2 for IPvFuture */ +static int +bracket_addr_ok(const char *s, const char *eos) +{ + if (s + 3 > eos || *s != '[' || *(eos-1) != ']') + return 0; + if (s[1] == 'v') { + /* IPvFuture, or junk. + "v" 1*HEXDIG "." 1*( unreserved / sub-delims / ":" ) + */ + s += 2; /* skip [v */ + --eos; + if (!EVUTIL_ISXDIGIT(*s)) /*require at least one*/ + return 0; + while (s < eos && *s != '.') { + if (EVUTIL_ISXDIGIT(*s)) + ++s; + else + return 0; + } + if (*s != '.') + return 0; + ++s; + while (s < eos) { + if (CHAR_IS_UNRESERVED(*s) || + strchr(SUBDELIMS, *s) || + *s == ':') + ++s; + else + return 0; + } + return 2; + } else { + /* IPv6, or junk */ + char buf[64]; + int n_chars = eos-s-2; + struct in6_addr in6; + if (n_chars >= 64) /* way too long */ + return 0; + memcpy(buf, s+1, n_chars); + buf[n_chars]='\0'; + return (evutil_inet_pton(AF_INET6,buf,&in6)==1) ? 1 : 0; + } +} + +static int +parse_authority(struct evhttp_uri *uri, char *s, char *eos) +{ + char *cp, *port; + EVUTIL_ASSERT(eos); + if (eos == s) { + uri->host = mm_strdup(""); + return 0; + } + + /* Optionally, we start with "userinfo@" */ + + cp = strchr(s, '@'); + if (cp && cp < eos) { + if (! userinfo_ok(s,cp)) + return -1; + *cp++ = '\0'; + uri->userinfo = mm_strdup(s); + } else { + cp = s; + } + /* Optionally, we end with ":port" */ + for (port=eos-1; port >= cp && EVUTIL_ISDIGIT(*port); --port) + ; + if (port >= cp && *port == ':') { + if ((uri->port = parse_port(port+1, eos))<0) + return -1; + eos = port; + } + /* Now, cp..eos holds the "host" port, which can be an IPv4Address, + * an IP-Literal, or a reg-name */ + EVUTIL_ASSERT(eos >= cp); + if (*cp == '[' && eos >= cp+2 && *(eos-1) == ']') { + /* IPv6address, IP-Literal, or junk. */ + if (! bracket_addr_ok(cp, eos)) + return -1; + } else { + /* Make sure the host part is ok. */ + if (! regname_ok(cp,eos)) /* Match IPv4Address or reg-name */ + return -1; + } + uri->host = mm_malloc(eos-cp+1); + memcpy(uri->host, cp, eos-cp); + uri->host[eos-cp] = '\0'; + return 0; + +} + +/* Return the character after the longest prefix of 'cp' that matches... + * *pchar / "/" if allow_qchars is false, or + * *(pchar / "/" / "?") if allow_chars is true. + */ +static char * +end_of_path(char *cp, int allow_qchars) +{ + while (*cp) { + if (CHAR_IS_UNRESERVED(*cp) || + strchr(SUBDELIMS, *cp) || + *cp == ':' || *cp == '@' || *cp == '/') + ++cp; + else if (*cp == '%' && EVUTIL_ISXDIGIT(cp[1]) && + EVUTIL_ISXDIGIT(cp[2])) + cp += 3; + else if (*cp == '?' && allow_qchars) + ++cp; + else + return cp; + } + return cp; +} + struct evhttp_uri * evhttp_uri_parse(const char *source_uri) { - char *readbuf = NULL, *readp = NULL, *token = NULL, *query = NULL, *host = NULL, *port = NULL; + char *readbuf = NULL, *readp = NULL, *token = NULL, *query = NULL; + char *path = NULL, *fragment = NULL; + int got_authority = 0; struct evhttp_uri *uri = mm_calloc(1, sizeof(struct evhttp_uri)); if (uri == NULL) { event_err(1, "%s: calloc", __func__); goto err; } + uri->port = -1; readbuf = mm_strdup(source_uri); if (readbuf == NULL) { @@ -3354,57 +3549,79 @@ evhttp_uri_parse(const char *source_uri) readp = readbuf; token = NULL; - /* 1. scheme:// */ - token = strstr(readp, "://"); - if (!token) { - /* unsupported uri */ - goto err; - } + /* We try to follow RFC3986 here as much as we can, and match + the productions - *token = '\0'; - uri->scheme = mm_strdup(readp); + URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] - readp = token; - readp += 3; /* eat :// */ + relative-ref = relative-part [ "?" query ] [ "#" fragment ] - /* 2. query */ - query = strchr(readp, '/'); - if (query) { - char *fragment = strchr(query, '#'); - if (fragment) { - *fragment++ = '\0'; /* eat '#' */ - uri->fragment = mm_strdup(fragment); - } + */ - uri->query = mm_strdup(query); - *query = '\0'; /* eat '/' */ - } - - /* 3. user:pass@host:port */ - host = strchr(readp, '@'); - if (host) { - char *pass = 0; - /* got user:pass@host:port */ - *host++ = '\0'; /* eat @ */; - pass = strchr(readp, ':'); - if (pass) { - *pass++ = '\0'; /* eat ':' */ - uri->pass = mm_strdup(pass); - } + /* 1. scheme: */ + token = strchr(readp, ':'); + if (token && scheme_ok(readp,token)) { + *token = '\0'; + uri->scheme = mm_strdup(readp); + + readp = token+1; /* eat : */ + } - uri->user = mm_strdup(readp); - readp = host; + /* 2. Optionally, "//" then an 'authority' part. */ + if (readp[0]=='/' && readp[1] == '/') { + char *authority; + readp += 2; + authority = readp; + path = strchr(readp, '/'); /*XXXX path can be empty; we can + * have a query though */ + if (!path) + path = strchr(readp, '\0'); + if (parse_authority(uri, authority, path) < 0) + goto err; + readp = path; + got_authority = 1; } - /* 4. host:port */ - port = strchr(readp, ':'); - if (port) { - *port++ = '\0'; /* eat ':' */ - uri->port = atoi(port); + /* 3. Query: path-abempty, path-absolute, path-rootless, or path-empty + */ + path = readp; + readp = end_of_path(path, 0); + + /* Query */ + if (*readp == '?') { + *readp = '\0'; + ++readp; + query = readp; + readp = end_of_path(readp, 1); + } + /* fragment */ + if (*readp == '#') { + *readp = '\0'; + ++readp; + fragment = readp; + readp = end_of_path(readp, 1); + } + if (*readp != '\0') { + goto err; } - /* 5. host */ - uri->host = mm_strdup(readp); + /* If you didn't get an authority, the path can't begin with "//" */ + if (!got_authority && path[0]=='/' && path[1]=='/') + goto err; + /* If you did get an authority, the path must begin with "/" or be + * empty. */ + if (got_authority && path[0] != '/' && path[0] != '\0') + goto err; + + if (path) + uri->path = mm_strdup(path); + else + uri->path = mm_strdup(""); + + if (query) + uri->query = mm_strdup(query); + if (fragment) + uri->fragment = mm_strdup(fragment); mm_free(readbuf); @@ -3426,8 +3643,7 @@ evhttp_uri_free(struct evhttp_uri *uri) } _URI_FREE_STR(scheme); - _URI_FREE_STR(user); - _URI_FREE_STR(pass); + _URI_FREE_STR(userinfo); _URI_FREE_STR(host); _URI_FREE_STR(query); _URI_FREE_STR(fragment); @@ -3442,40 +3658,42 @@ evhttp_uri_join(struct evhttp_uri *uri, char *buf, size_t limit) { struct evbuffer *tmp = 0; size_t joined_size = 0; + char *output = NULL; #define _URI_ADD(f) evbuffer_add(tmp, uri->f, strlen(uri->f)) - if (!uri || !uri->scheme || !buf || !limit) + + if (!uri || !buf || !limit) return NULL; tmp = evbuffer_new(); if (!tmp) return NULL; - _URI_ADD(scheme); - evbuffer_add(tmp, "://", 3); - if (uri->host && *uri->host) { - if (uri->user && *uri->user) { - _URI_ADD(user); - if (uri->pass && *uri->pass) { - evbuffer_add(tmp, ":", 1); - _URI_ADD(pass); - } - evbuffer_add(tmp, "@", 1); - } - + if (uri->scheme) { + _URI_ADD(scheme); + evbuffer_add(tmp, ":", 1); + } + if (uri->host) { + evbuffer_add(tmp, "//", 2); + if (uri->userinfo) + evbuffer_add_printf(tmp,"%s@", uri->userinfo); _URI_ADD(host); + if (uri->port >= 0) + evbuffer_add_printf(tmp,":%d", uri->port); - if (uri->port > 0) - evbuffer_add_printf(tmp,":%u", uri->port); + if (uri->path && uri->path[0] != '/' && uri->path[0] != '\0') + goto err; } - if (uri->query && *uri->query) - _URI_ADD(query); + if (uri->path) + _URI_ADD(path); - if (uri->fragment && *uri->fragment) { - if (!uri->query || !*uri->query) - evbuffer_add(tmp, "/", 1); + if (uri->query) { + evbuffer_add(tmp, "?", 1); + _URI_ADD(query); + } + if (uri->fragment) { evbuffer_add(tmp, "#", 1); _URI_ADD(fragment); } @@ -3491,8 +3709,10 @@ evhttp_uri_join(struct evhttp_uri *uri, char *buf, size_t limit) } evbuffer_remove(tmp, buf, joined_size); + output = buf; +err: evbuffer_free(tmp); - return (char *)buf; + return output; #undef _URI_ADD } diff --git a/include/event2/http_struct.h b/include/event2/http_struct.h index 168f5aca..26487799 100644 --- a/include/event2/http_struct.h +++ b/include/event2/http_struct.h @@ -123,11 +123,11 @@ struct { */ struct evhttp_uri { char *scheme; /* scheme; e.g http, ftp etc */ - char *host; /* hostname, or NULL */ - char *user; /* usename, or NULL */ - char *pass; /* password, or NULL */ + char *host; /* hostname, IP address, or NULL */ + char *userinfo; /* userinfo (typically username:pass), or NULL */ int port; /* port, or zero */ - char *query; /* path + query: e.g. /path/to?param=foo, or NULL */ + char *path; /* path, or NULL */ + char *query; /* query, or NULL */ char *fragment; /* fragment or NULL */ }; diff --git a/test/regress_http.c b/test/regress_http.c index 17043219..fee9a263 100644 --- a/test/regress_http.c +++ b/test/regress_http.c @@ -1725,15 +1725,23 @@ http_parse_uri_test(void *ptr) tt_want(evhttp_uri_join(0, url_tmp, sizeof(url_tmp)) == NULL); tt_want(evhttp_uri_join(uri, url_tmp, sizeof(url_tmp)) == NULL); - tt_want(evhttp_uri_parse("mailto:foo@bar") == NULL); + uri = evhttp_uri_parse("mailto:foo@bar"); + tt_want(uri != NULL); + tt_want(uri->host == NULL); + tt_want(uri->userinfo == NULL); + tt_want(uri->port == -1); + tt_want(!strcmp(uri->scheme, "mailto")); + tt_want(!strcmp(uri->path, "foo@bar")); + tt_want(uri->query == NULL); + tt_want(uri->fragment == NULL); uri = evhttp_uri_parse("http://www.test.com/?q=test"); tt_want(strcmp(uri->scheme, "http") == 0); tt_want(strcmp(uri->host, "www.test.com") == 0); - tt_want(strcmp(uri->query, "/?q=test") == 0); - tt_want(uri->user == NULL); - tt_want(uri->pass == NULL); - tt_want(uri->port == 0); + tt_want(strcmp(uri->path, "/") == 0); + tt_want(strcmp(uri->query, "q=test") == 0); + tt_want(uri->userinfo == NULL); + tt_want(uri->port == -1); tt_want(uri->fragment == NULL); TT_URI("http://www.test.com/?q=test"); evhttp_uri_free(uri); @@ -1741,45 +1749,140 @@ http_parse_uri_test(void *ptr) uri = evhttp_uri_parse("ftp://www.test.com/?q=test"); tt_want(strcmp(uri->scheme, "ftp") == 0); tt_want(strcmp(uri->host, "www.test.com") == 0); - tt_want(strcmp(uri->query, "/?q=test") == 0); - tt_want(uri->user == NULL); - tt_want(uri->pass == NULL); - tt_want(uri->port == 0); + tt_want(strcmp(uri->path, "/") == 0); + tt_want(strcmp(uri->query, "q=test") == 0); + tt_want(uri->userinfo == NULL); + tt_want(uri->port == -1); tt_want(uri->fragment == NULL); TT_URI("ftp://www.test.com/?q=test"); evhttp_uri_free(uri); + uri = evhttp_uri_parse("ftp://[::1]:999/?q=test"); + tt_want(strcmp(uri->scheme, "ftp") == 0); + tt_want(strcmp(uri->host, "[::1]") == 0); + tt_want(strcmp(uri->path, "/") == 0); + tt_want(strcmp(uri->query, "q=test") == 0); + tt_want(uri->userinfo == NULL); + tt_want(uri->port == 999); + tt_want(uri->fragment == NULL); + TT_URI("ftp://[::1]:999/?q=test"); + evhttp_uri_free(uri); + + uri = evhttp_uri_parse("ftp://[ff00::127.0.0.1]/?q=test"); + tt_want(strcmp(uri->scheme, "ftp") == 0); + tt_want(strcmp(uri->host, "[ff00::127.0.0.1]") == 0); + tt_want(strcmp(uri->path, "/") == 0); + tt_want(strcmp(uri->query, "q=test") == 0); + tt_want(uri->userinfo == NULL); + tt_want(uri->port == -1); + tt_want(uri->fragment == NULL); + TT_URI("ftp://[ff00::127.0.0.1]/?q=test"); + evhttp_uri_free(uri); + + uri = evhttp_uri_parse("ftp://[v99.not_anytime_soon]/?q=test"); + tt_want(strcmp(uri->scheme, "ftp") == 0); + tt_want(strcmp(uri->host, "[v99.not_anytime_soon]") == 0); + tt_want(strcmp(uri->path, "/") == 0); + tt_want(strcmp(uri->query, "q=test") == 0); + tt_want(uri->userinfo == NULL); + tt_want(uri->port == -1); + tt_want(uri->fragment == NULL); + TT_URI("ftp://[v99.not_anytime_soon]/?q=test"); + evhttp_uri_free(uri); + uri = evhttp_uri_parse("scheme://user:pass@foo.com:42/?q=test&s=some+thing#fragment"); tt_want(strcmp(uri->scheme, "scheme") == 0); - tt_want(strcmp(uri->user, "user") == 0); - tt_want(strcmp(uri->pass, "pass") == 0); + tt_want(strcmp(uri->userinfo, "user:pass") == 0); tt_want(strcmp(uri->host, "foo.com") == 0); tt_want(uri->port == 42); - tt_want(strcmp(uri->query, "/?q=test&s=some+thing") == 0); + tt_want(strcmp(uri->path, "/") == 0); + tt_want(strcmp(uri->query, "q=test&s=some+thing") == 0); tt_want(strcmp(uri->fragment, "fragment") == 0); TT_URI("scheme://user:pass@foo.com:42/?q=test&s=some+thing#fragment"); evhttp_uri_free(uri); uri = evhttp_uri_parse("scheme://user@foo.com/#fragment"); tt_want(strcmp(uri->scheme, "scheme") == 0); - tt_want(strcmp(uri->user, "user") == 0); - tt_want(uri->pass == NULL); + tt_want(strcmp(uri->userinfo, "user") == 0); tt_want(strcmp(uri->host, "foo.com") == 0); - tt_want(uri->port == 0); - tt_want(strcmp(uri->query, "/") == 0); + tt_want(uri->port == -1); + tt_want(strcmp(uri->path, "/") == 0); + tt_want(uri->query == NULL); tt_want(strcmp(uri->fragment, "fragment") == 0); TT_URI("scheme://user@foo.com/#fragment"); evhttp_uri_free(uri); + uri = evhttp_uri_parse("file:///some/path/to/the/file"); tt_want(strcmp(uri->scheme, "file") == 0); - tt_want(uri->user == NULL); - tt_want(uri->pass == NULL); + tt_want(uri->userinfo == NULL); tt_want(strcmp(uri->host, "") == 0); - tt_want(uri->port == 0); - tt_want(strcmp(uri->query, "/some/path/to/the/file") == 0); + tt_want(uri->port == -1); + tt_want(strcmp(uri->path, "/some/path/to/the/file") == 0); + tt_want(uri->query == NULL); tt_want(uri->fragment == NULL); TT_URI("file:///some/path/to/the/file"); evhttp_uri_free(uri); + + uri = evhttp_uri_parse("///some/path/to/the-file"); + tt_want(uri != NULL); + tt_want(uri->scheme == NULL); + tt_want(uri->userinfo == NULL); + tt_want(strcmp(uri->host, "") == 0); + tt_want(uri->port == -1); + tt_want(strcmp(uri->path, "/some/path/to/the-file") == 0); + tt_want(uri->query == NULL); + tt_want(uri->fragment == NULL); + TT_URI("///some/path/to/the-file"); + evhttp_uri_free(uri); + + uri = evhttp_uri_parse("/s:ome/path/to/the-file?q=99#fred"); + tt_want(uri != NULL); + tt_want(uri->scheme == NULL); + tt_want(uri->userinfo == NULL); + tt_want(uri->host == NULL); + tt_want(uri->port == -1); + tt_want(strcmp(uri->path, "/s:ome/path/to/the-file") == 0); + tt_want(strcmp(uri->query, "q=99") == 0); + tt_want(strcmp(uri->fragment, "fred") == 0); + TT_URI("/s:ome/path/to/the-file?q=99#fred"); + evhttp_uri_free(uri); + + uri = evhttp_uri_parse("relative/path/with/co:lon"); + tt_want(uri != NULL); + tt_want(uri->scheme == NULL); + tt_want(uri->userinfo == NULL); + tt_want(uri->host == NULL); + tt_want(uri->port == -1); + tt_want(strcmp(uri->path, "relative/path/with/co:lon") == 0); + tt_want(uri->query == NULL); + tt_want(uri->fragment == NULL); + TT_URI("relative/path/with/co:lon"); + evhttp_uri_free(uri); + + uri = evhttp_uri_parse("bob?q=99&q2=q?33#fr?ed"); + tt_want(uri != NULL); + tt_want(uri->scheme == NULL); + tt_want(uri->userinfo == NULL); + tt_want(uri->host == NULL); + tt_want(uri->port == -1); + tt_want(strcmp(uri->path, "bob") == 0); + tt_want(strcmp(uri->query, "q=99&q2=q?33") == 0); + tt_want(strcmp(uri->fragment, "fr?ed") == 0); + TT_URI("bob?q=99&q2=q?33#fr?ed"); + evhttp_uri_free(uri); + + uri = evhttp_uri_parse("#fr?ed"); + tt_want(uri != NULL); + tt_want(uri->scheme == NULL); + tt_want(uri->userinfo == NULL); + tt_want(uri->host == NULL); + tt_want(uri->port == -1); + tt_want(strcmp(uri->path, "") == 0); + tt_want(uri->query == NULL); + tt_want(strcmp(uri->fragment, "fr?ed") == 0); + TT_URI("#fr?ed"); + evhttp_uri_free(uri); + } static void -- 2.40.0