1 /* Copyright (c) 2003-11, WebThing Ltd
2 * Copyright (c) 2011-, The Apache Software Foundation
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 You can #define GO_FASTER to disable trace logging.
/* Trace-logging guards. Both expand to a test of a variable named
 * "verbose", so they can only be used where such a variable is in scope
 * (the rewriting functions below declare one from APLOGrtrace1).
 * VERBOSE(x) guards x directly; VERBOSEB(x) wraps x in braces.
 */
28 #define VERBOSE(x) if (verbose) x
29 #define VERBOSEB(x) if (verbose) {x}
32 /* libxml2 includes unicode/*.h files which use C++ comments */
34 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
35 #pragma GCC diagnostic push
37 #pragma GCC diagnostic warning "-Wcomment"
38 #elif defined(__clang__)
39 #pragma clang diagnostic push
40 #pragma clang diagnostic warning "-Wcomment"
44 #include <libxml/HTMLparser.h>
47 #if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
48 #pragma GCC diagnostic pop
50 #elif defined(__clang__)
51 #pragma clang diagnostic pop
54 #include "http_protocol.h"
55 #include "http_config.h"
57 #include "apr_strings.h"
59 #include "apr_strmatch.h"
62 #include "apr_optional.h"
63 #include "mod_xml2enc.h"
64 #include "http_request.h"
67 /* globals set once at startup */
/* old_expr: rxplus substitution compiled in mod_proxy_html() and applied by
 * comp_urlmap() to old-style conditions.
 * seek_meta / seek_content: precompiled patterns used by metafix().
 * xml2enc_charset / xml2enc_filter: optional functions looked up with
 * APR_RETRIEVE_OPTIONAL_FN in mod_proxy_html(); they may be NULL, and the
 * filter code tests them before calling.
 */
68 static ap_rxplus_t *old_expr;
69 static ap_regex_t *seek_meta;
70 static const apr_strmatch_pattern* seek_content;
71 static apr_status_t (*xml2enc_charset)(request_rec*, xmlCharEncoding*, const char**) = NULL;
72 static apr_status_t (*xml2enc_filter)(request_rec*, const char*, unsigned int) = NULL;
74 module AP_MODULE_DECLARE_DATA proxy_html_module;
/* Rule flag bits kept in a urlmap's flags field (only part of the set is
 * visible in this excerpt). comp_urlmap() derives them from the flag
 * letters '^' (M_ATSTART), 'l' (M_NOTLAST), 'V' (M_INTERPOLATE_TO) and
 * 'v' (M_INTERPOLATE_FROM).
 */
80 #define M_ATSTART 0x10
83 #define M_NOTLAST 0x80
84 #define M_INTERPOLATE_TO 0x100
85 #define M_INTERPOLATE_FROM 0x200
94 typedef struct urlmap {
97 unsigned int regflags;
103 ap_expr_info_t *cond;
112 apr_array_header_t *events;
113 const char *charset_out;
122 proxy_html_conf *cfg;
123 htmlParserCtxtPtr parser;
124 apr_bucket_brigade *bb;
128 const char *encoding;
/* Fixup flag bits kept in proxy_html_conf flags. set_flags() sets
 * NORM_MSSLASH for "dospath" and NORM_RESET for "reset"; normalise() tests
 * NORM_MSSLASH, and proxy_html_merge() tests NORM_RESET to decide whether
 * the more specific configuration replaces the inherited settings.
 */
138 #define NORM_MSSLASH 0x2
139 #define NORM_RESET 0x4
/* The single SAX handler table; it is zeroed and populated with the
 * p* callbacks in mod_proxy_html().
 */
140 static htmlSAXHandler sax;
/* How an attribute value is treated while rewriting a start tag. */
142 typedef enum { ATTR_IGNORE, ATTR_URI, ATTR_EVENT } rewrite_t;
/* Doctype declarations selectable through set_doctype(). The rest of the
 * file compares cfg->doctype against these objects by pointer, so the
 * identity of each constant matters, not just its text.
 */
144 static const char *const fpi_html =
145 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n";
146 static const char *const fpi_html_legacy =
147 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n";
148 static const char *const fpi_xhtml =
149 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n";
150 static const char *const fpi_xhtml_legacy =
151 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n";
152 static const char *const fpi_html5 = "<!DOCTYPE html>\n";
/* Text used to close an empty element: HTML style or XHTML style. */
153 static const char *const html_etag = ">";
154 static const char *const xhtml_etag = " />";
/* Defaults installed by proxy_html_config(). The default doctype is an
 * empty string; pinternalSubset() only echoes the document's own doctype
 * while cfg->doctype still equals this default.
 */
155 /*#define DEFAULT_DOCTYPE fpi_html */
156 static const char *const DEFAULT_DOCTYPE = "";
157 #define DEFAULT_ETAG html_etag
/* Normalise a string in place according to the NORM_* bits in flags.
 * Visible here: one pass over every character of str and, when
 * NORM_MSSLASH is set, a walk over each backslash located with ap_strchr.
 * NOTE(review): the action taken per character and per backslash is not
 * visible in this excerpt - confirm against the full source.
 */
159 static void normalise(unsigned int flags, char *str)
163 for (p = str; *p; ++p)
167 if (flags & NORM_MSSLASH)
168 for (p = ap_strchr(str, '\\'); p; p = ap_strchr(p+1, '\\'))
/* Helper macros for the SAX callbacks.
 * consume_buffer hands bytes to the libxml2 push parser in ctx->parser.
 * AP_fwrite writes bytes towards the next filter through ctx->bb; its
 * "flush" argument is accepted but does not appear in the expansion, and
 * the expansion already ends in a semicolon.
 */
172 #define consume_buffer(ctx,inbuf,bytes,flag) \
173 htmlParseChunk(ctx->parser, inbuf, bytes, flag)
175 #define AP_fwrite(ctx,inbuf,bytes,flush) \
176 ap_fwrite(ctx->f->next, ctx->bb, inbuf, bytes);
178 /* This is always utf-8 on entry. We can convert charset within FLUSH */
/* FLUSH relies on locals named ctx, chars, begin and i in the caller: it
 * writes chars[begin..i) and then moves begin past position i.
 */
179 #define FLUSH AP_fwrite(ctx, (chars+begin), (i-begin), 0); begin = i+1
/* SAX characters callback (installed as sax.characters); also called
 * directly by pstartElement() to write attribute values.
 * Walks the length bytes of uchars and, for each ampersand, angle bracket
 * or double quote, FLUSHes the pending run of ordinary characters and then
 * emits a replacement string with ap_fputs.
 * NOTE(review): the replacement literals below appear entity-decoded in
 * this excerpt (the escaped forms would be expected) - confirm against the
 * full source before relying on them.
 */
180 static void pcharacters(void *ctxt, const xmlChar *uchars, int length)
182 const char *chars = (const char*) uchars;
183 saxctxt *ctx = (saxctxt*) ctxt;
186 for (begin=i=0; i<length; i++) {
188 case '&' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, "&"); break;
189 case '<' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, "<"); break;
190 case '>' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, ">"); break;
191 case '"' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, """); break;
/* Make sure ctx->buf can take len more bytes beyond ctx->offset.
 * When the free space (ctx->avail - ctx->offset) is too small, ctx->avail
 * is raised in steps of ctx->cfg->bufsz and the buffer is realloc()ed.
 * If realloc hands back a different pointer, the request-pool cleanup
 * that frees the old buffer is killed and one for the new buffer is
 * registered.
 * NOTE(review): no check of the realloc result is visible in this excerpt,
 * and the loop assumes cfg->bufsz is non-zero - confirm both.
 */
198 static void preserve(saxctxt *ctx, const size_t len)
201 if (len <= (ctx->avail - ctx->offset))
203 else while (len > (ctx->avail - ctx->offset))
204 ctx->avail += ctx->cfg->bufsz;
206 newbuf = realloc(ctx->buf, ctx->avail);
207 if (newbuf != ctx->buf) {
209 apr_pool_cleanup_kill(ctx->f->r->pool, ctx->buf,
210 (int(*)(void*))free);
211 apr_pool_cleanup_register(ctx->f->r->pool, newbuf,
212 (int(*)(void*))free, apr_pool_cleanup_null);
/* Copy len bytes from buf into ctx->buf at position ctx->offset.
 * NOTE(review): the space reservation and the update of ctx->offset are
 * not visible in this excerpt - presumably done around this memcpy via
 * preserve(); confirm against the full source.
 */
217 static void pappend(saxctxt *ctx, const char *buf, const size_t len)
220 memcpy(ctx->buf+ctx->offset, buf, len);
/* Rewrite the text accumulated in ctx->buf with the M_CDATA rules and
 * write the result downstream.
 * A NUL byte is appended first so the buffer can be handled as a C
 * string. Each rule in ctx->map is tested for M_CDATA; regex rules
 * (M_REGEX) are matched repeatedly with ap_regexec and expanded with
 * ap_pregsub, other rules are located with strstr and checked against
 * M_ATSTART and M_ATEND.
 * Two replacement sequences are visible for each kind of rule: one that
 * calls preserve() and shifts the tail with memmove before copying the
 * replacement in, and one that copies first and then moves the tail.
 * NOTE(review): the condition choosing between the two sequences, and the
 * statements taken when a flag test fails, are not visible in this
 * excerpt.
 */
224 static void dump_content(saxctxt *ctx)
232 ap_regmatch_t pmatch[10];
235 urlmap *themap = ctx->map;
237 int verbose = APLOGrtrace1(ctx->f->r);
240 pappend(ctx, &c, 1); /* append null byte */
241 /* parse the text for URLs */
242 for (m = themap; m; m = m->next) {
243 if (!(m->flags & M_CDATA))
245 if (m->flags & M_REGEX) {
248 while (!ap_regexec(m->from.r, ctx->buf+offs, nmatch, pmatch, 0)) {
249 match = pmatch[0].rm_so;
250 s_from = pmatch[0].rm_eo - match;
251 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
254 len = strlen(ctx->buf);
257 const char *f = apr_pstrndup(ctx->f->r->pool,
258 ctx->buf + offs, s_from);
259 ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, ctx->f->r,
260 "C/RX: match at %s, substituting %s", f, subs);
263 preserve(ctx, s_to - s_from);
264 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
265 len + 1 - s_from - offs);
266 memcpy(ctx->buf+offs, subs, s_to);
269 memcpy(ctx->buf + offs, subs, s_to);
270 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
271 len + 1 - s_from - offs);
/* Plain-string rule: search for the literal from-pattern. */
277 s_from = strlen(m->from.c);
278 s_to = strlen(m->to);
279 for (found = strstr(ctx->buf, m->from.c); found;
280 found = strstr(ctx->buf+match+s_to, m->from.c)) {
281 match = found - ctx->buf;
282 if ((m->flags & M_ATSTART) && (match != 0))
284 len = strlen(ctx->buf);
285 if ((m->flags & M_ATEND) && (match < (len - s_from)))
287 VERBOSE(ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, ctx->f->r,
288 "C: matched %s, substituting %s",
291 preserve(ctx, s_to - s_from);
292 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
293 len + 1 - s_from - match);
294 memcpy(ctx->buf+match, m->to, s_to);
297 memcpy(ctx->buf+match, m->to, s_to);
298 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
299 len + 1 - s_from - match);
/* Emit the (possibly rewritten) buffer as one write. */
304 AP_fwrite(ctx, ctx->buf, strlen(ctx->buf), 1);
/* SAX internalSubset callback (installed as sax.internalSubset).
 * Acts only while cfg->doctype is still DEFAULT_DOCTYPE, i.e. no doctype
 * was chosen in the configuration. It then re-emits the document's own
 * doctype declaration: the name, the public identifier and the system
 * identifier. The end-tag style is set to xhtml_etag when the name is
 * "html" and the public identifier starts with "-//W3C//DTD XHTML "
 * (case-insensitive), and to html_etag otherwise.
 * NOTE(review): the guards around the PUBLIC and system-id output are not
 * visible in this excerpt; externalID is passed to ap_cstr_casecmpn, so
 * confirm it is checked for NULL first.
 */
306 static void pinternalSubset(void* ctxt, const xmlChar *name,
307 const xmlChar *externalID, const xmlChar *sysID)
309 saxctxt* ctx = (saxctxt*) ctxt;
310 if (!ctxt || !name) {
314 if (ctx->cfg->doctype != DEFAULT_DOCTYPE) {
315 /* do nothing if overridden in config */
318 ap_fputstrs(ctx->f->next, ctx->bb, "<!DOCTYPE ", (const char *)name, NULL);
320 if (!ap_cstr_casecmp((const char*)name, "html") &&
321 !ap_cstr_casecmpn((const char *)externalID, "-//W3C//DTD XHTML ", 18)) {
322 ctx->etag = xhtml_etag;
325 ctx->etag = html_etag;
327 ap_fputstrs(ctx->f->next, ctx->bb, " PUBLIC \"", (const char *)externalID, "\"", NULL);
329 ap_fputstrs(ctx->f->next, ctx->bb, " \"", (const char *)sysID, "\"", NULL);
331 ap_fputs(ctx->f->next, ctx->bb, ">\n");
/* SAX cdataBlock callback (installed as sax.cdataBlock).
 * With cfg->extfix enabled the bytes are appended to ctx->buf through
 * pappend(); the other visible path writes them straight downstream with
 * AP_fwrite.
 */
333 static void pcdata(void *ctxt, const xmlChar *uchars, int length)
335 const char *chars = (const char*) uchars;
336 saxctxt *ctx = (saxctxt*) ctxt;
337 if (ctx->cfg->extfix) {
338 pappend(ctx, chars, length);
341 /* not sure if this should force-flush
342 * (i.e. can one cdata section come in multiple calls?)
344 AP_fwrite(ctx, chars, length, 0);
/* SAX comment callback (installed as sax.comment).
 * Tests cfg->strip_comments first. With cfg->extfix enabled the comment
 * is appended to ctx->buf wrapped in its opening and closing markers; the
 * other visible path writes the same three pieces directly downstream.
 * NOTE(review): the statement executed when strip_comments is set is not
 * visible in this excerpt - presumably an early return.
 */
347 static void pcomment(void *ctxt, const xmlChar *uchars)
349 const char *chars = (const char*) uchars;
350 saxctxt *ctx = (saxctxt*) ctxt;
351 if (ctx->cfg->strip_comments)
354 if (ctx->cfg->extfix) {
355 pappend(ctx, "<!--", 4);
356 pappend(ctx, chars, strlen(chars));
357 pappend(ctx, "-->", 3);
360 ap_fputs(ctx->f->next, ctx->bb, "<!--");
361 AP_fwrite(ctx, chars, strlen(chars), 1);
362 ap_fputs(ctx->f->next, ctx->bb, "-->");
/* SAX endElement callback (installed as sax.endElement).
 * Looks the element up with htmlTagLookup and applies the doctype
 * enforcement tests: for the strict doctypes unknown or deprecated
 * elements are singled out, and the legacy doctypes have their own branch.
 * If content has been buffered (ctx->offset > 0) the offset is reset so
 * the memory can be re-used once the content has been dumped. A closing
 * tag is then written unless the element is described as empty.
 * NOTE(review): the statements inside the enforcement branches and the
 * call that dumps the buffered content are not visible in this excerpt.
 */
366 static void pendElement(void *ctxt, const xmlChar *uname)
368 saxctxt *ctx = (saxctxt*) ctxt;
369 const char *name = (const char*) uname;
370 const htmlElemDesc* desc = htmlTagLookup(uname);
372 if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) {
374 if (!desc || desc->depr)
378 else if ((ctx->cfg->doctype == fpi_html_legacy)
379 || (ctx->cfg->doctype == fpi_xhtml_legacy)) {
380 /* enforce html legacy */
384 /* TODO - implement HTML "allowed here" using the stack */
385 /* nah. Keeping the stack is too much overhead */
387 if (ctx->offset > 0) {
389 ctx->offset = 0; /* having dumped it, we can re-use the memory */
391 if (!desc || !desc->empty) {
392 ap_fprintf(ctx->f->next, ctx->bb, "</%s>", name);
/* SAX startElement callback (installed as sax.startElement).
 * Looks the element up with htmlTagLookup and, for the strict and legacy
 * doctypes, logs elements as dropped (bogus or deprecated). It then
 * writes the opening angle bracket and name, and walks the attribute
 * array in name/value pairs. For each attribute the visible code:
 *  - when enforcing, classifies it with htmlAttrAllowed, logging bogus and
 *    deprecated attributes and counting down required_attrs;
 *  - copies the value, including its terminator, into ctx->buf;
 *  - compares the attribute name with the element's entry from cfg->links
 *    and, when extfix is on and events are configured, with cfg->events;
 *  - runs the M_HTML rules against the start of the value, or the M_EVENTS
 *    rules anywhere in the value (presumably selected by is_uri);
 *  - optionally normalises the value and writes it, escaped through
 *    pcharacters().
 * Finally the tag is closed and, when enforcing, a message is logged if
 * required attributes are still missing.
 * NOTE(review): many branch conditions and control statements are not
 * visible in this excerpt; the summary follows the visible statements
 * only. attrs is dereferenced in the attribute loop without a visible
 * NULL check - confirm one exists.
 */
396 static void pstartElement(void *ctxt, const xmlChar *uname,
397 const xmlChar** uattrs)
406 size_t s_to, s_from, match;
408 saxctxt *ctx = (saxctxt*) ctxt;
410 ap_regmatch_t pmatch[10];
412 int verbose = APLOGrtrace1(ctx->f->r);
414 apr_array_header_t *linkattrs;
416 const char *name = (const char*) uname;
417 const char** attrs = (const char**) uattrs;
418 const htmlElemDesc* desc = htmlTagLookup(uname);
419 urlmap *themap = ctx->map;
424 if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) {
426 if (!desc || desc->depr) {
427 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01416)
428 "Bogus HTML element %s dropped", name);
433 else if ((ctx->cfg->doctype == fpi_html_legacy)
434 || (ctx->cfg->doctype == fpi_xhtml_legacy)) {
435 /* enforce html legacy */
437 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01417)
438 "Deprecated HTML element %s dropped", name);
444 descp = apr_array_push(ctx->stack);
446 /* TODO - implement HTML "allowed here" */
449 ap_fputc(ctx->f->next, ctx->bb, '<');
450 ap_fputs(ctx->f->next, ctx->bb, name);
453 if ((enforce > 0) && (desc != NULL) && (desc->attrs_req != NULL))
454 for (a = desc->attrs_req; *a; a++)
458 linkattrs = apr_hash_get(ctx->cfg->links, name, APR_HASH_KEY_STRING);
459 for (a = attrs; *a; a += 2) {
460 if (desc && enforce > 0) {
461 switch (htmlAttrAllowed(desc, (xmlChar*)*a, 2-enforce)) {
463 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01418)
464 "Bogus HTML attribute %s of %s dropped",
467 case HTML_DEPRECATED:
468 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01419)
469 "Deprecated HTML attribute %s of %s dropped",
473 required_attrs--; /* cross off the number still needed */
474 /* fallthrough - required implies valid */
481 pappend(ctx, a[1], strlen(a[1])+1);
482 is_uri = ATTR_IGNORE;
484 tattr *attrs = (tattr*) linkattrs->elts;
485 for (i=0; i < linkattrs->nelts; ++i) {
486 if (!strcmp(*a, attrs[i].val)) {
492 if ((is_uri == ATTR_IGNORE) && ctx->cfg->extfix
493 && (ctx->cfg->events != NULL)) {
494 for (i=0; i < ctx->cfg->events->nelts; ++i) {
495 tattr *attrs = (tattr*) ctx->cfg->events->elts;
496 if (!strcmp(*a, attrs[i].val)) {
/* Rules tested for M_HTML: matched against the start of the value. */
505 for (m = themap; m; m = m->next) {
506 if (!(m->flags & M_HTML))
508 if (m->flags & M_REGEX) {
510 if (!ap_regexec(m->from.r, ctx->buf, nmatch,
513 offs = match = pmatch[0].rm_so;
514 s_from = pmatch[0].rm_eo - match;
515 subs = ap_pregsub(ctx->f->r->pool, m->to,
516 ctx->buf, nmatch, pmatch);
519 f = apr_pstrndup(ctx->f->r->pool,
520 ctx->buf + offs, s_from);
521 ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0,
523 "H/RX: match at %s, substituting %s",
527 len = strlen(ctx->buf);
529 preserve(ctx, s_to - s_from);
530 memmove(ctx->buf+offs+s_to,
531 ctx->buf+offs+s_from,
532 len + 1 - s_from - offs);
533 memcpy(ctx->buf+offs, subs, s_to);
536 memcpy(ctx->buf + offs, subs, s_to);
537 memmove(ctx->buf+offs+s_to,
538 ctx->buf+offs+s_from,
539 len + 1 - s_from - offs);
543 s_from = strlen(m->from.c);
544 if (!strncasecmp(ctx->buf, m->from.c, s_from)) {
546 s_to = strlen(m->to);
547 len = strlen(ctx->buf);
548 VERBOSE(ap_log_rerror(APLOG_MARK, APLOG_TRACE3,
550 "H: matched %s, substituting %s",
553 preserve(ctx, s_to - s_from);
554 memmove(ctx->buf+s_to, ctx->buf+s_from,
556 memcpy(ctx->buf, m->to, s_to);
558 else { /* it fits in the existing space */
559 memcpy(ctx->buf, m->to, s_to);
560 memmove(ctx->buf+s_to, ctx->buf+s_from,
566 /* URIs only want one match unless overridden in the config */
567 if ((num_match > 0) && !(m->flags & M_NOTLAST))
/* Rules tested for M_EVENTS: matched anywhere in the value. */
572 for (m = themap; m; m = m->next) {
573 num_match = 0; /* reset here since we're working per-rule */
574 if (!(m->flags & M_EVENTS))
576 if (m->flags & M_REGEX) {
579 while (!ap_regexec(m->from.r, ctx->buf+offs,
580 nmatch, pmatch, 0)) {
581 match = pmatch[0].rm_so;
582 s_from = pmatch[0].rm_eo - match;
583 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
587 f = apr_pstrndup(ctx->f->r->pool,
588 ctx->buf + offs, s_from);
589 ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0,
591 "E/RX: match at %s, substituting %s",
596 len = strlen(ctx->buf);
598 preserve(ctx, s_to - s_from);
599 memmove(ctx->buf+offs+s_to,
600 ctx->buf+offs+s_from,
601 len + 1 - s_from - offs);
602 memcpy(ctx->buf+offs, subs, s_to);
605 memcpy(ctx->buf + offs, subs, s_to);
606 memmove(ctx->buf+offs+s_to,
607 ctx->buf+offs+s_from,
608 len + 1 - s_from - offs);
615 found = strstr(ctx->buf, m->from.c);
616 if ((m->flags & M_ATSTART) && (found != ctx->buf))
619 s_from = strlen(m->from.c);
620 s_to = strlen(m->to);
621 match = found - ctx->buf;
622 if ((s_from < strlen(found))
623 && (m->flags & M_ATEND)) {
624 found = strstr(ctx->buf+match+s_from,
629 found = strstr(ctx->buf+match+s_to,
632 VERBOSE(ap_log_rerror(APLOG_MARK, APLOG_TRACE3,
634 "E: matched %s, substituting %s",
636 len = strlen(ctx->buf);
638 preserve(ctx, s_to - s_from);
639 memmove(ctx->buf+match+s_to,
640 ctx->buf+match+s_from,
641 len + 1 - s_from - match);
642 memcpy(ctx->buf+match, m->to, s_to);
645 memcpy(ctx->buf+match, m->to, s_to);
646 memmove(ctx->buf+match+s_to,
647 ctx->buf+match+s_from,
648 len + 1 - s_from - match);
653 if (num_match && (m->flags & M_LAST))
662 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], NULL);
665 if (ctx->cfg->flags != 0)
666 normalise(ctx->cfg->flags, ctx->buf);
668 /* write the attribute, using pcharacters to html-escape
669 anything that needs it in the value.
671 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], "=\"", NULL);
672 pcharacters(ctx, (const xmlChar*)ctx->buf, strlen(ctx->buf));
673 ap_fputc(ctx->f->next, ctx->bb, '"');
678 if (desc && desc->empty)
679 ap_fputs(ctx->f->next, ctx->bb, ctx->etag);
681 ap_fputc(ctx->f->next, ctx->bb, '>');
683 if ((enforce > 0) && (required_attrs > 0)) {
684 /* if there are more required attributes than we found then complain */
685 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01420)
686 "HTML element %s is missing %d required attributes",
687 name, required_attrs);
/* Scan the first len bytes of buf for META http-equiv elements and turn
 * them into response headers.
 * Each match of seek_meta is examined: the header name is taken from the
 * text after the "http-equiv" capture, and the value is taken from after
 * the "content" keyword located with seek_content, either between
 * matching quotes or up to whitespace or a closing angle bracket. When
 * both are found the pair is stored in r->headers_out. For a Content-Type
 * header a meta record with the start and end offsets of the element is
 * allocated from the request pool.
 * NOTE(review): the return statement is not visible in this excerpt;
 * presumably the Content-Type record (or NULL) is returned. The scan
 * loops below advance p without a visible bound against len - confirm
 * they cannot run past the buffer.
 */
691 static meta *metafix(request_rec *r, const char *buf, apr_size_t len)
699 ap_regmatch_t pmatch[2];
703 !ap_regexec_len(seek_meta, buf + offs, len - offs, 2, pmatch, 0)) {
706 p = buf+offs+pmatch[1].rm_eo;
707 while (!apr_isalpha(*++p));
708 for (q = p; apr_isalnum(*q) || (*q == '-'); ++q);
709 header = apr_pstrmemdup(r->pool, p, q-p);
710 if (!ap_cstr_casecmpn(header, "Content-Type", 12)) {
711 ret = apr_palloc(r->pool, sizeof(meta));
712 ret->start = offs+pmatch[0].rm_so;
713 ret->end = offs+pmatch[0].rm_eo;
716 /* find content=... string */
717 p = apr_strmatch(seek_content, buf+offs+pmatch[0].rm_so,
718 pmatch[0].rm_eo - pmatch[0].rm_so);
719 /* if it doesn't contain "content", ignore, don't crash! */
723 while (apr_isspace(*p))
725 /* XXX Should we search for another content= pattern? */
728 while (*p && apr_isspace(*++p));
729 if ((*p == '\'') || (*p == '"')) {
731 for (q = p; *q && *q != delim; ++q);
732 /* No terminating delimiter found? Skip the bogus directive */
736 for (q = p; *q && !apr_isspace(*q) && (*q != '>'); ++q);
738 content = apr_pstrmemdup(r->pool, p, q-p);
743 if (header && content) {
745 ap_log_rerror(APLOG_MARK, APLOG_TRACE2, 0, r,
746 "Adding header [%s: %s] from HTML META",
749 apr_table_setn(r->headers_out, header, content);
/* Continue the search after the element just handled. */
751 offs += pmatch[0].rm_eo;
/* Expand ${var} and ${var|default} references in str.
 * The text before the reference is copied, the variable name is cut out
 * (stopping at '|' when one lies inside the braces), and its value is
 * looked up in r->subprocess_env; the text after '|' is available as a
 * replacement. The pieces are joined with apr_pstrcat from the request
 * pool and the substitution is logged at trace level.
 * NOTE(review): the loop structure, the handling of a missing closing
 * brace, and the final return are not visible in this excerpt.
 */
756 static const char *interpolate_vars(request_rec *r, const char *str)
763 const char *replacement;
766 if ((start = ap_strstr_c(str, "${")) == NULL)
769 if ((end = ap_strchr_c(start+2, '}')) == NULL)
772 delim = ap_strchr_c(start+2, '|');
774 /* Restrict delim to ${...} */
775 if (delim && delim >= end) {
779 before = apr_pstrmemdup(r->pool, str, start-str);
782 var = apr_pstrmemdup(r->pool, start+2, delim-start-2);
785 var = apr_pstrmemdup(r->pool, start+2, end-start-2);
787 replacement = apr_table_get(r->subprocess_env, var);
790 replacement = apr_pstrmemdup(r->pool, delim+1, end-delim-1);
794 str = apr_pstrcat(r->pool, before, replacement, after, NULL);
795 ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, r,
796 "Interpolating %s => %s", var, replacement);
/* Build the per-request rule list from the configured one.
 * For each rule in ctx->cfg->map: a rule with a condition is evaluated
 * with ap_expr_exec (errors are logged, unsatisfied rules are skipped);
 * the rule is then copied into the request pool. Copies flagged
 * M_INTERPOLATE_FROM get their from-pattern interpolated (rules whose
 * pattern ends up empty are skipped, regex rules are recompiled), and
 * copies flagged M_INTERPOLATE_TO get their to-string interpolated.
 * NOTE(review): the code that links each copy into ctx->map is not
 * visible in this excerpt.
 */
800 static void fixup_rules(saxctxt *ctx)
805 request_rec *r = ctx->f->r;
807 for (p = ctx->cfg->map; p; p = p->next) {
808 if (p->cond != NULL) {
810 int ok = ap_expr_exec(r, p->cond, &err);
812 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(01421)
813 "Error evaluating expr: %s", err);
816 continue; /* condition is unsatisfied */
820 newp = apr_pmemdup(r->pool, p, sizeof(urlmap));
822 if (newp->flags & M_INTERPOLATE_FROM) {
823 newp->from.c = interpolate_vars(r, newp->from.c);
824 if (!newp->from.c || !*newp->from.c)
825 continue; /* don't use empty from-pattern */
826 if (newp->flags & M_REGEX) {
827 newp->from.r = ap_pregcomp(r->pool, newp->from.c,
831 if (newp->flags & M_INTERPOLATE_TO) {
832 newp->to = interpolate_vars(r, newp->to);
834 /* evaluate p->cond; continue if unsatisfied */
835 /* create new urlmap with memcpy and append to map */
836 /* interpolate from if flagged to do so */
837 /* interpolate to if flagged to do so */
/* Decide whether the filter should run for this request and set up its
 * context.
 * The visible checks produce a reason for not running: the request is
 * not a proxy request, it has no content type, the content type is
 * neither text/html nor application/xhtml+xml, or no links are
 * configured. The reason is logged at trace level. Otherwise a zeroed
 * saxctxt is allocated from the request pool and stored in f->ctx, an
 * output brigade is created, the end-tag style is taken from the
 * configuration, Content-Length is removed from the response headers and
 * the configured rule map is attached.
 * NOTE(review): how the PROXY_HTML_FORCE environment value influences the
 * checks, and the return statements, are not visible in this excerpt.
 */
850 static saxctxt *check_filter_init (ap_filter_t *f)
854 proxy_html_conf *cfg;
856 const char *errmsg = NULL;
857 cfg = ap_get_module_config(f->r->per_dir_config, &proxy_html_module);
858 force = apr_table_get(f->r->subprocess_env, "PROXY_HTML_FORCE");
861 if (!f->r->proxyreq) {
862 errmsg = "Non-proxy request; not inserting proxy-html filter";
864 else if (!f->r->content_type) {
865 errmsg = "No content-type; bailing out of proxy-html filter";
867 else if (ap_cstr_casecmpn(f->r->content_type, "text/html", 9) &&
868 ap_cstr_casecmpn(f->r->content_type,
869 "application/xhtml+xml", 21)) {
870 errmsg = "Non-HTML content; not inserting proxy-html filter";
874 errmsg = "No links configured: nothing for proxy-html filter to do";
879 ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, f->r, "%s", errmsg);
884 fctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(saxctxt));
886 fctx->bb = apr_brigade_create(f->r->pool,
887 f->r->connection->bucket_alloc);
889 fctx->etag = cfg->etag;
890 apr_table_unset(f->r->headers_out, "Content-Length");
895 fctx->map = cfg->map;
896 /* defer dealing with charset_out until after sniffing charset_in
897 * so we can support setting one to t'other.
/* Put the bytes held back in ctxt->rbuf at the front of bb.
 * A transient bucket referring to rbuf is created and inserted at the
 * head of the brigade.
 * NOTE(review): the remaining arguments of the bucket constructor and any
 * guard on the buffered length are not visible in this excerpt.
 */
903 static void prepend_rbuf(saxctxt *ctxt, apr_bucket_brigade *bb)
906 apr_bucket *b = apr_bucket_transient_create(ctxt->rbuf,
909 APR_BRIGADE_INSERT_HEAD(bb, b);
/* The proxy-html output filter (registered in proxy_html_hooks()).
 * The context comes from check_filter_init(); on the visible bail-out
 * path the filter removes itself and passes the brigade on unchanged.
 * Otherwise the brigade is drained bucket by bucket:
 *  - EOS: with a parser, the parse is finished and the filter's own
 *    output brigade is placed in front of bb; the buffered start bytes
 *    are re-inserted via prepend_rbuf(); the filter is removed and the
 *    brigade passed on;
 *  - FLUSH: forwarded only once a parser exists, so that headers are not
 *    sent before the document has been sniffed;
 *  - data: until a parser exists, input is collected in ctxt->rbuf. Then
 *    the charset is obtained through xml2enc_charset (falling back to no
 *    encoding and a utf-8 content type), the optional xml2enc output
 *    filter is set up, the doctype is written, the libxml2 push parser is
 *    created and registered for cleanup, and parser options are applied.
 *    Data is fed to the parser with consume_buffer, with META handling
 *    through metafix() when cfg->metafix is set.
 * A failed bucket read is logged as an error.
 * NOTE(review): several conditions, the small-document bookkeeping on
 * rmin/rlen and the final return are not visible in this excerpt.
 */
914 static apr_status_t proxy_html_filter(ap_filter_t *f, apr_bucket_brigade *bb)
919 #ifndef USE_OLD_LIBXML2
920 int xmlopts = XML_PARSE_RECOVER | XML_PARSE_NONET |
921 XML_PARSE_NOBLANKS | XML_PARSE_NOERROR | XML_PARSE_NOWARNING;
924 saxctxt *ctxt = check_filter_init(f);
926 ap_remove_output_filter(f);
927 return ap_pass_brigade(f->next, bb);
930 while (!APR_BRIGADE_EMPTY(bb)) {
931 apr_bucket *b = APR_BRIGADE_FIRST(bb);
933 if (APR_BUCKET_IS_METADATA(b)) {
934 if (APR_BUCKET_IS_EOS(b)) {
935 if (ctxt->parser != NULL) {
936 consume_buffer(ctxt, "", 0, 1);
937 APR_BRIGADE_PREPEND(bb, ctxt->bb);
940 prepend_rbuf(ctxt, bb);
942 ap_remove_output_filter(f);
943 return ap_pass_brigade(f->next, bb);
945 else if (APR_BUCKET_IS_FLUSH(b)) {
946 /* pass on flush, except at start where it would cause
947 * headers to be sent before doc sniffing
949 if (ctxt->parser != NULL) {
950 ap_fflush(f->next, ctxt->bb);
951 apr_brigade_cleanup(ctxt->bb);
955 else if (apr_bucket_read(b, &buf, &bytes, APR_BLOCK_READ)
957 if (ctxt->parser == NULL) {
960 /* For documents smaller than four bytes, there is no reason to do
961 * HTML rewriting. The URL schema (i.e. 'http') needs four bytes alone.
962 * And the HTML parser needs at least four bytes to initialise correctly.
965 if (ctxt->rmin < sizeof(ctxt->rbuf)) {
966 memcpy(ctxt->rbuf + ctxt->rlen, buf, bytes);
968 apr_bucket_delete(b);
971 if (ctxt->rlen && ctxt->rlen < sizeof(ctxt->rbuf)) {
972 apr_size_t rem = sizeof(ctxt->rbuf) - ctxt->rlen;
973 memcpy(ctxt->rbuf + ctxt->rlen, buf, rem);
979 if (!xml2enc_charset ||
980 (xml2enc_charset(f->r, &enc, &cenc) != APR_SUCCESS)) {
981 if (!xml2enc_charset)
982 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r, APLOGNO(01422)
983 "No i18n support found. Install mod_xml2enc if required");
984 enc = XML_CHAR_ENCODING_NONE;
985 ap_set_content_type(f->r, "text/html;charset=utf-8");
988 /* if we wanted a non-default charset_out, insert the
989 * xml2enc filter now that we've sniffed it
991 if (ctxt->cfg->charset_out && xml2enc_filter) {
992 if (*ctxt->cfg->charset_out != '*')
993 cenc = ctxt->cfg->charset_out;
994 xml2enc_filter(f->r, cenc, ENCIO_OUTPUT);
995 ap_set_content_type(f->r,
996 apr_pstrcat(f->r->pool,
997 "text/html;charset=",
1000 else /* Normal case, everything worked, utf-8 output */
1001 ap_set_content_type(f->r, "text/html;charset=utf-8");
1004 ap_fputs(f->next, ctxt->bb, ctxt->cfg->doctype);
1007 ctxt->parser = htmlCreatePushParserCtxt(&sax, ctxt,
1013 ctxt->parser = htmlCreatePushParserCtxt(&sax, ctxt, buf, 4,
1018 if (ctxt->parser == NULL) {
1019 prepend_rbuf(ctxt, bb);
1020 ap_remove_output_filter(f);
1021 return ap_pass_brigade(f->next, bb);
1024 apr_pool_cleanup_register(f->r->pool, ctxt->parser,
1025 (int(*)(void*))htmlFreeParserCtxt,
1026 apr_pool_cleanup_null);
1027 #ifndef USE_OLD_LIBXML2
1028 if (xmlopts = xmlCtxtUseOptions(ctxt->parser, xmlopts), xmlopts)
1029 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r, APLOGNO(01423)
1030 "Unsupported parser opts %x", xmlopts);
1032 if (ctxt->cfg->metafix) {
1033 meta *m = metafix(f->r, buf, bytes);
1035 consume_buffer(ctxt, buf, m->start, 0);
1041 consume_buffer(ctxt, buf, bytes, 0);
1044 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, APLOGNO(01424)
1045 "Error in bucket read");
1048 apr_bucket_delete(b);
1050 #if 0 /* uncomment for debug */
1051 ap_fflush(f->next, ctxt->bb);
1052 apr_brigade_cleanup(ctxt->bb);
/* Create a per-directory configuration.
 * The structure is zero-filled from the pool and then given the default
 * doctype and the default end-tag style.
 * NOTE(review): any further defaults (for example the buffer size) and the
 * return statement are not visible in this excerpt.
 */
1057 static void *proxy_html_config(apr_pool_t *pool, char *x)
1059 proxy_html_conf *ret = apr_pcalloc(pool, sizeof(proxy_html_conf));
1060 ret->doctype = DEFAULT_DOCTYPE;
1061 ret->etag = DEFAULT_ETAG;
1063 /* ret->interp = 1; */
1064 /* don't initialise links and events until they get set/used */
/* Merge a base configuration with a more specific one (ADD).
 * links, events and charset_out come from ADD when set there, otherwise
 * from BASE. When both have a rule map, each rule of BASE and then of ADD
 * is copied and pushed onto the front of the new list; otherwise the one
 * existing map is used as is. doctype and etag come from ADD unless ADD
 * still holds the default value. bufsz always comes from ADD.
 * The boolean options depend on NORM_RESET in ADD's flags: when set, ADD's
 * values are taken alone (with the reset bit cleared from flags); the
 * other visible branch ORs BASE and ADD together.
 * NOTE(review): conf comes from apr_palloc, so it is not zeroed; the
 * initialisation of conf->map before the copy loops is not visible in
 * this excerpt - confirm it is set to NULL.
 */
1068 static void *proxy_html_merge(apr_pool_t *pool, void *BASE, void *ADD)
1070 proxy_html_conf *base = (proxy_html_conf *) BASE;
1071 proxy_html_conf *add = (proxy_html_conf *) ADD;
1072 proxy_html_conf *conf = apr_palloc(pool, sizeof(proxy_html_conf));
1074 /* don't merge declarations - just use the most specific */
1075 conf->links = (add->links == NULL) ? base->links : add->links;
1076 conf->events = (add->events == NULL) ? base->events : add->events;
1078 conf->charset_out = (add->charset_out == NULL)
1079 ? base->charset_out : add->charset_out;
1081 if (add->map && base->map) {
1084 for (a = base->map; a; a = a->next) {
1085 urlmap *save = conf->map;
1086 conf->map = apr_pmemdup(pool, a, sizeof(urlmap));
1087 conf->map->next = save;
1089 for (a = add->map; a; a = a->next) {
1090 urlmap *save = conf->map;
1091 conf->map = apr_pmemdup(pool, a, sizeof(urlmap));
1092 conf->map->next = save;
1096 conf->map = add->map ? add->map : base->map;
1098 conf->doctype = (add->doctype == DEFAULT_DOCTYPE)
1099 ? base->doctype : add->doctype;
1100 conf->etag = (add->etag == DEFAULT_ETAG) ? base->etag : add->etag;
1101 conf->bufsz = add->bufsz;
1102 if (add->flags & NORM_RESET) {
1103 conf->flags = add->flags ^ NORM_RESET;
1104 conf->metafix = add->metafix;
1105 conf->extfix = add->extfix;
1106 conf->interp = add->interp;
1107 conf->strip_comments = add->strip_comments;
1108 conf->enabled = add->enabled;
1111 conf->flags = base->flags | add->flags;
1112 conf->metafix = base->metafix | add->metafix;
1113 conf->extfix = base->extfix | add->extfix;
1114 conf->interp = base->interp | add->interp;
1115 conf->strip_comments = base->strip_comments | add->strip_comments;
1116 conf->enabled = add->enabled | base->enabled;
/* REGFLAG yields n when the flag string s exists and contains character c;
 * XREGFLAG yields n when s is absent or does NOT contain c.
 */
1120 #define REGFLAG(n,s,c) ((s&&(ap_strchr_c((s),(c))!=NULL)) ? (n) : 0)
1121 #define XREGFLAG(n,s,c) ((!s||(ap_strchr_c((s),(c))==NULL)) ? (n) : 0)
/* Fill in a urlmap from the words of a ProxyHTMLURLMap directive.
 * The rule flags are built from the flag letters: M_HTML, M_EVENTS and
 * M_CDATA are set unless 'h', 'e' or 'c' is present; the remaining bits
 * are set when their letter is present. The from-pattern is kept as a
 * plain string when it is to be interpolated or is not a regex; the other
 * visible path compiles it with ap_pregcomp using the regex flag letters
 * 'x', 'i', 'n' and 's'. A condition, if any, is first run through the
 * old_expr substitution for old-style syntax and then parsed with
 * ap_expr_parse_cmd; without a condition cond is NULL.
 * NOTE(review): in the negation-only case the memmove shifts
 * strlen(newcond)-1 bytes up by one; confirm against the full source that
 * this keeps the string terminated and within its allocation. The return
 * statement is not visible in this excerpt.
 */
1122 static const char *comp_urlmap(cmd_parms *cmd, urlmap *newmap,
1123 const char *from, const char *to,
1124 const char *flags, const char *cond)
1126 const char *err = NULL;
1128 = XREGFLAG(M_HTML,flags,'h')
1129 | XREGFLAG(M_EVENTS,flags,'e')
1130 | XREGFLAG(M_CDATA,flags,'c')
1131 | REGFLAG(M_ATSTART,flags,'^')
1132 | REGFLAG(M_ATEND,flags,'$')
1133 | REGFLAG(M_REGEX,flags,'R')
1134 | REGFLAG(M_LAST,flags,'L')
1135 | REGFLAG(M_NOTLAST,flags,'l')
1136 | REGFLAG(M_INTERPOLATE_TO,flags,'V')
1137 | REGFLAG(M_INTERPOLATE_FROM,flags,'v');
1139 if ((newmap->flags & M_INTERPOLATE_FROM) || !(newmap->flags & M_REGEX)) {
1140 newmap->from.c = from;
1145 = REGFLAG(AP_REG_EXTENDED,flags,'x')
1146 | REGFLAG(AP_REG_ICASE,flags,'i')
1147 | REGFLAG(AP_REG_NOSUB,flags,'n')
1148 | REGFLAG(AP_REG_NEWLINE,flags,'s');
1149 newmap->from.r = ap_pregcomp(cmd->pool, from, newmap->regflags);
1153 /* back-compatibility: support old-style ENV expressions
1154 * by converting to ap_expr syntax.
1156 * 1. var --> env(var)
1157 * 2. var=val --> env(var)=val
1158 * 3. !var --> !env(var)
1159 * 4. !var=val --> env(var)!=val
1161 char *newcond = NULL;
1162 if (ap_rxplus_exec(cmd->temp_pool, old_expr, cond, &newcond)) {
1163 /* we got a substitution. Check for the case (3) above
1164 * that the regexp gets wrong: a negation without a comparison.
1166 if ((cond[0] == '!') && !ap_strchr_c(cond, '=')) {
1167 memmove(newcond+1, newcond, strlen(newcond)-1);
1172 newmap->cond = ap_expr_parse_cmd(cmd, cond, 0, &err, NULL);
1175 newmap->cond = NULL;
/* Handler for the ProxyHTMLURLMap directive (raw arguments).
 * Splits args into from-pattern, to-pattern, optional flags and, when
 * flags are present, an optional condition, using ap_getword_conf. A new
 * urlmap is allocated from the command pool, appended after the last
 * entry of cfg->map, and completed by comp_urlmap(), whose result is
 * returned.
 * NOTE(review): the handling of an empty cfg->map and the returns for
 * missing words are not visible in this excerpt.
 */
1180 static const char *set_urlmap(cmd_parms *cmd, void *CFG, const char *args)
1182 proxy_html_conf *cfg = (proxy_html_conf *)CFG;
1184 apr_pool_t *pool = cmd->pool;
1187 "Usage: ProxyHTMLURLMap from-pattern to-pattern [flags] [cond]";
1191 const char *cond = NULL;
1193 if (from = ap_getword_conf(cmd->pool, &args), !from)
1195 if (to = ap_getword_conf(cmd->pool, &args), !to)
1197 flags = ap_getword_conf(cmd->pool, &args);
1198 if (flags && *flags)
1199 cond = ap_getword_conf(cmd->pool, &args);
1203 /* the args look OK, so let's use them */
1204 newmap = apr_palloc(pool, sizeof(urlmap));
1205 newmap->next = NULL;
1207 for (map = cfg->map; map->next; map = map->next);
1213 return comp_urlmap(cmd, newmap, from, to, flags, cond);
/* Handler for the ProxyHTMLDoctype directive (one or two arguments).
 * t selects the doctype, compared case-insensitively:
 *   "auto"  - the default doctype;
 *   "xhtml" - XHTML end tags, strict doctype, or transitional when l is
 *             "legacy";
 *   "html"  - HTML end tags, strict doctype, or transitional when l is
 *             "legacy";
 *   "html5" - HTML end tags and the HTML5 doctype.
 * In the remaining visible branch the end-tag style follows l: XHTML when
 * l starts with 'x' or 'X', HTML otherwise.
 * NOTE(review): what that last branch stores in cfg->doctype, and the
 * return value, are not visible in this excerpt.
 */
1216 static const char *set_doctype(cmd_parms *cmd, void *CFG,
1217 const char *t, const char *l)
1219 proxy_html_conf *cfg = (proxy_html_conf *)CFG;
1220 if (!strcasecmp(t, "auto")) {
1221 cfg->doctype = DEFAULT_DOCTYPE; /* activates pinternalSubset */
1223 else if (!strcasecmp(t, "xhtml")) {
1224 cfg->etag = xhtml_etag;
1225 if (l && !strcasecmp(l, "legacy"))
1226 cfg->doctype = fpi_xhtml_legacy;
1228 cfg->doctype = fpi_xhtml;
1230 else if (!strcasecmp(t, "html")) {
1231 cfg->etag = html_etag;
1232 if (l && !strcasecmp(l, "legacy"))
1233 cfg->doctype = fpi_html_legacy;
1235 cfg->doctype = fpi_html;
1237 else if (!strcasecmp(t, "html5")) {
1238 cfg->etag = html_etag;
1239 cfg->doctype = fpi_html5;
1243 if (l && ((l[0] == 'x') || (l[0] == 'X')))
1244 cfg->etag = xhtml_etag;
1246 cfg->etag = html_etag;
/* Handler for the ProxyHTMLFixups directive, called once per word.
 * "lowercase", "dospath" and "reset" (case-insensitive) set NORM_LC,
 * NORM_MSSLASH and NORM_RESET respectively in cfg->flags.
 * NOTE(review): the handling of an unrecognised word and the return value
 * are not visible in this excerpt.
 */
1251 static const char *set_flags(cmd_parms *cmd, void *CFG, const char *arg)
1253 proxy_html_conf *cfg = CFG;
1255 if (!strcasecmp(arg, "lowercase"))
1256 cfg->flags |= NORM_LC;
1257 else if (!strcasecmp(arg, "dospath"))
1258 cfg->flags |= NORM_MSSLASH;
1259 else if (!strcasecmp(arg, "reset"))
1260 cfg->flags |= NORM_RESET;
/* Handler for the ProxyHTMLEvents directive, called once per word.
 * The events array is created on first use (room for 20 tattr entries)
 * and a new entry is pushed for this word.
 * NOTE(review): the assignment of arg into the new entry and the return
 * value are not visible in this excerpt.
 */
1265 static const char *set_events(cmd_parms *cmd, void *CFG, const char *arg)
1268 proxy_html_conf *cfg = CFG;
1269 if (cfg->events == NULL)
1270 cfg->events = apr_array_make(cmd->pool, 20, sizeof(tattr));
1271 attr = apr_array_push(cfg->events);
/* Handler for the ProxyHTMLLinks directive: declares attribute att of
 * element elt as holding a link.
 * The links hash is created on first use; the per-element attribute array
 * is looked up by element name, created and stored in the hash when
 * needed, and a new entry is pushed onto it.
 * NOTE(review): the array is created with element size sizeof(tattr*),
 * while pstartElement() reads these arrays as tattr elements - confirm the
 * two sizes agree, or that sizeof(tattr) was intended. The assignment of
 * att into the new entry is not visible in this excerpt.
 */
1276 static const char *set_links(cmd_parms *cmd, void *CFG,
1277 const char *elt, const char *att)
1279 apr_array_header_t *attrs;
1281 proxy_html_conf *cfg = CFG;
1283 if (cfg->links == NULL)
1284 cfg->links = apr_hash_make(cmd->pool);
1286 attrs = apr_hash_get(cfg->links, elt, APR_HASH_KEY_STRING);
1288 attrs = apr_array_make(cmd->pool, 2, sizeof(tattr*));
1289 apr_hash_set(cfg->links, elt, APR_HASH_KEY_STRING, attrs);
1291 attr = apr_array_push(attrs);
/* Configuration directives of this module. Every directive is allowed in
 * server and directory context (RSRC_CONF|ACCESS_CONF). Directives with a
 * set_* handler are processed by the functions above; the others store
 * their argument directly into the named proxy_html_conf field through the
 * generic ap_set_*_slot handlers.
 */
1295 static const command_rec proxy_html_cmds[] = {
1296 AP_INIT_ITERATE("ProxyHTMLEvents", set_events, NULL,
1297 RSRC_CONF|ACCESS_CONF,
1298 "Strings to be treated as scripting events"),
1299 AP_INIT_ITERATE2("ProxyHTMLLinks", set_links, NULL,
1300 RSRC_CONF|ACCESS_CONF, "Declare HTML Attributes"),
1301 AP_INIT_RAW_ARGS("ProxyHTMLURLMap", set_urlmap, NULL,
1302 RSRC_CONF|ACCESS_CONF, "Map URL From To"),
1303 AP_INIT_TAKE12("ProxyHTMLDoctype", set_doctype, NULL,
1304 RSRC_CONF|ACCESS_CONF, "(HTML|XHTML) [Legacy]"),
1305 AP_INIT_ITERATE("ProxyHTMLFixups", set_flags, NULL,
1306 RSRC_CONF|ACCESS_CONF, "Options are lowercase, dospath"),
1307 AP_INIT_FLAG("ProxyHTMLMeta", ap_set_flag_slot,
1308 (void*)APR_OFFSETOF(proxy_html_conf, metafix),
1309 RSRC_CONF|ACCESS_CONF, "Fix META http-equiv elements"),
1310 AP_INIT_FLAG("ProxyHTMLInterp", ap_set_flag_slot,
1311 (void*)APR_OFFSETOF(proxy_html_conf, interp),
1312 RSRC_CONF|ACCESS_CONF,
1313 "Support interpolation and conditions in URLMaps"),
1314 AP_INIT_FLAG("ProxyHTMLExtended", ap_set_flag_slot,
1315 (void*)APR_OFFSETOF(proxy_html_conf, extfix),
1316 RSRC_CONF|ACCESS_CONF, "Map URLs in Javascript and CSS"),
1317 AP_INIT_FLAG("ProxyHTMLStripComments", ap_set_flag_slot,
1318 (void*)APR_OFFSETOF(proxy_html_conf, strip_comments),
1319 RSRC_CONF|ACCESS_CONF, "Strip out comments"),
1320 AP_INIT_TAKE1("ProxyHTMLBufSize", ap_set_int_slot,
1321 (void*)APR_OFFSETOF(proxy_html_conf, bufsz),
1322 RSRC_CONF|ACCESS_CONF, "Buffer size"),
1323 AP_INIT_TAKE1("ProxyHTMLCharsetOut", ap_set_string_slot,
1324 (void*)APR_OFFSETOF(proxy_html_conf, charset_out),
1325 RSRC_CONF|ACCESS_CONF, "Usage: ProxyHTMLCharsetOut charset"),
1326 AP_INIT_FLAG("ProxyHTMLEnable", ap_set_flag_slot,
1327 (void*)APR_OFFSETOF(proxy_html_conf, enabled),
1328 RSRC_CONF|ACCESS_CONF,
1329 "Enable proxy-html and xml2enc filters"),
/* pre_config hook (see proxy_html_hooks()): one-time setup of the globals.
 * Compiles the META http-equiv regex and the "content" search pattern
 * from pool p, clears the SAX handler table and installs the p* callbacks,
 * retrieves the optional mod_xml2enc functions (logging a notice when
 * xml2enc_charset is unavailable), and compiles the old_expr substitution
 * from pool p1.
 * NOTE(review): the return value is not visible in this excerpt.
 */
1332 static int mod_proxy_html(apr_pool_t *p, apr_pool_t *p1, apr_pool_t *p2)
1334 seek_meta = ap_pregcomp(p, "<meta[^>]*(http-equiv)[^>]*>",
1335 AP_REG_EXTENDED|AP_REG_ICASE);
1336 seek_content = apr_strmatch_precompile(p, "content", 0);
1337 memset(&sax, 0, sizeof(htmlSAXHandler));
1338 sax.startElement = pstartElement;
1339 sax.endElement = pendElement;
1340 sax.characters = pcharacters;
1341 sax.comment = pcomment;
1342 sax.cdataBlock = pcdata;
1343 sax.internalSubset = pinternalSubset;
1344 xml2enc_charset = APR_RETRIEVE_OPTIONAL_FN(xml2enc_charset);
1345 xml2enc_filter = APR_RETRIEVE_OPTIONAL_FN(xml2enc_filter);
1346 if (!xml2enc_charset) {
1347 ap_log_perror(APLOG_MARK, APLOG_NOTICE, 0, p2, APLOGNO(01425)
1348 "I18n support in mod_proxy_html requires mod_xml2enc. "
1349 "Without it, non-ASCII characters in proxied pages are "
1350 "likely to display incorrectly.");
1353 /* old_expr only needs to last the life of the config phase */
1354 old_expr = ap_rxplus_compile(p1, "s/^(!)?(\\w+)((=)(.+))?$/reqenv('$2')$1$4'$5'/");
/* insert_filter hook: adds the "proxy-html" output filter to the request
 * and asks mod_xml2enc to perform its input checks.
 * NOTE(review): the conditions guarding these two calls (presumably
 * cfg->enabled and a non-NULL xml2enc_filter) are not visible in this
 * excerpt - confirm against the full source.
 */
1357 static void proxy_html_insert(request_rec *r)
1359 proxy_html_conf *cfg;
1360 cfg = ap_get_module_config(r->per_dir_config, &proxy_html_module);
1363 xml2enc_filter(r, NULL, ENCIO_INPUT_CHECKS);
1364 ap_add_output_filter("proxy-html", NULL, r, r->connection);
/* Register the module with the server: the "proxy-html" output filter
 * (resource type, declared as changing the content and its length), the
 * pre_config hook that performs the one-time setup, and the insert_filter
 * hook, which names mod_filter.c as a successor.
 */
1367 static void proxy_html_hooks(apr_pool_t *p)
1369 static const char *aszSucc[] = { "mod_filter.c", NULL };
1370 ap_register_output_filter_protocol("proxy-html", proxy_html_filter,
1371 NULL, AP_FTYPE_RESOURCE,
1372 AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH);
1373 /* move this to pre_config so old_expr is available to interpret
1374 * old-style conditions on URL maps.
1376 ap_hook_pre_config(mod_proxy_html, NULL, NULL, APR_HOOK_MIDDLE);
1377 ap_hook_insert_filter(proxy_html_insert, NULL, aszSucc, APR_HOOK_MIDDLE);
1380 AP_DECLARE_MODULE(proxy_html) = {
1381 STANDARD20_MODULE_STUFF,