1 /* Copyright (c) 2003-11, WebThing Ltd
2 * Copyright (c) 2011-, The Apache Software Foundation
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 You can #define GO_FASTER to disable trace logging.
/* Trace-logging helpers. Both expand to code guarded by a variable named
 * `verbose`, which must be in scope wherever the macro is used (for example
 * dump_content() and pstartElement() declare `int verbose`).
 * VERBOSE(x) guards the single statement x; VERBOSEB(x) wraps x in its own
 * brace-delimited block.
 * NOTE(review): VERBOSE expands to a bare `if`, so an `else` written right
 * after a use of it would bind to the macro's `if`. */
28 #define VERBOSE(x) if (verbose) x
29 #define VERBOSEB(x) if (verbose) {x}
33 #include <libxml/HTMLparser.h>
35 #include "http_protocol.h"
36 #include "http_config.h"
38 #include "apr_strings.h"
40 #include "apr_strmatch.h"
43 #include "apr_optional.h"
44 #include "mod_xml2enc.h"
45 #include "http_request.h"
48 /* globals set once at startup */
/* old_expr: substitution pattern compiled in mod_proxy_html() and run by
 * comp_urlmap() over a rule's condition string. */
49 static ap_rxplus_t *old_expr;
/* seek_meta: regex compiled in mod_proxy_html(); metafix() uses it to find
 * meta elements that carry http-equiv. */
50 static ap_regex_t *seek_meta;
/* seek_content: precompiled pattern for the word "content", used by metafix(). */
51 static const apr_strmatch_pattern* seek_content;
/* Optional functions retrieved in mod_proxy_html(); each may remain NULL, and
 * proxy_html_filter() tests them before calling. */
52 static apr_status_t (*xml2enc_charset)(request_rec*, xmlCharEncoding*, const char**) = NULL;
53 static apr_status_t (*xml2enc_filter)(request_rec*, const char*, unsigned int) = NULL;
/* Forward declaration of this module's record, needed by the
 * ap_get_module_config() calls further down. */
55 module AP_MODULE_DECLARE_DATA proxy_html_module;
/* Bits of a urlmap rule's `flags` word (only part of the set is visible in
 * this listing). comp_urlmap() sets each bit when the matching character is
 * present in the rule's flags string:
 *   '^' M_ATSTART, 'l' M_NOTLAST, 'V' M_INTERPOLATE_TO, 'v' M_INTERPOLATE_FROM. */
61 #define M_ATSTART 0x10
64 #define M_NOTLAST 0x80
65 #define M_INTERPOLATE_TO 0x100
66 #define M_INTERPOLATE_FROM 0x200
75 typedef struct urlmap {
78 unsigned int regflags;
93 apr_array_header_t *events;
94 const char *charset_out;
103 proxy_html_conf *cfg;
104 htmlParserCtxtPtr parser;
105 apr_bucket_brigade *bb;
109 const char *encoding;
/* Bits of proxy_html_conf.flags set by set_flags(): "dospath" sets
 * NORM_MSSLASH (tested in normalise()), "reset" sets NORM_RESET (tested in
 * proxy_html_merge()). */
116 #define NORM_MSSLASH 0x2
117 #define NORM_RESET 0x4
/* SAX callback table handed to the libxml2 push parser; cleared and filled in
 * by mod_proxy_html(). */
118 static htmlSAXHandler sax;

/* How pstartElement() treats an attribute value: left alone, rewritten as a
 * URI, or rewritten as a scripting event. */
120 typedef enum { ATTR_IGNORE, ATTR_URI, ATTR_EVENT } rewrite_t;

/* Doctype declarations selectable through set_doctype(). The rest of the file
 * compares cfg->doctype against these objects by pointer, not by content. */
122 static const char *const fpi_html =
123 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n";
124 static const char *const fpi_html_legacy =
125 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n";
126 static const char *const fpi_xhtml =
127 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n";
128 static const char *const fpi_xhtml_legacy =
129 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n";
130 static const char *const fpi_html5 = "<!DOCTYPE html>\n";
/* Text that closes the start tag of an empty element (see the end of
 * pstartElement()): HTML style and XHTML style. */
131 static const char *const html_etag = ">";
132 static const char *const xhtml_etag = " />";
133 /*#define DEFAULT_DOCTYPE fpi_html */
/* The default doctype is an empty string; pinternalSubset() only acts while
 * cfg->doctype still points at this object. */
134 static const char *const DEFAULT_DOCTYPE = "";
135 #define DEFAULT_ETAG html_etag
/* Normalise the NUL-terminated string str according to the NORM_* bits in
 * flags. Visible here: one pass over every character of str, and, when
 * NORM_MSSLASH is set, a walk over each backslash located with ap_strchr().
 * NOTE(review): the loop bodies are not part of this listing; str is
 * non-const, so presumably it is edited in place - confirm. */
137 static void normalise(unsigned int flags, char *str)
141 for (p = str; *p; ++p)
145 if (flags & NORM_MSSLASH)
146 for (p = ap_strchr(str, '\\'); p; p = ap_strchr(p+1, '\\'))
/* Feed `bytes` bytes starting at inbuf to the libxml2 push parser held in
 * ctx->parser; `flag` is passed through as htmlParseChunk()'s last argument. */
150 #define consume_buffer(ctx,inbuf,bytes,flag) \
151 htmlParseChunk(ctx->parser, inbuf, bytes, flag)

/* Write `bytes` bytes from inbuf into the brigade ctx->bb for the next filter.
 * NOTE(review): the `flush` argument is never used, and the expansion already
 * ends in ';', so `AP_fwrite(...);` yields an extra empty statement. */
153 #define AP_fwrite(ctx,inbuf,bytes,flush) \
154 ap_fwrite(ctx->f->next, ctx->bb, inbuf, bytes);

156 /* This is always utf-8 on entry. We can convert charset within FLUSH */
/* FLUSH writes chars[begin..i) and then moves `begin` to i+1, i.e. past the
 * character at index i. It expands to two statements and relies on locals
 * named ctx, chars, begin and i at the point of use. */
157 #define FLUSH AP_fwrite(ctx, (chars+begin), (i-begin), 0); begin = i+1
/* SAX "characters" callback (installed as sax.characters); pstartElement()
 * also calls it to HTML-escape attribute values.
 *   ctxt   - the saxctxt for this request
 *   uchars - text to emit (not necessarily NUL-terminated)
 *   length - number of bytes at uchars
 * Text is written to ctx->bb. Each of & < > " is replaced by its entity
 * reference: FLUSH emits the literal run before the special character and
 * skips the character itself, then the entity is written in its place.
 * Fix: the replacement strings had been reduced to the raw characters (and
 * the quote case to an unparseable literal), so nothing was escaped. */
158 static void pcharacters(void *ctxt, const xmlChar *uchars, int length)
160 const char *chars = (const char*) uchars;
161 saxctxt *ctx = (saxctxt*) ctxt;
164 for (begin=i=0; i<length; i++) {
166 case '&' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, "&amp;"); break;
167 case '<' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, "&lt;"); break;
168 case '>' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, "&gt;"); break;
169 case '"' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, "&quot;"); break;
/* Make sure ctx->buf has at least `len` free bytes beyond ctx->offset.
 * When it does not, the capacity ctx->avail is raised in steps of
 * ctx->cfg->bufsz until the request fits and the buffer is realloc()ed.
 * If realloc() hands back a different pointer, the pool cleanup that frees
 * the buffer is re-pointed from the old address to the new one.
 * NOTE(review): in the visible lines the realloc() result is not checked for
 * NULL, and a bufsz of 0 would keep the while loop from ever ending unless
 * that is prevented elsewhere. */
176 static void preserve(saxctxt *ctx, const size_t len)
179 if (len <= (ctx->avail - ctx->offset))
181 else while (len > (ctx->avail - ctx->offset))
182 ctx->avail += ctx->cfg->bufsz;
184 newbuf = realloc(ctx->buf, ctx->avail);
185 if (newbuf != ctx->buf) {
187 apr_pool_cleanup_kill(ctx->f->r->pool, ctx->buf,
188 (int(*)(void*))free);
189 apr_pool_cleanup_register(ctx->f->r->pool, newbuf,
190 (int(*)(void*))free, apr_pool_cleanup_null);
/* Copy `len` bytes from buf into ctx->buf at position ctx->offset.
 * NOTE(review): the lines around the memcpy are not part of this listing;
 * presumably room is reserved with preserve() first and ctx->offset is
 * advanced afterwards - confirm. */
195 static void pappend(saxctxt *ctx, const char *buf, const size_t len)
198 memcpy(ctx->buf+ctx->offset, buf, len);
/* Rewrite the text accumulated in ctx->buf using the rules in ctx->map, then
 * write the result to the output brigade.
 * The buffer is first NUL-terminated so the str*() calls can be used. Each
 * rule is tested for the M_CDATA flag. A rule is then handled in one of two
 * ways:
 *   - regex rules (M_REGEX): ap_regexec() is run repeatedly from offset
 *     `offs`; every hit is replaced by the ap_pregsub() expansion of m->to;
 *   - literal rules: strstr() finds each occurrence of m->from.c, subject to
 *     the M_ATSTART / M_ATEND position tests.
 * In both cases the preserve/memmove/memcpy sequence is the one that grows
 * the buffer before opening a gap for the new text, and the plain
 * memcpy/memmove sequence writes the replacement and closes the gap.
 * NOTE(review): lines are missing from this listing (loop `continue`s, the
 * size comparison choosing between the two sequences, offset updates). */
202 static void dump_content(saxctxt *ctx)
210 ap_regmatch_t pmatch[10];
213 urlmap *themap = ctx->map;
215 int verbose = APLOGrtrace1(ctx->f->r);
218 pappend(ctx, &c, 1); /* append null byte */
219 /* parse the text for URLs */
220 for (m = themap; m; m = m->next) {
221 if (!(m->flags & M_CDATA))
223 if (m->flags & M_REGEX) {
226 while (!ap_regexec(m->from.r, ctx->buf+offs, nmatch, pmatch, 0)) {
227 match = pmatch[0].rm_so;
228 s_from = pmatch[0].rm_eo - match;
229 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
232 len = strlen(ctx->buf);
235 const char *f = apr_pstrndup(ctx->f->r->pool,
236 ctx->buf + offs, s_from);
237 ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, ctx->f->r,
238 "C/RX: match at %s, substituting %s", f, subs);
241 preserve(ctx, s_to - s_from);
242 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
243 len + 1 - s_from - offs);
244 memcpy(ctx->buf+offs, subs, s_to);
247 memcpy(ctx->buf + offs, subs, s_to);
248 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
249 len + 1 - s_from - offs);
255 s_from = strlen(m->from.c);
256 s_to = strlen(m->to);
257 for (found = strstr(ctx->buf, m->from.c); found;
258 found = strstr(ctx->buf+match+s_to, m->from.c)) {
259 match = found - ctx->buf;
260 if ((m->flags & M_ATSTART) && (match != 0))
262 len = strlen(ctx->buf);
263 if ((m->flags & M_ATEND) && (match < (len - s_from)))
265 VERBOSE(ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, ctx->f->r,
266 "C: matched %s, substituting %s",
269 preserve(ctx, s_to - s_from);
270 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
271 len + 1 - s_from - match);
272 memcpy(ctx->buf+match, m->to, s_to);
275 memcpy(ctx->buf+match, m->to, s_to);
276 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
277 len + 1 - s_from - match);
/* All rules applied: emit the rewritten buffer. */
282 AP_fwrite(ctx, ctx->buf, strlen(ctx->buf), 1);
/* SAX "internalSubset" callback (installed as sax.internalSubset): echoes the
 * document's own DOCTYPE when no doctype was configured.
 *   ctxt       - the saxctxt for this request
 *   name       - root element name from the DOCTYPE
 *   externalID - public identifier
 *   sysID      - system identifier
 * It only acts while cfg->doctype is still DEFAULT_DOCTYPE (pointer compare).
 * It then writes "<!DOCTYPE name", an optional PUBLIC id and system id, and
 * ">" plus newline. ctx->etag becomes xhtml_etag when name is "html" and the
 * public id starts with "-//W3C//DTD XHTML " (both compared without case),
 * otherwise html_etag.
 * NOTE(review): the guards around the externalID / sysID uses are not part
 * of this listing. */
284 static void pinternalSubset(void* ctxt, const xmlChar *name,
285 const xmlChar *externalID, const xmlChar *sysID)
287 saxctxt* ctx = (saxctxt*) ctxt;
288 if (!ctxt || !name) {
292 if (ctx->cfg->doctype != DEFAULT_DOCTYPE) {
293 /* do nothing if overridden in config */
296 ap_fputstrs(ctx->f->next, ctx->bb, "<!DOCTYPE ", (const char *)name, NULL);
298 if (!ap_cstr_casecmp((const char*)name, "html") &&
299 !ap_cstr_casecmpn((const char *)externalID, "-//W3C//DTD XHTML ", 18)) {
300 ctx->etag = xhtml_etag;
303 ctx->etag = html_etag;
305 ap_fputstrs(ctx->f->next, ctx->bb, " PUBLIC \"", (const char *)externalID, "\"", NULL);
307 ap_fputstrs(ctx->f->next, ctx->bb, " \"", (const char *)sysID, "\"", NULL);
309 ap_fputs(ctx->f->next, ctx->bb, ">\n");
/* SAX "cdataBlock" callback (installed as sax.cdataBlock).
 *   ctxt   - the saxctxt for this request
 *   uchars - CDATA text, `length` bytes long
 * With cfg->extfix set the text is appended to ctx->buf via pappend();
 * the other visible path writes it straight to the output brigade. */
311 static void pcdata(void *ctxt, const xmlChar *uchars, int length)
313 const char *chars = (const char*) uchars;
314 saxctxt *ctx = (saxctxt*) ctxt;
315 if (ctx->cfg->extfix) {
316 pappend(ctx, chars, length);
319 /* not sure if this should force-flush
320 * (i.e. can one cdata section come in multiple calls?)
322 AP_fwrite(ctx, chars, length, 0);
/* SAX "comment" callback (installed as sax.comment).
 *   ctxt   - the saxctxt for this request
 *   uchars - NUL-terminated comment text without the comment delimiters
 * cfg->strip_comments is tested first (its branch body is not visible here;
 * presumably the comment is dropped). With cfg->extfix the comment, wrapped
 * in its delimiters again, is appended to ctx->buf; the other visible path
 * writes the same three pieces directly to the output brigade. */
325 static void pcomment(void *ctxt, const xmlChar *uchars)
327 const char *chars = (const char*) uchars;
328 saxctxt *ctx = (saxctxt*) ctxt;
329 if (ctx->cfg->strip_comments)
332 if (ctx->cfg->extfix) {
333 pappend(ctx, "<!--", 4);
334 pappend(ctx, chars, strlen(chars));
335 pappend(ctx, "-->", 3);
338 ap_fputs(ctx->f->next, ctx->bb, "<!--");
339 AP_fwrite(ctx, chars, strlen(chars), 1);
340 ap_fputs(ctx->f->next, ctx->bb, "-->");
/* SAX "endElement" callback (installed as sax.endElement).
 *   ctxt  - the saxctxt for this request
 *   uname - element name
 * The element is looked up with htmlTagLookup(). Under the strict doctypes
 * (fpi_html, fpi_xhtml) unknown or deprecated elements are singled out, and
 * under the legacy doctypes a separate branch applies; what those branches
 * do is not visible in this listing. When ctx->offset is non-zero the
 * buffered content is handled and the offset is reset so the buffer can be
 * reused. The closing tag is written unless the element is known and
 * declared empty. */
344 static void pendElement(void *ctxt, const xmlChar *uname)
346 saxctxt *ctx = (saxctxt*) ctxt;
347 const char *name = (const char*) uname;
348 const htmlElemDesc* desc = htmlTagLookup(uname);
350 if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) {
352 if (!desc || desc->depr)
356 else if ((ctx->cfg->doctype == fpi_html_legacy)
357 || (ctx->cfg->doctype == fpi_xhtml_legacy)) {
358 /* enforce html legacy */
362 /* TODO - implement HTML "allowed here" using the stack */
363 /* nah. Keeping the stack is too much overhead */
365 if (ctx->offset > 0) {
367 ctx->offset = 0; /* having dumped it, we can re-use the memory */
369 if (!desc || !desc->empty) {
370 ap_fprintf(ctx->f->next, ctx->bb, "</%s>", name);
/* SAX "startElement" callback (installed as sax.startElement).
 *   ctxt   - the saxctxt for this request
 *   uname  - element name
 *   uattrs - attributes as a NULL-terminated list of name/value pairs
 * Outline of the visible logic:
 *   1. Look the element up with htmlTagLookup() and derive an enforcement
 *      level from the configured doctype; bogus or deprecated elements are
 *      logged as dropped.
 *   2. Write "<name".
 *   3. For each attribute: when enforcing, check it with htmlAttrAllowed();
 *      copy the value into ctx->buf; classify it as a URI (listed for this
 *      element in cfg->links) or, with cfg->extfix, as a scripting event
 *      (listed in cfg->events); apply the M_HTML rules to URIs and the
 *      M_EVENTS rules to events; normalise; write the attribute with its
 *      value escaped by pcharacters().
 *   4. Close the tag with ctx->etag for empty elements, '>' otherwise, and
 *      log any required attributes that were not seen.
 * Fix: the second doctype test repeated the strict pair (fpi_html, fpi_xhtml)
 * already handled by the first test, so the "legacy" branch could never run.
 * It now tests the legacy doctypes, matching pendElement().
 * NOTE(review): many lines of this function are not part of this listing. */
374 static void pstartElement(void *ctxt, const xmlChar *uname,
375 const xmlChar** uattrs)
384 size_t s_to, s_from, match;
386 saxctxt *ctx = (saxctxt*) ctxt;
388 ap_regmatch_t pmatch[10];
390 int verbose = APLOGrtrace1(ctx->f->r);
392 apr_array_header_t *linkattrs;
394 const char *name = (const char*) uname;
395 const char** attrs = (const char**) uattrs;
396 const htmlElemDesc* desc = htmlTagLookup(uname);
397 urlmap *themap = ctx->map;
402 if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) {
405 if (!desc || desc->depr)
/* Transitional doctypes get the legacy level of enforcement. */
409 else if ((ctx->cfg->doctype == fpi_html_legacy)
410 || (ctx->cfg->doctype == fpi_xhtml_legacy)) {
412 /* enforce html legacy */
417 if (!desc && enforce) {
418 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01416)
419 "Bogus HTML element %s dropped", name);
422 if (desc && desc->depr && (enforce == 2)) {
423 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01417)
424 "Deprecated HTML element %s dropped", name);
428 descp = apr_array_push(ctx->stack);
430 /* TODO - implement HTML "allowed here" */
433 ap_fputc(ctx->f->next, ctx->bb, '<');
434 ap_fputs(ctx->f->next, ctx->bb, name);
437 if ((enforce > 0) && (desc != NULL) && (desc->attrs_req != NULL))
438 for (a = desc->attrs_req; *a; a++)
442 linkattrs = apr_hash_get(ctx->cfg->links, name, APR_HASH_KEY_STRING);
443 for (a = attrs; *a; a += 2) {
444 if (desc && enforce > 0) {
445 switch (htmlAttrAllowed(desc, (xmlChar*)*a, 2-enforce)) {
447 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01418)
448 "Bogus HTML attribute %s of %s dropped",
451 case HTML_DEPRECATED:
452 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01419)
453 "Deprecated HTML attribute %s of %s dropped",
457 required_attrs--; /* cross off the number still needed */
458 /* fallthrough - required implies valid */
465 pappend(ctx, a[1], strlen(a[1])+1);
466 is_uri = ATTR_IGNORE;
468 tattr *attrs = (tattr*) linkattrs->elts;
469 for (i=0; i < linkattrs->nelts; ++i) {
470 if (!strcmp(*a, attrs[i].val)) {
476 if ((is_uri == ATTR_IGNORE) && ctx->cfg->extfix
477 && (ctx->cfg->events != NULL)) {
478 for (i=0; i < ctx->cfg->events->nelts; ++i) {
479 tattr *attrs = (tattr*) ctx->cfg->events->elts;
480 if (!strcmp(*a, attrs[i].val)) {
489 for (m = themap; m; m = m->next) {
490 if (!(m->flags & M_HTML))
492 if (m->flags & M_REGEX) {
494 if (!ap_regexec(m->from.r, ctx->buf, nmatch,
497 offs = match = pmatch[0].rm_so;
498 s_from = pmatch[0].rm_eo - match;
499 subs = ap_pregsub(ctx->f->r->pool, m->to,
500 ctx->buf, nmatch, pmatch);
503 f = apr_pstrndup(ctx->f->r->pool,
504 ctx->buf + offs, s_from);
505 ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0,
507 "H/RX: match at %s, substituting %s",
511 len = strlen(ctx->buf);
513 preserve(ctx, s_to - s_from);
514 memmove(ctx->buf+offs+s_to,
515 ctx->buf+offs+s_from,
516 len + 1 - s_from - offs);
517 memcpy(ctx->buf+offs, subs, s_to);
520 memcpy(ctx->buf + offs, subs, s_to);
521 memmove(ctx->buf+offs+s_to,
522 ctx->buf+offs+s_from,
523 len + 1 - s_from - offs);
527 s_from = strlen(m->from.c);
528 if (!strncasecmp(ctx->buf, m->from.c, s_from)) {
530 s_to = strlen(m->to);
531 len = strlen(ctx->buf);
532 VERBOSE(ap_log_rerror(APLOG_MARK, APLOG_TRACE3,
534 "H: matched %s, substituting %s",
537 preserve(ctx, s_to - s_from);
538 memmove(ctx->buf+s_to, ctx->buf+s_from,
540 memcpy(ctx->buf, m->to, s_to);
542 else { /* it fits in the existing space */
543 memcpy(ctx->buf, m->to, s_to);
544 memmove(ctx->buf+s_to, ctx->buf+s_from,
550 /* URIs only want one match unless overridden in the config */
551 if ((num_match > 0) && !(m->flags & M_NOTLAST))
556 for (m = themap; m; m = m->next) {
557 num_match = 0; /* reset here since we're working per-rule */
558 if (!(m->flags & M_EVENTS))
560 if (m->flags & M_REGEX) {
563 while (!ap_regexec(m->from.r, ctx->buf+offs,
564 nmatch, pmatch, 0)) {
565 match = pmatch[0].rm_so;
566 s_from = pmatch[0].rm_eo - match;
567 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
571 f = apr_pstrndup(ctx->f->r->pool,
572 ctx->buf + offs, s_from);
573 ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0,
575 "E/RX: match at %s, substituting %s",
580 len = strlen(ctx->buf);
582 preserve(ctx, s_to - s_from);
583 memmove(ctx->buf+offs+s_to,
584 ctx->buf+offs+s_from,
585 len + 1 - s_from - offs);
586 memcpy(ctx->buf+offs, subs, s_to);
589 memcpy(ctx->buf + offs, subs, s_to);
590 memmove(ctx->buf+offs+s_to,
591 ctx->buf+offs+s_from,
592 len + 1 - s_from - offs);
599 found = strstr(ctx->buf, m->from.c);
600 if ((m->flags & M_ATSTART) && (found != ctx->buf))
603 s_from = strlen(m->from.c);
604 s_to = strlen(m->to);
605 match = found - ctx->buf;
606 if ((s_from < strlen(found))
607 && (m->flags & M_ATEND)) {
608 found = strstr(ctx->buf+match+s_from,
613 found = strstr(ctx->buf+match+s_to,
616 VERBOSE(ap_log_rerror(APLOG_MARK, APLOG_TRACE3,
618 "E: matched %s, substituting %s",
620 len = strlen(ctx->buf);
622 preserve(ctx, s_to - s_from);
623 memmove(ctx->buf+match+s_to,
624 ctx->buf+match+s_from,
625 len + 1 - s_from - match);
626 memcpy(ctx->buf+match, m->to, s_to);
629 memcpy(ctx->buf+match, m->to, s_to);
630 memmove(ctx->buf+match+s_to,
631 ctx->buf+match+s_from,
632 len + 1 - s_from - match);
637 if (num_match && (m->flags & M_LAST))
646 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], NULL);
649 if (ctx->cfg->flags != 0)
650 normalise(ctx->cfg->flags, ctx->buf);
652 /* write the attribute, using pcharacters to html-escape
653 anything that needs it in the value.
655 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], "=\"", NULL);
656 pcharacters(ctx, (const xmlChar*)ctx->buf, strlen(ctx->buf));
657 ap_fputc(ctx->f->next, ctx->bb, '"');
662 if (desc && desc->empty)
663 ap_fputs(ctx->f->next, ctx->bb, ctx->etag);
665 ap_fputc(ctx->f->next, ctx->bb, '>');
667 if ((enforce > 0) && (required_attrs > 0)) {
668 /* if there are more required attributes than we found then complain */
669 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01420)
670 "HTML element %s is missing %d required attributes",
671 name, required_attrs);
/* Scan the start of a document for meta http-equiv elements and turn them
 * into real response headers.
 *   r   - the request whose headers_out table receives the headers
 *   buf - NUL-terminated document text
 * Each seek_meta match yields a header name (the first alphabetic run after
 * "http-equiv") and, located via seek_content, the quoted or bare content
 * value. A header/content pair is stored with apr_table_setn(). For a
 * Content-Type header a meta record is allocated that holds the start and
 * end offsets of the element within buf.
 * Returns that record; NOTE(review): the initial value of `ret` and the
 * return statement are not part of this listing, presumably NULL when no
 * Content-Type meta was found.
 * NOTE(review): the skip loop right after the http-equiv match advances
 * until an alphabetic character without testing for the terminating NUL. */
675 static meta *metafix(request_rec *r, const char *buf)
683 ap_regmatch_t pmatch[2];
686 while (!ap_regexec(seek_meta, buf+offs, 2, pmatch, 0)) {
689 p = buf+offs+pmatch[1].rm_eo;
690 while (!apr_isalpha(*++p));
691 for (q = p; apr_isalnum(*q) || (*q == '-'); ++q);
692 header = apr_pstrndup(r->pool, p, q-p);
693 if (ap_cstr_casecmpn(header, "Content-", 8)) {
694 /* find content=... string */
695 p = apr_strmatch(seek_content, buf+offs+pmatch[0].rm_so,
696 pmatch[0].rm_eo - pmatch[0].rm_so);
697 /* if it doesn't contain "content", ignore, don't crash! */
701 while (apr_isspace(*p))
703 /* XXX Should we search for another content= pattern? */
706 while (*p && apr_isspace(*++p));
707 if ((*p == '\'') || (*p == '"')) {
709 for (q = p; *q && *q != delim; ++q);
710 /* No terminating delimiter found? Skip the bogus directive */
714 for (q = p; *q && !apr_isspace(*q) && (*q != '>'); ++q);
716 content = apr_pstrndup(r->pool, p, q-p);
721 else if (!ap_cstr_casecmpn(header, "Content-Type", 12)) {
722 ret = apr_palloc(r->pool, sizeof(meta));
723 ret->start = offs+pmatch[0].rm_so;
724 ret->end = offs+pmatch[0].rm_eo;
726 if (header && content) {
728 ap_log_rerror(APLOG_MARK, APLOG_TRACE2, 0, r,
729 "Adding header [%s: %s] from HTML META",
732 apr_table_setn(r->headers_out, header, content);
734 offs += pmatch[0].rm_eo;
/* Expand ${var} and ${var|default} references in str.
 *   r   - request; values come from r->subprocess_env and all new strings
 *         are allocated from r->pool
 *   str - text to expand
 * Each reference runs from "${" to the next '}'. The variable name is the
 * text up to '}' or up to a '|' delimiter. The replacement is looked up in
 * r->subprocess_env, and the text after '|' can serve as the replacement
 * instead. The string is rebuilt as before + replacement + after, and each
 * substitution is logged at trace level 1.
 * NOTE(review): the loop structure, the tests choosing between the two
 * forms, and the return statement are not part of this listing. */
739 static const char *interpolate_vars(request_rec *r, const char *str)
746 const char *replacement;
750 if (start = ap_strstr_c(start, "${"), start == NULL)
753 if (end = ap_strchr_c(start+2, '}'), end == NULL)
756 delim = ap_strchr_c(start, '|');
757 before = apr_pstrndup(r->pool, str, start-str);
760 var = apr_pstrndup(r->pool, start+2, delim-start-2);
763 var = apr_pstrndup(r->pool, start+2, end-start-2);
765 replacement = apr_table_get(r->subprocess_env, var);
768 replacement = apr_pstrndup(r->pool, delim+1, end-delim-1);
772 str = apr_pstrcat(r->pool, before, replacement, after, NULL);
773 ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, r,
774 "Interpolating %s => %s", var, replacement);
/* Build the per-request rule list from the configured rules in ctx->cfg->map.
 * For each configured rule:
 *   - when it has a condition, evaluate it with ap_expr_exec(); an
 *     evaluation error is logged, and an unsatisfied condition skips the rule;
 *   - duplicate the rule into the request pool;
 *   - with M_INTERPOLATE_FROM, expand variables in the from-pattern, skip
 *     the rule if the result is empty, and recompile it for M_REGEX rules;
 *   - with M_INTERPOLATE_TO, expand variables in the replacement.
 * NOTE(review): how the copies are linked into ctx->map is not visible in
 * this listing. */
778 static void fixup_rules(saxctxt *ctx)
783 request_rec *r = ctx->f->r;
785 for (p = ctx->cfg->map; p; p = p->next) {
786 if (p->cond != NULL) {
788 int ok = ap_expr_exec(r, p->cond, &err);
790 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(01421)
791 "Error evaluating expr: %s", err);
794 continue; /* condition is unsatisfied */
798 newp = apr_pmemdup(r->pool, p, sizeof(urlmap));
800 if (newp->flags & M_INTERPOLATE_FROM) {
801 newp->from.c = interpolate_vars(r, newp->from.c);
802 if (!newp->from.c || !*newp->from.c)
803 continue; /* don't use empty from-pattern */
804 if (newp->flags & M_REGEX) {
805 newp->from.r = ap_pregcomp(r->pool, newp->from.c,
809 if (newp->flags & M_INTERPOLATE_TO) {
810 newp->to = interpolate_vars(r, newp->to);
812 /* evaluate p->cond; continue if unsatisfied */
813 /* create new urlmap with memcpy and append to map */
814 /* interpolate from if flagged to do so */
815 /* interpolate to if flagged to do so */
/* Decide whether the filter applies to this response and set up its
 * per-request context.
 *   f - this filter instance
 * The module config and the PROXY_HTML_FORCE environment variable are read
 * first. A reason to bail out is recorded for: a non-proxy request, a
 * missing content type, a content type that is neither text/html nor
 * application/xhtml+xml, and a configuration without links. The reason is
 * logged at trace level 1 and the filter removes itself.
 * Otherwise a zeroed saxctxt is stored in f->ctx with an output brigade and
 * the configured etag and rule map, and Content-Length is unset from the
 * response headers.
 * NOTE(review): the return statements and the use made of `force` are not
 * part of this listing. */
828 static saxctxt *check_filter_init (ap_filter_t *f)
832 proxy_html_conf *cfg;
834 const char *errmsg = NULL;
835 cfg = ap_get_module_config(f->r->per_dir_config, &proxy_html_module);
836 force = apr_table_get(f->r->subprocess_env, "PROXY_HTML_FORCE");
839 if (!f->r->proxyreq) {
840 errmsg = "Non-proxy request; not inserting proxy-html filter";
842 else if (!f->r->content_type) {
843 errmsg = "No content-type; bailing out of proxy-html filter";
845 else if (ap_cstr_casecmpn(f->r->content_type, "text/html", 9) &&
846 ap_cstr_casecmpn(f->r->content_type,
847 "application/xhtml+xml", 21)) {
848 errmsg = "Non-HTML content; not inserting proxy-html filter";
852 errmsg = "No links configured: nothing for proxy-html filter to do";
857 ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, f->r, "%s", errmsg);
859 ap_remove_output_filter(f);
863 fctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(saxctxt));
865 fctx->bb = apr_brigade_create(f->r->pool,
866 f->r->connection->bucket_alloc);
868 fctx->etag = cfg->etag;
869 apr_table_unset(f->r->headers_out, "Content-Length");
874 fctx->map = cfg->map;
875 /* defer dealing with charset_out until after sniffing charset_in
876 * so we can support setting one to t'other.
/* The "proxy-html" output filter: parses the response body with the libxml2
 * push parser and emits the rewritten markup into ctxt->bb.
 *   f  - this filter instance
 *   bb - incoming brigade
 * Per bucket, as far as visible here:
 *   - EOS: finish the parse (consume_buffer with a terminate flag of 1),
 *     append an EOS bucket to ctxt->bb and pass it on;
 *   - FLUSH: forwarded with ap_fflush() only once a parser exists;
 *   - data, first time (no parser yet): a body under four bytes followed by
 *     EOS makes the filter remove itself and pass the brigade through
 *     untouched. Otherwise the charset is sniffed through xml2enc_charset
 *     (with a UTF-8 fallback), an output-charset filter is optionally
 *     inserted, the configured doctype is written, the push parser is
 *     created with a pool cleanup, parser options are applied, and with
 *     cfg->metafix the text around the meta element found by metafix() is
 *     fed to the parser;
 *   - data, later: fed to the parser unchanged.
 * A failed bucket read is logged. The incoming brigade is cleaned up at the
 * end.
 * NOTE(review): the status returned by ap_pass_brigade() in the EOS branch
 * is not used in the visible line; the function's own return statement is
 * not part of this listing. */
882 static apr_status_t proxy_html_filter(ap_filter_t *f, apr_bucket_brigade *bb)
888 apr_size_t bytes = 0;
889 #ifndef USE_OLD_LIBXML2
890 int xmlopts = XML_PARSE_RECOVER | XML_PARSE_NONET |
891 XML_PARSE_NOBLANKS | XML_PARSE_NOERROR | XML_PARSE_NOWARNING;
894 saxctxt *ctxt = check_filter_init(f);
896 return ap_pass_brigade(f->next, bb);
897 for (b = APR_BRIGADE_FIRST(bb);
898 b != APR_BRIGADE_SENTINEL(bb);
899 b = APR_BUCKET_NEXT(b)) {
900 if (APR_BUCKET_IS_METADATA(b)) {
901 if (APR_BUCKET_IS_EOS(b)) {
902 if (ctxt->parser != NULL) {
903 consume_buffer(ctxt, buf, 0, 1);
905 APR_BRIGADE_INSERT_TAIL(ctxt->bb,
906 apr_bucket_eos_create(ctxt->bb->bucket_alloc));
907 ap_pass_brigade(ctxt->f->next, ctxt->bb);
909 else if (APR_BUCKET_IS_FLUSH(b)) {
910 /* pass on flush, except at start where it would cause
911 * headers to be sent before doc sniffing
913 if (ctxt->parser != NULL) {
914 ap_fflush(ctxt->f->next, ctxt->bb);
918 else if (apr_bucket_read(b, &buf, &bytes, APR_BLOCK_READ)
920 if (ctxt->parser == NULL) {
923 /* For documents smaller than four bytes, there is no reason to do
924 * HTML rewriting. The URL schema (i.e. 'http') needs four bytes alone.
925 * And the HTML parser needs at least four bytes to initialise correctly.
927 if ((bytes < 4) && APR_BUCKET_IS_EOS(APR_BUCKET_NEXT(b))) {
928 ap_remove_output_filter(f) ;
929 return ap_pass_brigade(f->next, bb) ;
932 if (!xml2enc_charset ||
933 (xml2enc_charset(f->r, &enc, &cenc) != APR_SUCCESS)) {
934 if (!xml2enc_charset)
935 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r, APLOGNO(01422)
936 "No i18n support found. Install mod_xml2enc if required");
937 enc = XML_CHAR_ENCODING_NONE;
938 ap_set_content_type(f->r, "text/html;charset=utf-8");
941 /* if we wanted a non-default charset_out, insert the
942 * xml2enc filter now that we've sniffed it
944 if (ctxt->cfg->charset_out && xml2enc_filter) {
945 if (*ctxt->cfg->charset_out != '*')
946 cenc = ctxt->cfg->charset_out;
947 xml2enc_filter(f->r, cenc, ENCIO_OUTPUT);
948 ap_set_content_type(f->r,
949 apr_pstrcat(f->r->pool,
950 "text/html;charset=",
953 else /* Normal case, everything worked, utf-8 output */
954 ap_set_content_type(f->r, "text/html;charset=utf-8");
957 ap_fputs(f->next, ctxt->bb, ctxt->cfg->doctype);
958 ctxt->parser = htmlCreatePushParserCtxt(&sax, ctxt, buf,
962 if (ctxt->parser == NULL) {
963 apr_status_t rv = ap_pass_brigade(f->next, bb);
964 ap_remove_output_filter(f);
967 apr_pool_cleanup_register(f->r->pool, ctxt->parser,
968 (int(*)(void*))htmlFreeParserCtxt,
969 apr_pool_cleanup_null);
970 #ifndef USE_OLD_LIBXML2
971 if (xmlopts = xmlCtxtUseOptions(ctxt->parser, xmlopts), xmlopts)
972 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r, APLOGNO(01423)
973 "Unsupported parser opts %x", xmlopts);
975 if (ctxt->cfg->metafix)
976 m = metafix(f->r, buf);
978 consume_buffer(ctxt, buf, m->start, 0);
979 consume_buffer(ctxt, buf+m->end, bytes-m->end, 0);
982 consume_buffer(ctxt, buf, bytes, 0);
986 consume_buffer(ctxt, buf, bytes, 0);
990 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, APLOGNO(01424)
991 "Error in bucket read");
994 /*ap_fflush(ctxt->f->next, ctxt->bb); // uncomment for debug */
995 apr_brigade_cleanup(bb);
/* Per-directory config constructor: allocates a zeroed proxy_html_conf from
 * `pool` and sets the default doctype and etag. `x` is not used in the
 * visible lines.
 * NOTE(review): the return statement is not part of this listing; presumably
 * the new record is returned. */
999 static void *proxy_html_config(apr_pool_t *pool, char *x)
1001 proxy_html_conf *ret = apr_pcalloc(pool, sizeof(proxy_html_conf));
1002 ret->doctype = DEFAULT_DOCTYPE;
1003 ret->etag = DEFAULT_ETAG;
1005 /* ret->interp = 1; */
1006 /* don't initialise links and events until they get set/used */
/* Per-directory config merge.
 *   pool - pool for the merged record
 *   BASE - the less specific configuration
 *   ADD  - the more specific configuration
 * links, events and charset_out: ADD's value when set, otherwise BASE's.
 * map: when both sides have rules, every rule of BASE and then of ADD is
 * copied and pushed on the FRONT of conf->map, so the merged list is in
 * reverse order of traversal; otherwise whichever list exists is shared.
 * doctype / etag: BASE's value when ADD still has the default.
 * bufsz: always taken from ADD.
 * flags, metafix, extfix, interp, strip_comments, enabled: when ADD carries
 * NORM_RESET, ADD's values are used alone (with the reset bit cleared from
 * flags); the other visible set of assignments ORs BASE and ADD together.
 * NOTE(review): conf comes from apr_palloc() (not zeroed) and the copy loop
 * reads conf->map; the line that initialises it is not part of this listing. */
1010 static void *proxy_html_merge(apr_pool_t *pool, void *BASE, void *ADD)
1012 proxy_html_conf *base = (proxy_html_conf *) BASE;
1013 proxy_html_conf *add = (proxy_html_conf *) ADD;
1014 proxy_html_conf *conf = apr_palloc(pool, sizeof(proxy_html_conf));
1016 /* don't merge declarations - just use the most specific */
1017 conf->links = (add->links == NULL) ? base->links : add->links;
1018 conf->events = (add->events == NULL) ? base->events : add->events;
1020 conf->charset_out = (add->charset_out == NULL)
1021 ? base->charset_out : add->charset_out;
1023 if (add->map && base->map) {
1026 for (a = base->map; a; a = a->next) {
1027 urlmap *save = conf->map;
1028 conf->map = apr_pmemdup(pool, a, sizeof(urlmap));
1029 conf->map->next = save;
1031 for (a = add->map; a; a = a->next) {
1032 urlmap *save = conf->map;
1033 conf->map = apr_pmemdup(pool, a, sizeof(urlmap));
1034 conf->map->next = save;
1038 conf->map = add->map ? add->map : base->map;
1040 conf->doctype = (add->doctype == DEFAULT_DOCTYPE)
1041 ? base->doctype : add->doctype;
1042 conf->etag = (add->etag == DEFAULT_ETAG) ? base->etag : add->etag;
1043 conf->bufsz = add->bufsz;
1044 if (add->flags & NORM_RESET) {
1045 conf->flags = add->flags ^ NORM_RESET;
1046 conf->metafix = add->metafix;
1047 conf->extfix = add->extfix;
1048 conf->interp = add->interp;
1049 conf->strip_comments = add->strip_comments;
1050 conf->enabled = add->enabled;
1053 conf->flags = base->flags | add->flags;
1054 conf->metafix = base->metafix | add->metafix;
1055 conf->extfix = base->extfix | add->extfix;
1056 conf->interp = base->interp | add->interp;
1057 conf->strip_comments = base->strip_comments | add->strip_comments;
1058 conf->enabled = add->enabled | base->enabled;
/* REGFLAG(n,s,c): n when string s is non-NULL and contains character c,
 * otherwise 0 (the flag is opt-in).
 * XREGFLAG(n,s,c): n when s is NULL or does not contain c, otherwise 0 (the
 * flag is on by default and the character switches it off). */
1062 #define REGFLAG(n,s,c) ((s&&(ap_strchr_c((s),(c))!=NULL)) ? (n) : 0)
1063 #define XREGFLAG(n,s,c) ((!s||(ap_strchr_c((s),(c))==NULL)) ? (n) : 0)
/* Fill in a urlmap rule from the textual arguments of a ProxyHTMLURLMap line.
 *   cmd    - directive context (pools, expression parsing)
 *   newmap - rule to populate
 *   from   - from-pattern (literal text or regex source)
 *   to     - replacement text
 *   flags  - flag characters, may be NULL
 *   cond   - optional condition
 * Flag characters: h, e, c switch OFF M_HTML, M_EVENTS, M_CDATA;
 * ^ $ R L l V v switch ON M_ATSTART, M_ATEND, M_REGEX, M_LAST, M_NOTLAST,
 * M_INTERPOLATE_TO, M_INTERPOLATE_FROM. The pattern is kept as text when it
 * is to be interpolated per request or is not a regex; otherwise it is
 * compiled with the regex options x, i, n, s. The condition is passed through
 * old_expr and then parsed with ap_expr_parse_cmd().
 * NOTE(review): the comment inside the body describes the conversion target
 * as env(var), while the pattern compiled in mod_proxy_html() emits
 * reqenv('var'). The return statement is not part of this listing;
 * presumably `err` is returned. */
1064 static const char *comp_urlmap(cmd_parms *cmd, urlmap *newmap,
1065 const char *from, const char *to,
1066 const char *flags, const char *cond)
1068 const char *err = NULL;
1070 = XREGFLAG(M_HTML,flags,'h')
1071 | XREGFLAG(M_EVENTS,flags,'e')
1072 | XREGFLAG(M_CDATA,flags,'c')
1073 | REGFLAG(M_ATSTART,flags,'^')
1074 | REGFLAG(M_ATEND,flags,'$')
1075 | REGFLAG(M_REGEX,flags,'R')
1076 | REGFLAG(M_LAST,flags,'L')
1077 | REGFLAG(M_NOTLAST,flags,'l')
1078 | REGFLAG(M_INTERPOLATE_TO,flags,'V')
1079 | REGFLAG(M_INTERPOLATE_FROM,flags,'v');
1081 if ((newmap->flags & M_INTERPOLATE_FROM) || !(newmap->flags & M_REGEX)) {
1082 newmap->from.c = from;
1087 = REGFLAG(AP_REG_EXTENDED,flags,'x')
1088 | REGFLAG(AP_REG_ICASE,flags,'i')
1089 | REGFLAG(AP_REG_NOSUB,flags,'n')
1090 | REGFLAG(AP_REG_NEWLINE,flags,'s');
1091 newmap->from.r = ap_pregcomp(cmd->pool, from, newmap->regflags);
1095 /* back-compatibility: support old-style ENV expressions
1096 * by converting to ap_expr syntax.
1098 * 1. var --> env(var)
1099 * 2. var=val --> env(var)=val
1100 * 3. !var --> !env(var)
1101 * 4. !var=val --> env(var)!=val
1103 char *newcond = NULL;
1104 if (ap_rxplus_exec(cmd->temp_pool, old_expr, cond, &newcond)) {
1105 /* we got a substitution. Check for the case (3) above
1106 * that the regexp gets wrong: a negation without a comparison.
1108 if ((cond[0] == '!') && !ap_strchr_c(cond, '=')) {
1109 memmove(newcond+1, newcond, strlen(newcond)-1);
1114 newmap->cond = ap_expr_parse_cmd(cmd, cond, 0, &err, NULL);
1117 newmap->cond = NULL;
/* Handler for the ProxyHTMLURLMap directive (raw-args form).
 *   cmd  - directive context
 *   CFG  - the proxy_html_conf being configured
 *   args - rest of the directive line
 * Splits args into from, to, flags and, only when flags is non-empty, cond.
 * It then allocates a new rule, walks cfg->map to its last node so the rule
 * can be appended there, and fills the rule in with comp_urlmap(), whose
 * result (an error message or NULL) is returned.
 * NOTE(review): the test that cfg->map is non-NULL before the walk, and the
 * returns of the usage text, are not part of this listing. */
1122 static const char *set_urlmap(cmd_parms *cmd, void *CFG, const char *args)
1124 proxy_html_conf *cfg = (proxy_html_conf *)CFG;
1126 apr_pool_t *pool = cmd->pool;
1129 "Usage: ProxyHTMLURLMap from-pattern to-pattern [flags] [cond]";
1133 const char *cond = NULL;
1135 if (from = ap_getword_conf(cmd->pool, &args), !from)
1137 if (to = ap_getword_conf(cmd->pool, &args), !to)
1139 flags = ap_getword_conf(cmd->pool, &args);
1140 if (flags && *flags)
1141 cond = ap_getword_conf(cmd->pool, &args);
1145 /* the args look OK, so let's use them */
1146 newmap = apr_palloc(pool, sizeof(urlmap));
1147 newmap->next = NULL;
1149 for (map = cfg->map; map->next; map = map->next);
1155 return comp_urlmap(cmd, newmap, from, to, flags, cond);
/* Handler for the ProxyHTMLDoctype directive.
 *   cmd - directive context
 *   CFG - the proxy_html_conf being configured
 *   t   - doctype keyword or a literal doctype string
 *   l   - optional second argument
 * Keywords, compared without case:
 *   auto  - the default doctype, which activates pinternalSubset
 *   xhtml - XHTML etag; fpi_xhtml, or fpi_xhtml_legacy when l is "legacy"
 *   html  - HTML etag; fpi_html, or fpi_html_legacy when l is "legacy"
 *   html5 - HTML etag; fpi_html5
 * Any other t is copied verbatim as the doctype; the etag is then XHTML
 * style when l starts with 'x' or 'X', HTML style otherwise. */
1158 static const char *set_doctype(cmd_parms *cmd, void *CFG,
1159 const char *t, const char *l)
1161 proxy_html_conf *cfg = (proxy_html_conf *)CFG;
1162 if (!strcasecmp(t, "auto")) {
1163 cfg->doctype = DEFAULT_DOCTYPE; /* activates pinternalSubset */
1165 else if (!strcasecmp(t, "xhtml")) {
1166 cfg->etag = xhtml_etag;
1167 if (l && !strcasecmp(l, "legacy"))
1168 cfg->doctype = fpi_xhtml_legacy;
1170 cfg->doctype = fpi_xhtml;
1172 else if (!strcasecmp(t, "html")) {
1173 cfg->etag = html_etag;
1174 if (l && !strcasecmp(l, "legacy"))
1175 cfg->doctype = fpi_html_legacy;
1177 cfg->doctype = fpi_html;
1179 else if (!strcasecmp(t, "html5")) {
1180 cfg->etag = html_etag;
1181 cfg->doctype = fpi_html5;
1184 cfg->doctype = apr_pstrdup(cmd->pool, t);
1185 if (l && ((l[0] == 'x') || (l[0] == 'X')))
1186 cfg->etag = xhtml_etag;
1188 cfg->etag = html_etag;
/* Handler for the ProxyHTMLFixups directive; called once per argument.
 *   arg - "lowercase", "dospath" or "reset", compared without case
 * ORs the matching NORM_* bit into cfg->flags.
 * NOTE(review): what happens for an unrecognised argument is not visible in
 * this listing. */
1193 static const char *set_flags(cmd_parms *cmd, void *CFG, const char *arg)
1195 proxy_html_conf *cfg = CFG;
1197 if (!strcasecmp(arg, "lowercase"))
1198 cfg->flags |= NORM_LC;
1199 else if (!strcasecmp(arg, "dospath"))
1200 cfg->flags |= NORM_MSSLASH;
1201 else if (!strcasecmp(arg, "reset"))
1202 cfg->flags |= NORM_RESET;
/* Handler for the ProxyHTMLEvents directive; called once per argument.
 * Creates cfg->events on first use as an array of tattr records and pushes
 * one new record for this argument.
 * NOTE(review): the line that stores `arg` in the new record is not part of
 * this listing. */
1207 static const char *set_events(cmd_parms *cmd, void *CFG, const char *arg)
1210 proxy_html_conf *cfg = CFG;
1211 if (cfg->events == NULL)
1212 cfg->events = apr_array_make(cmd->pool, 20, sizeof(tattr));
1213 attr = apr_array_push(cfg->events);
/* Handler for the ProxyHTMLLinks directive; called once per element and
 * attribute pair.
 *   elt - element name, used as the key in cfg->links
 *   att - attribute of that element that holds a link
 * Creates the cfg->links hash on first use, looks up (or creates and stores)
 * the attribute array for elt, and pushes one new record for att.
 * Fix: the array was sized with sizeof(tattr*). Its elements are tattr
 * records (pstartElement() reads ->elts as a tattr array, and set_events()
 * sizes its array the same way), so sizeof(tattr) is the correct element
 * size.
 * NOTE(review): the test guarding the array creation and the line that
 * stores `att` in the new record are not part of this listing. */
1218 static const char *set_links(cmd_parms *cmd, void *CFG,
1219 const char *elt, const char *att)
1221 apr_array_header_t *attrs;
1223 proxy_html_conf *cfg = CFG;
1225 if (cfg->links == NULL)
1226 cfg->links = apr_hash_make(cmd->pool);
1228 attrs = apr_hash_get(cfg->links, elt, APR_HASH_KEY_STRING);
1230 attrs = apr_array_make(cmd->pool, 2, sizeof(tattr));
1231 apr_hash_set(cfg->links, elt, APR_HASH_KEY_STRING, attrs);
1233 attr = apr_array_push(attrs);
/* Directive table for this module. Every directive is allowed in server and
 * directory context (RSRC_CONF|ACCESS_CONF). The flag, integer and string
 * directives store straight into the named proxy_html_conf field through the
 * generic ap_set_*_slot handlers.
 * NOTE(review): set_flags() also accepts "reset", which the ProxyHTMLFixups
 * help text does not mention; set_doctype() also accepts "auto", "html5" and
 * a literal doctype, which the ProxyHTMLDoctype help text does not mention. */
1237 static const command_rec proxy_html_cmds[] = {
1238 AP_INIT_ITERATE("ProxyHTMLEvents", set_events, NULL,
1239 RSRC_CONF|ACCESS_CONF,
1240 "Strings to be treated as scripting events"),
1241 AP_INIT_ITERATE2("ProxyHTMLLinks", set_links, NULL,
1242 RSRC_CONF|ACCESS_CONF, "Declare HTML Attributes"),
1243 AP_INIT_RAW_ARGS("ProxyHTMLURLMap", set_urlmap, NULL,
1244 RSRC_CONF|ACCESS_CONF, "Map URL From To"),
1245 AP_INIT_TAKE12("ProxyHTMLDoctype", set_doctype, NULL,
1246 RSRC_CONF|ACCESS_CONF, "(HTML|XHTML) [Legacy]"),
1247 AP_INIT_ITERATE("ProxyHTMLFixups", set_flags, NULL,
1248 RSRC_CONF|ACCESS_CONF, "Options are lowercase, dospath"),
1249 AP_INIT_FLAG("ProxyHTMLMeta", ap_set_flag_slot,
1250 (void*)APR_OFFSETOF(proxy_html_conf, metafix),
1251 RSRC_CONF|ACCESS_CONF, "Fix META http-equiv elements"),
1252 AP_INIT_FLAG("ProxyHTMLInterp", ap_set_flag_slot,
1253 (void*)APR_OFFSETOF(proxy_html_conf, interp),
1254 RSRC_CONF|ACCESS_CONF,
1255 "Support interpolation and conditions in URLMaps"),
1256 AP_INIT_FLAG("ProxyHTMLExtended", ap_set_flag_slot,
1257 (void*)APR_OFFSETOF(proxy_html_conf, extfix),
1258 RSRC_CONF|ACCESS_CONF, "Map URLs in Javascript and CSS"),
1259 AP_INIT_FLAG("ProxyHTMLStripComments", ap_set_flag_slot,
1260 (void*)APR_OFFSETOF(proxy_html_conf, strip_comments),
1261 RSRC_CONF|ACCESS_CONF, "Strip out comments"),
1262 AP_INIT_TAKE1("ProxyHTMLBufSize", ap_set_int_slot,
1263 (void*)APR_OFFSETOF(proxy_html_conf, bufsz),
1264 RSRC_CONF|ACCESS_CONF, "Buffer size"),
1265 AP_INIT_TAKE1("ProxyHTMLCharsetOut", ap_set_string_slot,
1266 (void*)APR_OFFSETOF(proxy_html_conf, charset_out),
1267 RSRC_CONF|ACCESS_CONF, "Usage: ProxyHTMLCharsetOut charset"),
1268 AP_INIT_FLAG("ProxyHTMLEnable", ap_set_flag_slot,
1269 (void*)APR_OFFSETOF(proxy_html_conf, enabled),
1270 RSRC_CONF|ACCESS_CONF,
1271 "Enable proxy-html and xml2enc filters"),
/* pre_config hook (registered in proxy_html_hooks()): one-time set-up of the
 * file-scope globals.
 *   p  - pool used for the meta regex and the "content" pattern
 *   p1 - pool used for old_expr
 *   p2 - pool passed to the startup log call
 * Compiles seek_meta and seek_content, clears the SAX handler table and
 * installs this module's callbacks, retrieves the two optional functions
 * (logging a notice when xml2enc_charset is unavailable), and compiles
 * old_expr, which rewrites an old-style condition such as var, !var,
 * var=val into a reqenv('var') expression.
 * NOTE(review): the return statement is not part of this listing. */
1274 static int mod_proxy_html(apr_pool_t *p, apr_pool_t *p1, apr_pool_t *p2)
1276 seek_meta = ap_pregcomp(p, "<meta[^>]*(http-equiv)[^>]*>",
1277 AP_REG_EXTENDED|AP_REG_ICASE);
1278 seek_content = apr_strmatch_precompile(p, "content", 0);
1279 memset(&sax, 0, sizeof(htmlSAXHandler));
1280 sax.startElement = pstartElement;
1281 sax.endElement = pendElement;
1282 sax.characters = pcharacters;
1283 sax.comment = pcomment;
1284 sax.cdataBlock = pcdata;
1285 sax.internalSubset = pinternalSubset;
1286 xml2enc_charset = APR_RETRIEVE_OPTIONAL_FN(xml2enc_charset);
1287 xml2enc_filter = APR_RETRIEVE_OPTIONAL_FN(xml2enc_filter);
1288 if (!xml2enc_charset) {
1289 ap_log_perror(APLOG_MARK, APLOG_NOTICE, 0, p2, APLOGNO(01425)
1290 "I18n support in mod_proxy_html requires mod_xml2enc. "
1291 "Without it, non-ASCII characters in proxied pages are "
1292 "likely to display incorrectly.");
1295 /* old_expr only needs to last the life of the config phase */
1296 old_expr = ap_rxplus_compile(p1, "s/^(!)?(\\w+)((=)(.+))?$/reqenv('$2')$1$4'$5'/");
/* insert_filter hook: reads the per-directory config for the request, asks
 * xml2enc_filter for input charset checks, and adds the "proxy-html" output
 * filter to the request.
 * NOTE(review): the conditions guarding these two calls (presumably
 * cfg->enabled and a non-NULL xml2enc_filter) are not part of this listing -
 * confirm. */
1299 static void proxy_html_insert(request_rec *r)
1301 proxy_html_conf *cfg;
1302 cfg = ap_get_module_config(r->per_dir_config, &proxy_html_module);
1305 xml2enc_filter(r, NULL, ENCIO_INPUT_CHECKS);
1306 ap_add_output_filter("proxy-html", NULL, r, r->connection);
/* Hook registration. Registers the "proxy-html" output filter as a
 * resource-level filter that changes the content and its length, runs
 * mod_proxy_html() at pre_config, and runs proxy_html_insert() at
 * insert_filter, with mod_filter.c given as a successor. */
1309 static void proxy_html_hooks(apr_pool_t *p)
1311 static const char *aszSucc[] = { "mod_filter.c", NULL };
1312 ap_register_output_filter_protocol("proxy-html", proxy_html_filter,
1313 NULL, AP_FTYPE_RESOURCE,
1314 AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH);
1315 /* move this to pre_config so old_expr is available to interpret
1316 * old-style conditions on URL maps.
1318 ap_hook_pre_config(mod_proxy_html, NULL, NULL, APR_HOOK_MIDDLE);
1319 ap_hook_insert_filter(proxy_html_insert, NULL, aszSucc, APR_HOOK_MIDDLE);
1322 AP_DECLARE_MODULE(proxy_html) = {
1323 STANDARD20_MODULE_STUFF,