1 /* Copyright (c) 2003-11, WebThing Ltd
2 * Copyright (c) 2011-, The Apache Software Foundation
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 You can #define GO_FASTER to disable trace logging.
28 #define VERBOSE(x) if (verbose) x
29 #define VERBOSEB(x) if (verbose) {x}
33 #include <libxml/HTMLparser.h>
35 #include "http_protocol.h"
36 #include "http_config.h"
38 #include "apr_strings.h"
40 #include "apr_strmatch.h"
43 #include "apr_optional.h"
44 #include "mod_xml2enc.h"
45 #include "http_request.h"
48 /* globals set once at startup */
/* Compiled in mod_proxy_html(); comp_urlmap() runs it over old-style
 * conditions to convert them to ap_expr syntax. */
49 static ap_rxplus_t *old_expr;
/* Compiled in mod_proxy_html(); metafix() uses it to locate META http-equiv tags. */
50 static ap_regex_t *seek_meta;
/* Precompiled search pattern used by metafix() to find the content attribute. */
51 static const apr_strmatch_pattern* seek_content;
/* Optional functions looked up in mod_proxy_html().  They remain NULL when the
 * lookup finds nothing; proxy_html_filter() tests them before calling. */
52 static apr_status_t (*xml2enc_charset)(request_rec*, xmlCharEncoding*, const char**) = NULL;
53 static apr_status_t (*xml2enc_filter)(request_rec*, const char*, unsigned int) = NULL;
55 module AP_MODULE_DECLARE_DATA proxy_html_module;
/* urlmap flag bits.  comp_urlmap() derives them from the rule's flag letters:
 * '^' sets M_ATSTART, 'l' sets M_NOTLAST, 'V' sets M_INTERPOLATE_TO and
 * 'v' sets M_INTERPOLATE_FROM. */
61 #define M_ATSTART 0x10
64 #define M_NOTLAST 0x80
65 #define M_INTERPOLATE_TO 0x100
66 #define M_INTERPOLATE_FROM 0x200
75 typedef struct urlmap {
78 unsigned int regflags;
93 apr_array_header_t *events;
94 const char *charset_out;
103 proxy_html_conf *cfg;
104 htmlParserCtxtPtr parser;
105 apr_bucket_brigade *bb;
109 const char *encoding;
/* Normalisation flag bits stored in proxy_html_conf.flags.  set_flags() sets
 * NORM_MSSLASH for "dospath" and NORM_RESET for "reset"; proxy_html_merge()
 * treats NORM_RESET as "do not inherit from the base configuration". */
115 #define NORM_MSSLASH 0x2
116 #define NORM_RESET 0x4
/* SAX handler table shared by all requests; filled in by mod_proxy_html(). */
117 static htmlSAXHandler sax;
/* Classification of an attribute while pstartElement() decides how to rewrite it. */
119 typedef enum { ATTR_IGNORE, ATTR_URI, ATTR_EVENT } rewrite_t;
/* Doctype declarations selectable through set_doctype().  The rest of the file
 * compares cfg->doctype against these by pointer identity, not by content. */
121 static const char *const fpi_html =
122 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01//EN\">\n";
123 static const char *const fpi_html_legacy =
124 "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\">\n";
125 static const char *const fpi_xhtml =
126 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n";
127 static const char *const fpi_xhtml_legacy =
128 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n";
129 static const char *const fpi_html5 = "<!DOCTYPE html>\n";
/* Tag terminators written after empty elements (see the end of pstartElement()). */
130 static const char *const html_etag = ">";
131 static const char *const xhtml_etag = " />";
132 /*#define DEFAULT_DOCTYPE fpi_html */
/* Defaults assigned by proxy_html_config().  proxy_html_merge() recognises an
 * unset value by comparing pointers against these two names. */
133 static const char *const DEFAULT_DOCTYPE = "";
134 #define DEFAULT_ETAG html_etag
/* Applies the configured NORM_* fix-ups to str in place (str is writable and
 * nothing is returned).
 *   flags  bit mask of NORM_* values
 *   str    NUL-terminated string to modify
 * NOTE(review): the loop bodies are not visible in this excerpt; presumably the
 * first loop lowercases and the second rewrites backslashes - confirm. */
136 static void normalise(unsigned int flags, char *str)
/* walk every character of the string */
140 for (p = str; *p; ++p)
144 if (flags & NORM_MSSLASH)
/* visit each backslash in turn, resuming the search just past the previous hit */
145 for (p = ap_strchr(str, '\\'); p; p = ap_strchr(p+1, '\\'))
/* Feed `bytes` bytes of inbuf to the libxml2 push parser held in ctx.
 * `flag` is passed through as htmlParseChunk's final argument. */
149 #define consume_buffer(ctx,inbuf,bytes,flag) \
150 htmlParseChunk((ctx)->parser, (inbuf), (bytes), (flag))

/* Write `bytes` bytes of inbuf to the next filter through ctx's brigade.
 * The `flush` argument is accepted but not used by the expansion.
 * The expansion deliberately carries no trailing semicolon, so the macro
 * behaves like an ordinary expression statement (safe before `else`). */
152 #define AP_fwrite(ctx,inbuf,bytes,flush) \
153 ap_fwrite((ctx)->f->next, (ctx)->bb, (inbuf), (bytes))

155 /* This is always utf-8 on entry. We can convert charset within FLUSH */
/* FLUSH writes the pending run chars[begin..i) and restarts the run after
 * index i.  It expands to two statements and relies on the caller's locals
 * ctx, chars, begin and i, so use it only as a full statement. */
156 #define FLUSH AP_fwrite(ctx, (chars+begin), (i-begin), 0); begin = i+1
/* SAX characters callback (installed as sax.characters); pstartElement() also
 * calls it to HTML-escape attribute values.
 *   ctxt    the saxctxt for this request
 *   uchars  text to emit
 *   length  number of bytes in uchars
 * Plain runs are written through FLUSH; each markup-significant character is
 * replaced by its named entity. */
157 static void pcharacters(void *ctxt, const xmlChar *uchars, int length)
159 const char *chars = (const char*) uchars;
160 saxctxt *ctx = (saxctxt*) ctxt;
/* begin marks the start of the run that has not been written yet */
163 for (begin=i=0; i<length; i++) {
/* The replacement strings must be the entities themselves: writing the raw
 * character back would leave the output unescaped. */
165 case '&' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, "&amp;"); break;
166 case '<' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, "&lt;"); break;
167 case '>' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, "&gt;"); break;
168 case '"' : FLUSH; ap_fputs(ctx->f->next, ctx->bb, "&quot;"); break;
/* Ensures the context buffer has room for len more bytes beyond ctx->offset.
 *   ctx  filter context owning buf / avail / offset
 *   len  number of additional bytes required
 * NOTE(review): the realloc() result is not checked for NULL in the visible
 * lines, and the growth loop assumes cfg->bufsz is greater than zero - confirm. */
175 static void preserve(saxctxt *ctx, const size_t len)
/* enough free space already */
178 if (len <= (ctx->avail - ctx->offset))
/* otherwise grow the capacity in steps of the configured buffer size */
180 else while (len > (ctx->avail - ctx->offset))
181 ctx->avail += ctx->cfg->bufsz;
183 newbuf = realloc(ctx->buf, ctx->avail);
/* the block moved: re-point the request pool's free() cleanup at the new address */
184 if (newbuf != ctx->buf) {
186 apr_pool_cleanup_kill(ctx->f->r->pool, ctx->buf,
187 (int(*)(void*))free);
188 apr_pool_cleanup_register(ctx->f->r->pool, newbuf,
189 (int(*)(void*))free, apr_pool_cleanup_null);
/* Copies len bytes from buf into the context buffer at the current offset.
 * NOTE(review): the capacity check and the offset update are not visible in
 * this excerpt - presumably preserve() is called first; confirm. */
194 static void pappend(saxctxt *ctx, const char *buf, const size_t len)
197 memcpy(ctx->buf+ctx->offset, buf, len);
/* Rewrites the text accumulated in ctx->buf using every URL map flagged
 * M_CDATA, then writes the result to the output brigade.
 * Regex rules are applied repeatedly from a moving offset; fixed-string rules
 * are located with strstr().  Each replacement either grows the buffer
 * (preserve + memmove + memcpy) or overwrites in place and closes the gap. */
201 static void dump_content(saxctxt *ctx)
209 ap_regmatch_t pmatch[10];
212 urlmap *themap = ctx->map;
/* read by the VERBOSE / VERBOSEB macros */
214 int verbose = APLOGrtrace1(ctx->f->r);
217 pappend(ctx, &c, 1); /* append null byte */
218 /* parse the text for URLs */
219 for (m = themap; m; m = m->next) {
/* rules without M_CDATA do not apply to buffered content */
220 if (!(m->flags & M_CDATA))
222 if (m->flags & M_REGEX) {
225 while (!ap_regexec(m->from.r, ctx->buf+offs, nmatch, pmatch, 0)) {
/* match is relative to ctx->buf+offs; s_from is the length of the matched text */
226 match = pmatch[0].rm_so;
227 s_from = pmatch[0].rm_eo - match;
228 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
231 len = strlen(ctx->buf);
234 const char *f = apr_pstrndup(ctx->f->r->pool,
235 ctx->buf + offs, s_from);
236 ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, ctx->f->r,
237 "C/RX: match at %s, substituting %s", f, subs);
/* growing case: make room, shift the tail (including the NUL) right, then copy */
240 preserve(ctx, s_to - s_from);
241 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
242 len + 1 - s_from - offs);
243 memcpy(ctx->buf+offs, subs, s_to);
/* non-growing case: copy first, then pull the tail left over the gap */
246 memcpy(ctx->buf + offs, subs, s_to);
247 memmove(ctx->buf+offs+s_to, ctx->buf+offs+s_from,
248 len + 1 - s_from - offs);
/* fixed-string rule */
254 s_from = strlen(m->from.c);
255 s_to = strlen(m->to);
/* after a hit the search resumes just past the text that was substituted in */
256 for (found = strstr(ctx->buf, m->from.c); found;
257 found = strstr(ctx->buf+match+s_to, m->from.c)) {
258 match = found - ctx->buf;
259 if ((m->flags & M_ATSTART) && (match != 0))
261 len = strlen(ctx->buf);
262 if ((m->flags & M_ATEND) && (match < (len - s_from)))
264 VERBOSE(ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0, ctx->f->r,
265 "C: matched %s, substituting %s",
268 preserve(ctx, s_to - s_from);
269 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
270 len + 1 - s_from - match);
271 memcpy(ctx->buf+match, m->to, s_to);
274 memcpy(ctx->buf+match, m->to, s_to);
275 memmove(ctx->buf+match+s_to, ctx->buf+match+s_from,
276 len + 1 - s_from - match);
/* emit the rewritten buffer up to its NUL terminator */
281 AP_fwrite(ctx, ctx->buf, strlen(ctx->buf), 1);
/* SAX cdataBlock callback (installed as sax.cdataBlock).
 *   ctxt    the saxctxt for this request
 *   uchars  CDATA text
 *   length  number of bytes in uchars
 * With cfg->extfix set the text is appended to the context buffer; the
 * AP_fwrite call further down writes it straight to the output instead.
 * NOTE(review): the else that separates the two paths is not visible here. */
283 static void pcdata(void *ctxt, const xmlChar *uchars, int length)
285 const char *chars = (const char*) uchars;
286 saxctxt *ctx = (saxctxt*) ctxt;
287 if (ctx->cfg->extfix) {
288 pappend(ctx, chars, length);
291 /* not sure if this should force-flush
292 * (i.e. can one cdata section come in multiple calls?)
294 AP_fwrite(ctx, chars, length, 0);
/* SAX comment callback (installed as sax.comment).
 *   ctxt    the saxctxt for this request
 *   uchars  NUL-terminated comment text without the delimiters
 * The comment is re-wrapped in its delimiters and either buffered (extfix) or
 * written directly to the output. */
297 static void pcomment(void *ctxt, const xmlChar *uchars)
299 const char *chars = (const char*) uchars;
300 saxctxt *ctx = (saxctxt*) ctxt;
/* NOTE(review): the statement guarded by strip_comments is not visible here;
 * presumably it returns without emitting anything - confirm. */
301 if (ctx->cfg->strip_comments)
/* extended mode: keep the comment in the buffer alongside other content */
304 if (ctx->cfg->extfix) {
305 pappend(ctx, "<!--", 4);
306 pappend(ctx, chars, strlen(chars));
307 pappend(ctx, "-->", 3);
/* direct output path */
310 ap_fputs(ctx->f->next, ctx->bb, "<!--");
311 AP_fwrite(ctx, chars, strlen(chars), 1);
312 ap_fputs(ctx->f->next, ctx->bb, "-->");
/* SAX endElement callback (installed as sax.endElement).
 *   ctxt   the saxctxt for this request
 *   uname  element name
 * Applies the doctype checks, deals with any buffered content, and writes the
 * closing tag unless libxml2 describes the element as empty. */
316 static void pendElement(void *ctxt, const xmlChar *uname)
318 saxctxt *ctx = (saxctxt*) ctxt;
319 const char *name = (const char*) uname;
/* desc is NULL when libxml2 does not know the element */
320 const htmlElemDesc* desc = htmlTagLookup(uname);
/* strict doctypes: unknown or deprecated elements are singled out */
322 if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) {
324 if (!desc || desc->depr)
/* transitional doctypes */
328 else if ((ctx->cfg->doctype == fpi_html_legacy)
329 || (ctx->cfg->doctype == fpi_xhtml_legacy)) {
330 /* enforce html legacy */
334 /* TODO - implement HTML "allowed here" using the stack */
335 /* nah. Keeping the stack is too much overhead */
/* something was buffered while inside this element */
337 if (ctx->offset > 0) {
339 ctx->offset = 0; /* having dumped it, we can re-use the memory */
/* empty elements were already terminated by pstartElement() */
341 if (!desc || !desc->empty) {
342 ap_fprintf(ctx->f->next, ctx->bb, "</%s>", name);
/* SAX startElement callback (installed as sax.startElement).
 *   ctxt    the saxctxt for this request
 *   uname   element name
 *   uattrs  name/value pairs, walked two entries at a time until a NULL name
 * Steps visible below: look the element up in libxml2's HTML tables, apply
 * the enforcement that matches the configured doctype, write the opening tag,
 * then for each attribute copy its value into ctx->buf, rewrite it with the
 * URL maps (M_HTML rules for link attributes, M_EVENTS rules for scripting
 * events), normalise it and write it HTML-escaped.  Finally terminate the tag
 * and report required attributes that were not seen. */
346 static void pstartElement(void *ctxt, const xmlChar *uname,
347 const xmlChar** uattrs)
356 size_t s_to, s_from, match;
358 saxctxt *ctx = (saxctxt*) ctxt;
360 ap_regmatch_t pmatch[10];
/* read by the VERBOSE macro */
362 int verbose = APLOGrtrace1(ctx->f->r);
364 apr_array_header_t *linkattrs;
366 const char *name = (const char*) uname;
367 const char** attrs = (const char**) uattrs;
/* desc is NULL when libxml2 does not know the element */
368 const htmlElemDesc* desc = htmlTagLookup(uname);
369 urlmap *themap = ctx->map;
/* strict doctypes */
374 if ((ctx->cfg->doctype == fpi_html) || (ctx->cfg->doctype == fpi_xhtml)) {
377 if (!desc || desc->depr)
/* Transitional doctypes.  This branch must test the *_legacy doctypes (as
 * pendElement() does); repeating the strict test made it unreachable. */
381 else if ((ctx->cfg->doctype == fpi_html_legacy)
382 || (ctx->cfg->doctype == fpi_xhtml_legacy)) {
384 /* enforce html legacy */
389 if (!desc && enforce) {
390 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01416)
391 "Bogus HTML element %s dropped", name);
394 if (desc && desc->depr && (enforce == 2)) {
395 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01417)
396 "Deprecated HTML element %s dropped", name);
400 descp = apr_array_push(ctx->stack);
402 /* TODO - implement HTML "allowed here" */
/* opening tag */
405 ap_fputc(ctx->f->next, ctx->bb, '<');
406 ap_fputs(ctx->f->next, ctx->bb, name);
/* walk the element's required attributes when enforcing */
409 if ((enforce > 0) && (desc != NULL) && (desc->attrs_req != NULL))
410 for (a = desc->attrs_req; *a; a++)
/* attributes configured as links for this element, if any */
414 linkattrs = apr_hash_get(ctx->cfg->links, name, APR_HASH_KEY_STRING);
415 for (a = attrs; *a; a += 2) {
416 if (desc && enforce > 0) {
417 switch (htmlAttrAllowed(desc, (xmlChar*)*a, 2-enforce)) {
419 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01418)
420 "Bogus HTML attribute %s of %s dropped",
423 case HTML_DEPRECATED:
424 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01419)
425 "Deprecated HTML attribute %s of %s dropped",
429 required_attrs--; /* cross off the number still needed */
430 /* fallthrough - required implies valid */
/* copy the value, including its NUL, into the work buffer */
437 pappend(ctx, a[1], strlen(a[1])+1);
438 is_uri = ATTR_IGNORE;
/* is this one of the configured link attributes? */
440 tattr *attrs = (tattr*) linkattrs->elts;
441 for (i=0; i < linkattrs->nelts; ++i) {
442 if (!strcmp(*a, attrs[i].val)) {
/* otherwise, in extended mode, is it a configured scripting event? */
448 if ((is_uri == ATTR_IGNORE) && ctx->cfg->extfix
449 && (ctx->cfg->events != NULL)) {
450 for (i=0; i < ctx->cfg->events->nelts; ++i) {
451 tattr *attrs = (tattr*) ctx->cfg->events->elts;
452 if (!strcmp(*a, attrs[i].val)) {
/* rules flagged M_HTML */
461 for (m = themap; m; m = m->next) {
462 if (!(m->flags & M_HTML))
464 if (m->flags & M_REGEX) {
466 if (!ap_regexec(m->from.r, ctx->buf, nmatch,
469 offs = match = pmatch[0].rm_so;
470 s_from = pmatch[0].rm_eo - match;
471 subs = ap_pregsub(ctx->f->r->pool, m->to,
472 ctx->buf, nmatch, pmatch);
475 f = apr_pstrndup(ctx->f->r->pool,
476 ctx->buf + offs, s_from);
477 ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0,
479 "H/RX: match at %s, substituting %s",
483 len = strlen(ctx->buf);
/* growing case: make room, shift the tail right, then copy */
485 preserve(ctx, s_to - s_from);
486 memmove(ctx->buf+offs+s_to,
487 ctx->buf+offs+s_from,
488 len + 1 - s_from - offs);
489 memcpy(ctx->buf+offs, subs, s_to);
/* non-growing case: copy first, then close the gap */
492 memcpy(ctx->buf + offs, subs, s_to);
493 memmove(ctx->buf+offs+s_to,
494 ctx->buf+offs+s_from,
495 len + 1 - s_from - offs);
/* fixed-string rule: case-insensitive prefix comparison */
499 s_from = strlen(m->from.c);
500 if (!strncasecmp(ctx->buf, m->from.c, s_from)) {
502 s_to = strlen(m->to);
503 len = strlen(ctx->buf);
504 VERBOSE(ap_log_rerror(APLOG_MARK, APLOG_TRACE3,
506 "H: matched %s, substituting %s",
509 preserve(ctx, s_to - s_from);
510 memmove(ctx->buf+s_to, ctx->buf+s_from,
512 memcpy(ctx->buf, m->to, s_to);
514 else { /* it fits in the existing space */
515 memcpy(ctx->buf, m->to, s_to);
516 memmove(ctx->buf+s_to, ctx->buf+s_from,
522 /* URIs only want one match unless overridden in the config */
523 if ((num_match > 0) && !(m->flags & M_NOTLAST))
/* rules flagged M_EVENTS */
528 for (m = themap; m; m = m->next) {
529 num_match = 0; /* reset here since we're working per-rule */
530 if (!(m->flags & M_EVENTS))
532 if (m->flags & M_REGEX) {
535 while (!ap_regexec(m->from.r, ctx->buf+offs,
536 nmatch, pmatch, 0)) {
537 match = pmatch[0].rm_so;
538 s_from = pmatch[0].rm_eo - match;
539 subs = ap_pregsub(ctx->f->r->pool, m->to, ctx->buf+offs,
543 f = apr_pstrndup(ctx->f->r->pool,
544 ctx->buf + offs, s_from);
545 ap_log_rerror(APLOG_MARK, APLOG_TRACE3, 0,
547 "E/RX: match at %s, substituting %s",
552 len = strlen(ctx->buf);
554 preserve(ctx, s_to - s_from);
555 memmove(ctx->buf+offs+s_to,
556 ctx->buf+offs+s_from,
557 len + 1 - s_from - offs);
558 memcpy(ctx->buf+offs, subs, s_to);
561 memcpy(ctx->buf + offs, subs, s_to);
562 memmove(ctx->buf+offs+s_to,
563 ctx->buf+offs+s_from,
564 len + 1 - s_from - offs);
/* fixed-string rule inside an event value */
571 found = strstr(ctx->buf, m->from.c);
572 if ((m->flags & M_ATSTART) && (found != ctx->buf))
575 s_from = strlen(m->from.c);
576 s_to = strlen(m->to);
577 match = found - ctx->buf;
/* M_ATEND with text after the hit: keep looking further along */
578 if ((s_from < strlen(found))
579 && (m->flags & M_ATEND)) {
580 found = strstr(ctx->buf+match+s_from,
585 found = strstr(ctx->buf+match+s_to,
588 VERBOSE(ap_log_rerror(APLOG_MARK, APLOG_TRACE3,
590 "E: matched %s, substituting %s",
592 len = strlen(ctx->buf);
594 preserve(ctx, s_to - s_from);
595 memmove(ctx->buf+match+s_to,
596 ctx->buf+match+s_from,
597 len + 1 - s_from - match);
598 memcpy(ctx->buf+match, m->to, s_to);
601 memcpy(ctx->buf+match, m->to, s_to);
602 memmove(ctx->buf+match+s_to,
603 ctx->buf+match+s_from,
604 len + 1 - s_from - match);
609 if (num_match && (m->flags & M_LAST))
/* attribute name only */
618 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], NULL);
621 if (ctx->cfg->flags != 0)
622 normalise(ctx->cfg->flags, ctx->buf);
624 /* write the attribute, using pcharacters to html-escape
625 anything that needs it in the value.
627 ap_fputstrs(ctx->f->next, ctx->bb, " ", a[0], "=\"", NULL);
628 pcharacters(ctx, (const xmlChar*)ctx->buf, strlen(ctx->buf));
629 ap_fputc(ctx->f->next, ctx->bb, '"');
634 if (desc && desc->empty)
635 ap_fputs(ctx->f->next, ctx->bb, ctx->cfg->etag);
637 ap_fputc(ctx->f->next, ctx->bb, '>');
639 if ((enforce > 0) && (required_attrs > 0)) {
640 /* if there are more required attributes than we found then complain */
641 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, ctx->f->r, APLOGNO(01420)
642 "HTML element %s is missing %d required attributes",
643 name, required_attrs);
/* Scans buf for META http-equiv elements and copies what they declare into
 * the response headers.
 *   r    current request; its pool supplies all allocations
 *   buf  start of the document text
 * Returns a meta record holding the start and end offsets of a Content-Type
 * META element when one is seen (allocated from r->pool).
 * NOTE(review): the initial value of the return variable is not visible here. */
647 static meta *metafix(request_rec *r, const char *buf)
/* pmatch[0] is the whole tag, pmatch[1] the "http-equiv" group */
655 ap_regmatch_t pmatch[2];
658 while (!ap_regexec(seek_meta, buf+offs, 2, pmatch, 0)) {
/* start just after "http-equiv" and advance to the first letter of the name.
 * NOTE(review): this scan has no end-of-string test - confirm buf is
 * guaranteed to contain a letter past this point. */
661 p = buf+offs+pmatch[1].rm_eo;
662 while (!apr_isalpha(*++p));
/* the header name runs over alphanumerics and '-' */
663 for (q = p; apr_isalnum(*q) || (*q == '-'); ++q);
664 header = apr_pstrndup(r->pool, p, q-p);
/* taken when the header name does not start with "Content-" */
665 if (strncasecmp(header, "Content-", 8)) {
666 /* find content=... string */
667 p = apr_strmatch(seek_content, buf+offs+pmatch[0].rm_so,
668 pmatch[0].rm_eo - pmatch[0].rm_so);
669 /* if it doesn't contain "content", ignore, don't crash! */
673 while (apr_isspace(*p))
675 /* XXX Should we search for another content= pattern? */
678 while (*p && apr_isspace(*++p));
/* quoted value: read up to the matching delimiter */
679 if ((*p == '\'') || (*p == '"')) {
681 for (q = p; *q && *q != delim; ++q);
682 /* No terminating delimiter found? Skip the boggus directive */
/* unquoted value: read up to whitespace or the end of the tag */
686 for (q = p; *q && !apr_isspace(*q) && (*q != '>'); ++q);
688 content = apr_pstrndup(r->pool, p, q-p);
/* Content-Type: remember where the element sits in the buffer */
693 else if (!strncasecmp(header, "Content-Type", 12)) {
694 ret = apr_palloc(r->pool, sizeof(meta));
695 ret->start = offs+pmatch[0].rm_so;
696 ret->end = offs+pmatch[0].rm_eo;
698 if (header && content) {
700 ap_log_rerror(APLOG_MARK, APLOG_TRACE2, 0, r,
701 "Adding header [%s: %s] from HTML META",
704 apr_table_setn(r->headers_out, header, content);
/* continue after this tag */
706 offs += pmatch[0].rm_eo;
/* Expands ${var} and ${var|default} references in str using the request's
 * subprocess environment.
 *   r    current request (environment table and pool)
 *   str  string that may contain references
 * Each expansion builds a new string from r->pool.
 * NOTE(review): the loop structure and the return statement are not visible. */
711 static const char *interpolate_vars(request_rec *r, const char *str)
718 const char *replacement;
/* locate the next "${" and its closing '}' */
722 if (start = ap_strstr_c(start, "${"), start == NULL)
725 if (end = ap_strchr_c(start+2, '}'), end == NULL)
/* an optional '|' introduces a default value */
728 delim = ap_strchr_c(start, '|');
729 before = apr_pstrndup(r->pool, str, start-str);
/* variable name when a default is present */
732 var = apr_pstrndup(r->pool, start+2, delim-start-2);
/* variable name when there is no default */
735 var = apr_pstrndup(r->pool, start+2, end-start-2);
737 replacement = apr_table_get(r->subprocess_env, var);
/* the text between '|' and '}' */
740 replacement = apr_pstrndup(r->pool, delim+1, end-delim-1);
744 str = apr_pstrcat(r->pool, before, replacement, after, NULL);
745 ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, r,
746 "Interpolating %s => %s", var, replacement);
/* Derives the per-request rule set from the configured URL maps: evaluates
 * each rule's condition, copies the rule into the request pool and
 * interpolates its from / to strings when the rule's flags ask for it.
 * NOTE(review): the code linking each copy into the request's list is not
 * visible in this excerpt. */
750 static void fixup_rules(saxctxt *ctx)
755 request_rec *r = ctx->f->r;
757 for (p = ctx->cfg->map; p; p = p->next) {
758 if (p->cond != NULL) {
760 int ok = ap_expr_exec(r, p->cond, &err);
762 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(01421)
763 "Error evaluating expr: %s", err);
766 continue; /* condition is unsatisfied */
/* work on a private copy so the configuration itself is never modified */
770 newp = apr_pmemdup(r->pool, p, sizeof(urlmap));
772 if (newp->flags & M_INTERPOLATE_FROM) {
773 newp->from.c = interpolate_vars(r, newp->from.c);
774 if (!newp->from.c || !*newp->from.c)
775 continue; /* don't use empty from-pattern */
/* the pattern text changed, so a regex rule is compiled per request */
776 if (newp->flags & M_REGEX) {
777 newp->from.r = ap_pregcomp(r->pool, newp->from.c,
781 if (newp->flags & M_INTERPOLATE_TO) {
782 newp->to = interpolate_vars(r, newp->to);
784 /* evaluate p->cond; continue if unsatisfied */
785 /* create new urlmap with memcpy and append to map */
786 /* interpolate from if flagged to do so */
787 /* interpolate to if flagged to do so */
/* Decides whether the filter applies to this response and, if so, creates the
 * per-request context stored in f->ctx.
 *   f  the proxy-html output filter instance
 * When a reason to bail out is found it is logged at TRACE1 and the filter is
 * removed from the chain.  proxy_html_filter() calls this for every brigade.
 * NOTE(review): how the PROXY_HTML_FORCE value alters these checks is not
 * visible in this excerpt. */
800 static saxctxt *check_filter_init (ap_filter_t *f)
804 proxy_html_conf *cfg;
806 const char *errmsg = NULL;
807 cfg = ap_get_module_config(f->r->per_dir_config, &proxy_html_module);
808 force = apr_table_get(f->r->subprocess_env, "PROXY_HTML_FORCE");
811 if (!f->r->proxyreq) {
812 errmsg = "Non-proxy request; not inserting proxy-html filter";
814 else if (!f->r->content_type) {
815 errmsg = "No content-type; bailing out of proxy-html filter";
/* prefix comparison, so parameters after the media type are tolerated */
817 else if (strncasecmp(f->r->content_type, "text/html", 9) &&
818 strncasecmp(f->r->content_type,
819 "application/xhtml+xml", 21)) {
820 errmsg = "Non-HTML content; not inserting proxy-html filter";
824 errmsg = "No links configured: nothing for proxy-html filter to do";
829 ap_log_rerror(APLOG_MARK, APLOG_TRACE1, 0, f->r, "%s", errmsg);
831 ap_remove_output_filter(f);
/* zero-filled context allocated from the request pool */
835 fctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(saxctxt));
837 fctx->bb = apr_brigade_create(f->r->pool,
838 f->r->connection->bucket_alloc);
/* the rewritten body may differ in size from the original */
840 apr_table_unset(f->r->headers_out, "Content-Length");
845 fctx->map = cfg->map;
846 /* defer dealing with charset_out until after sniffing charset_in
847 * so we can support setting one to t'other.
/* Output filter entry point, registered under the name "proxy-html".
 *   f   filter instance
 *   bb  incoming brigade
 * Without a context the brigade is passed on untouched.  Otherwise each data
 * bucket is fed to the libxml2 push parser, whose SAX callbacks write the
 * rewritten document into the context's own brigade.  The parser is created
 * lazily from the first data bucket, after the charset has been settled.
 * EOS finalises the parser and passes the output on; FLUSH is forwarded only
 * once the parser exists. */
853 static apr_status_t proxy_html_filter(ap_filter_t *f, apr_bucket_brigade *bb)
859 apr_size_t bytes = 0;
860 #ifndef USE_OLD_LIBXML2
861 int xmlopts = XML_PARSE_RECOVER | XML_PARSE_NONET |
862 XML_PARSE_NOBLANKS | XML_PARSE_NOERROR | XML_PARSE_NOWARNING;
865 saxctxt *ctxt = check_filter_init(f);
867 return ap_pass_brigade(f->next, bb);
868 for (b = APR_BRIGADE_FIRST(bb);
869 b != APR_BRIGADE_SENTINEL(bb);
870 b = APR_BUCKET_NEXT(b)) {
871 if (APR_BUCKET_IS_METADATA(b)) {
872 if (APR_BUCKET_IS_EOS(b)) {
873 if (ctxt->parser != NULL) {
/* zero bytes with the terminate flag set: finish parsing */
874 consume_buffer(ctxt, buf, 0, 1);
876 APR_BRIGADE_INSERT_TAIL(ctxt->bb,
877 apr_bucket_eos_create(ctxt->bb->bucket_alloc));
/* NOTE(review): the status returned by ap_pass_brigade is not used here */
878 ap_pass_brigade(ctxt->f->next, ctxt->bb);
880 else if (APR_BUCKET_IS_FLUSH(b)) {
881 /* pass on flush, except at start where it would cause
882 * headers to be sent before doc sniffing
884 if (ctxt->parser != NULL) {
885 ap_fflush(ctxt->f->next, ctxt->bb);
889 else if (apr_bucket_read(b, &buf, &bytes, APR_BLOCK_READ)
891 if (ctxt->parser == NULL) {
893 if (!xml2enc_charset ||
894 (xml2enc_charset(f->r, &enc, &cenc) != APR_SUCCESS)) {
895 if (!xml2enc_charset)
896 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r, APLOGNO(01422)
897 "No i18n support found. Install mod_xml2enc if required");
898 enc = XML_CHAR_ENCODING_NONE;
899 ap_set_content_type(f->r, "text/html;charset=utf-8");
902 /* if we wanted a non-default charset_out, insert the
903 * xml2enc filter now that we've sniffed it
905 if (ctxt->cfg->charset_out && xml2enc_filter) {
906 if (*ctxt->cfg->charset_out != '*')
907 cenc = ctxt->cfg->charset_out;
908 xml2enc_filter(f->r, cenc, ENCIO_OUTPUT);
909 ap_set_content_type(f->r,
910 apr_pstrcat(f->r->pool,
911 "text/html;charset=",
914 else /* Normal case, everything worked, utf-8 output */
915 ap_set_content_type(f->r, "text/html;charset=utf-8");
/* the configured doctype goes out ahead of any parsed content */
918 ap_fputs(f->next, ctxt->bb, ctxt->cfg->doctype);
919 ctxt->parser = htmlCreatePushParserCtxt(&sax, ctxt, buf,
/* no parser could be created: pass the data through and step aside */
923 if (ctxt->parser == NULL) {
924 apr_status_t rv = ap_pass_brigade(f->next, bb);
925 ap_remove_output_filter(f);
/* free the parser context together with the request pool */
928 apr_pool_cleanup_register(f->r->pool, ctxt->parser,
929 (int(*)(void*))htmlFreeParserCtxt,
930 apr_pool_cleanup_null);
931 #ifndef USE_OLD_LIBXML2
/* xmlCtxtUseOptions returns the option bits it could not apply */
932 if (xmlopts = xmlCtxtUseOptions(ctxt->parser, xmlopts), xmlopts)
933 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, f->r, APLOGNO(01423)
934 "Unsupported parser opts %x", xmlopts);
936 if (ctxt->cfg->metafix)
937 m = metafix(f->r, buf);
/* feed the text before and after the META element, skipping the element */
939 consume_buffer(ctxt, buf, m->start, 0);
940 consume_buffer(ctxt, buf+m->end, bytes-m->end, 0);
943 consume_buffer(ctxt, buf, bytes, 0);
947 consume_buffer(ctxt, buf, bytes, 0);
951 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, APLOGNO(01424)
952 "Error in bucket read");
955 /*ap_fflush(ctxt->f->next, ctxt->bb); // uncomment for debug */
/* the input has been consumed by the parser */
956 apr_brigade_cleanup(bb);
/* Per-directory configuration constructor.
 *   pool  pool to allocate the configuration from
 *   x     not used in the visible lines
 * Returns a zero-filled proxy_html_conf carrying the default doctype and
 * end-tag style.  proxy_html_merge() later recognises these defaults by
 * pointer comparison. */
960 static void *proxy_html_config(apr_pool_t *pool, char *x)
962 proxy_html_conf *ret = apr_pcalloc(pool, sizeof(proxy_html_conf));
963 ret->doctype = DEFAULT_DOCTYPE;
964 ret->etag = DEFAULT_ETAG;
966 /* ret->interp = 1; */
967 /* don't initialise links and events until they get set/used */
/* Per-directory configuration merge.
 *   pool  pool for the merged configuration
 *   BASE  parent (less specific) proxy_html_conf
 *   ADD   child (more specific) proxy_html_conf
 * Returns a new proxy_html_conf combining the two.
 * NOTE(review): the result comes from apr_palloc, which does not zero memory,
 * so every field must be assigned here - the excerpt does not show all of them. */
971 static void *proxy_html_merge(apr_pool_t *pool, void *BASE, void *ADD)
973 proxy_html_conf *base = (proxy_html_conf *) BASE;
974 proxy_html_conf *add = (proxy_html_conf *) ADD;
975 proxy_html_conf *conf = apr_palloc(pool, sizeof(proxy_html_conf));
977 /* don't merge declarations - just use the most specific */
978 conf->links = (add->links == NULL) ? base->links : add->links;
979 conf->events = (add->events == NULL) ? base->events : add->events;
981 conf->charset_out = (add->charset_out == NULL)
982 ? base->charset_out : add->charset_out;
/* both levels define rules: build a combined list from copies */
984 if (add->map && base->map) {
/* each copy is pushed onto the head of the new list, so entries end up in the
 * reverse of the order in which they are visited */
987 for (a = base->map; a; a = a->next) {
988 urlmap *save = conf->map;
989 conf->map = apr_pmemdup(pool, a, sizeof(urlmap));
990 conf->map->next = save;
992 for (a = add->map; a; a = a->next) {
993 urlmap *save = conf->map;
994 conf->map = apr_pmemdup(pool, a, sizeof(urlmap));
995 conf->map->next = save;
/* only one level (or neither) defines rules: share that list as it is */
999 conf->map = add->map ? add->map : base->map;
/* a doctype / etag still at its default counts as unset and is inherited */
1001 conf->doctype = (add->doctype == DEFAULT_DOCTYPE)
1002 ? base->doctype : add->doctype;
1003 conf->etag = (add->etag == DEFAULT_ETAG) ? base->etag : add->etag;
/* the buffer size always comes from the more specific configuration */
1004 conf->bufsz = add->bufsz;
/* "reset": ignore the base settings and clear the reset bit from the result */
1005 if (add->flags & NORM_RESET) {
1006 conf->flags = add->flags ^ NORM_RESET;
1007 conf->metafix = add->metafix;
1008 conf->extfix = add->extfix;
1009 conf->interp = add->interp;
1010 conf->strip_comments = add->strip_comments;
1011 conf->enabled = add->enabled;
/* otherwise an option is on if either level turned it on */
1014 conf->flags = base->flags | add->flags;
1015 conf->metafix = base->metafix | add->metafix;
1016 conf->extfix = base->extfix | add->extfix;
1017 conf->interp = base->interp | add->interp;
1018 conf->strip_comments = base->strip_comments | add->strip_comments;
1019 conf->enabled = add->enabled | base->enabled;
/* REGFLAG yields n when the flag string s is non-NULL and contains the
 * character c, otherwise 0.
 * XREGFLAG is the inverse: it yields n when s is NULL or does not contain c,
 * so the corresponding bit is on by default and the letter switches it off.
 * Every use of a macro parameter is parenthesised so that any expression can
 * be passed for s. */
1023 #define REGFLAG(n,s,c) (((s)&&(ap_strchr_c((s),(c))!=NULL)) ? (n) : 0)
1024 #define XREGFLAG(n,s,c) ((!(s)||(ap_strchr_c((s),(c))==NULL)) ? (n) : 0)
/* Fills in a urlmap from the textual arguments of a ProxyHTMLURLMap directive.
 *   cmd     directive context (pools)
 *   newmap  rule to populate
 *   from    pattern to look for
 *   to      replacement text
 *   flags   flag letters, may be NULL or empty
 *   cond    optional condition
 * Returns the value of err, which starts as NULL and is handed to
 * ap_expr_parse_cmd() when a condition is parsed.
 * Flag letters: 'h', 'e' and 'c' switch OFF the HTML, events and CDATA
 * contexts (all three are on when the letter is absent); '^', '$', 'R', 'L',
 * 'l', 'V' and 'v' switch their bits on.  For regex rules 'x', 'i', 'n' and
 * 's' select the regex compile flags. */
1025 static const char *comp_urlmap(cmd_parms *cmd, urlmap *newmap,
1026 const char *from, const char *to,
1027 const char *flags, const char *cond)
1029 const char *err = NULL;
1031 = XREGFLAG(M_HTML,flags,'h')
1032 | XREGFLAG(M_EVENTS,flags,'e')
1033 | XREGFLAG(M_CDATA,flags,'c')
1034 | REGFLAG(M_ATSTART,flags,'^')
1035 | REGFLAG(M_ATEND,flags,'$')
1036 | REGFLAG(M_REGEX,flags,'R')
1037 | REGFLAG(M_LAST,flags,'L')
1038 | REGFLAG(M_NOTLAST,flags,'l')
1039 | REGFLAG(M_INTERPOLATE_TO,flags,'V')
1040 | REGFLAG(M_INTERPOLATE_FROM,flags,'v');
/* keep the pattern as text when it is a plain string, or when it still has to
 * be interpolated per request (fixup_rules() compiles it then) */
1042 if ((newmap->flags & M_INTERPOLATE_FROM) || !(newmap->flags & M_REGEX)) {
1043 newmap->from.c = from;
/* otherwise compile the regex now, from the configuration pool */
1048 = REGFLAG(AP_REG_EXTENDED,flags,'x')
1049 | REGFLAG(AP_REG_ICASE,flags,'i')
1050 | REGFLAG(AP_REG_NOSUB,flags,'n')
1051 | REGFLAG(AP_REG_NEWLINE,flags,'s');
1052 newmap->from.r = ap_pregcomp(cmd->pool, from, newmap->regflags);
1056 /* back-compatibility: support old-style ENV expressions
1057 * by converting to ap_expr syntax.
1059 * 1. var --> env(var)
1060 * 2. var=val --> env(var)=val
1061 * 3. !var --> !env(var)
1062 * 4. !var=val --> env(var)!=val
1064 char *newcond = NULL;
1065 if (ap_rxplus_exec(cmd->temp_pool, old_expr, cond, &newcond)) {
1066 /* we got a substitution. Check for the case (3) above
1067 * that the regexp gets wrong: a negation without a comparison.
1069 if ((cond[0] == '!') && !ap_strchr_c(cond, '=')) {
1070 memmove(newcond+1, newcond, strlen(newcond)-1);
1075 newmap->cond = ap_expr_parse_cmd(cmd, cond, 0, &err, NULL);
1078 newmap->cond = NULL;
/* Handler for the ProxyHTMLURLMap directive (raw arguments).
 *   cmd   directive context
 *   CFG   the proxy_html_conf being configured
 *   args  the unparsed remainder of the directive line
 * Splits args into from, to, optional flags and optional condition, allocates
 * a new rule and lets comp_urlmap() populate it.  Returns comp_urlmap()'s
 * result on the path shown; the usage string below is the error text for
 * malformed arguments. */
1083 static const char *set_urlmap(cmd_parms *cmd, void *CFG, const char *args)
1085 proxy_html_conf *cfg = (proxy_html_conf *)CFG;
1087 apr_pool_t *pool = cmd->pool;
1090 "Usage: ProxyHTMLURLMap from-pattern to-pattern [flags] [cond]";
1094 const char *cond = NULL;
1096 if (from = ap_getword_conf(cmd->pool, &args), !from)
1098 if (to = ap_getword_conf(cmd->pool, &args), !to)
1100 flags = ap_getword_conf(cmd->pool, &args);
/* a condition can only follow a non-empty flags word */
1101 if (flags && *flags)
1102 cond = ap_getword_conf(cmd->pool, &args);
1106 /* the args look OK, so let's use them */
1107 newmap = apr_palloc(pool, sizeof(urlmap));
1108 newmap->next = NULL;
/* walk to the last rule so the new one can be appended.
 * NOTE(review): this dereferences cfg->map; the guard for an empty list is
 * not visible in this excerpt - confirm it exists. */
1110 for (map = cfg->map; map->next; map = map->next);
1116 return comp_urlmap(cmd, newmap, from, to, flags, cond);
/* Handler for the ProxyHTMLDoctype directive.
 *   cmd  directive context
 *   CFG  the proxy_html_conf being configured
 *   t    "xhtml", "html", "html5" (case-insensitive) or a literal doctype
 *   l    optional second argument
 * For the named types l equal to "legacy" selects the transitional doctype.
 * Any other t is stored verbatim as the doctype, and l beginning with 'x' or
 * 'X' then selects XHTML-style empty-element tags. */
1119 static const char *set_doctype(cmd_parms *cmd, void *CFG,
1120 const char *t, const char *l)
1122 proxy_html_conf *cfg = (proxy_html_conf *)CFG;
1123 if (!strcasecmp(t, "xhtml")) {
1124 cfg->etag = xhtml_etag;
1125 if (l && !strcasecmp(l, "legacy"))
1126 cfg->doctype = fpi_xhtml_legacy;
1128 cfg->doctype = fpi_xhtml;
1130 else if (!strcasecmp(t, "html")) {
1131 cfg->etag = html_etag;
1132 if (l && !strcasecmp(l, "legacy"))
1133 cfg->doctype = fpi_html_legacy;
1135 cfg->doctype = fpi_html;
1137 else if (!strcasecmp(t, "html5")) {
1138 cfg->etag = html_etag;
1139 cfg->doctype = fpi_html5;
/* custom doctype: the copy never equals any fpi_* pointer, so the
 * doctype-based enforcement elsewhere in the file does not apply to it */
1142 cfg->doctype = apr_pstrdup(cmd->pool, t);
1143 if (l && ((l[0] == 'x') || (l[0] == 'X')))
1144 cfg->etag = xhtml_etag;
1146 cfg->etag = html_etag;
/* Handler for the ProxyHTMLFixups directive; called once per argument.
 *   cmd  directive context
 *   CFG  the proxy_html_conf being configured
 *   arg  "lowercase", "dospath" or "reset" (case-insensitive)
 * Sets the matching NORM_* bit in cfg->flags.
 * NOTE(review): handling of unrecognised words is not visible in this excerpt. */
1151 static const char *set_flags(cmd_parms *cmd, void *CFG, const char *arg)
1153 proxy_html_conf *cfg = CFG;
1155 if (!strcasecmp(arg, "lowercase"))
1156 cfg->flags |= NORM_LC;
1157 else if (!strcasecmp(arg, "dospath"))
1158 cfg->flags |= NORM_MSSLASH;
1159 else if (!strcasecmp(arg, "reset"))
1160 cfg->flags |= NORM_RESET;
/* Handler for the ProxyHTMLEvents directive; called once per argument.
 *   cmd  directive context
 *   CFG  the proxy_html_conf being configured
 *   arg  name of an attribute to treat as a scripting event
 * Creates the events array on first use and pushes one tattr slot onto it.
 * NOTE(review): the assignment that stores arg in the slot is not visible. */
1165 static const char *set_events(cmd_parms *cmd, void *CFG, const char *arg)
1168 proxy_html_conf *cfg = CFG;
1169 if (cfg->events == NULL)
1170 cfg->events = apr_array_make(cmd->pool, 20, sizeof(tattr));
1171 attr = apr_array_push(cfg->events);
/* Handler for the ProxyHTMLLinks directive; called once per element/attribute
 * pair.
 *   cmd  directive context
 *   CFG  the proxy_html_conf being configured
 *   elt  element name
 *   att  attribute of that element that holds a URL
 * Creates the links hash on first use, creates the per-element attribute array
 * when the element is new, and pushes one tattr slot onto that array. */
1176 static const char *set_links(cmd_parms *cmd, void *CFG,
1177 const char *elt, const char *att)
1179 apr_array_header_t *attrs;
1181 proxy_html_conf *cfg = CFG;
1183 if (cfg->links == NULL)
1184 cfg->links = apr_hash_make(cmd->pool);
1186 attrs = apr_hash_get(cfg->links, elt, APR_HASH_KEY_STRING);
/* The array stores tattr values, not pointers: pstartElement() reads elts as
 * a tattr array, so the element size must be sizeof(tattr), as in set_events(). */
1188 attrs = apr_array_make(cmd->pool, 2, sizeof(tattr));
1189 apr_hash_set(cfg->links, elt, APR_HASH_KEY_STRING, attrs);
1191 attr = apr_array_push(attrs);
/* Configuration directives provided by this module.  Every directive is
 * allowed in server and directory context.  Flag, integer and string
 * directives write straight into the named proxy_html_conf field through the
 * generic slot setters; the others use the handlers defined above.
 * The help texts list every keyword the handlers accept. */
1195 static const command_rec proxy_html_cmds[] = {
1196 AP_INIT_ITERATE("ProxyHTMLEvents", set_events, NULL,
1197 RSRC_CONF|ACCESS_CONF,
1198 "Strings to be treated as scripting events"),
1199 AP_INIT_ITERATE2("ProxyHTMLLinks", set_links, NULL,
1200 RSRC_CONF|ACCESS_CONF, "Declare HTML Attributes"),
1201 AP_INIT_RAW_ARGS("ProxyHTMLURLMap", set_urlmap, NULL,
1202 RSRC_CONF|ACCESS_CONF, "Map URL From To"),
/* set_doctype() also recognises "html5" */
1203 AP_INIT_TAKE12("ProxyHTMLDoctype", set_doctype, NULL,
1204 RSRC_CONF|ACCESS_CONF, "(HTML|XHTML|HTML5) [Legacy]"),
/* set_flags() also recognises "reset" */
1205 AP_INIT_ITERATE("ProxyHTMLFixups", set_flags, NULL,
1206 RSRC_CONF|ACCESS_CONF, "Options are lowercase, dospath, reset"),
1207 AP_INIT_FLAG("ProxyHTMLMeta", ap_set_flag_slot,
1208 (void*)APR_OFFSETOF(proxy_html_conf, metafix),
1209 RSRC_CONF|ACCESS_CONF, "Fix META http-equiv elements"),
1210 AP_INIT_FLAG("ProxyHTMLInterp", ap_set_flag_slot,
1211 (void*)APR_OFFSETOF(proxy_html_conf, interp),
1212 RSRC_CONF|ACCESS_CONF,
1213 "Support interpolation and conditions in URLMaps"),
1214 AP_INIT_FLAG("ProxyHTMLExtended", ap_set_flag_slot,
1215 (void*)APR_OFFSETOF(proxy_html_conf, extfix),
1216 RSRC_CONF|ACCESS_CONF, "Map URLs in Javascript and CSS"),
1217 AP_INIT_FLAG("ProxyHTMLStripComments", ap_set_flag_slot,
1218 (void*)APR_OFFSETOF(proxy_html_conf, strip_comments),
1219 RSRC_CONF|ACCESS_CONF, "Strip out comments"),
1220 AP_INIT_TAKE1("ProxyHTMLBufSize", ap_set_int_slot,
1221 (void*)APR_OFFSETOF(proxy_html_conf, bufsz),
1222 RSRC_CONF|ACCESS_CONF, "Buffer size"),
1223 AP_INIT_TAKE1("ProxyHTMLCharsetOut", ap_set_string_slot,
1224 (void*)APR_OFFSETOF(proxy_html_conf, charset_out),
1225 RSRC_CONF|ACCESS_CONF, "Usage: ProxyHTMLCharsetOut charset"),
1226 AP_INIT_FLAG("ProxyHTMLEnable", ap_set_flag_slot,
1227 (void*)APR_OFFSETOF(proxy_html_conf, enabled),
1228 RSRC_CONF|ACCESS_CONF,
1229 "Enable proxy-html and xml2enc filters"),
/* pre_config hook: prepares the module's file-scope state.
 *   p   pool used for the META regex and the "content" search pattern
 *   p1  pool used for the old-style condition rewriter
 *   p2  pool passed to the logging call
 * Compiles the patterns, fills in the SAX handler table and looks up the
 * optional mod_xml2enc functions.
 * NOTE(review): the return statement is not visible in this excerpt. */
1232 static int mod_proxy_html(apr_pool_t *p, apr_pool_t *p1, apr_pool_t *p2)
1234 seek_meta = ap_pregcomp(p, "<meta[^>]*(http-equiv)[^>]*>",
1235 AP_REG_EXTENDED|AP_REG_ICASE);
1236 seek_content = apr_strmatch_precompile(p, "content", 0);
/* only the callbacks assigned below are active; all others stay NULL */
1237 memset(&sax, 0, sizeof(htmlSAXHandler));
1238 sax.startElement = pstartElement;
1239 sax.endElement = pendElement;
1240 sax.characters = pcharacters;
1241 sax.comment = pcomment;
1242 sax.cdataBlock = pcdata;
1243 xml2enc_charset = APR_RETRIEVE_OPTIONAL_FN(xml2enc_charset);
1244 xml2enc_filter = APR_RETRIEVE_OPTIONAL_FN(xml2enc_filter);
1245 if (!xml2enc_charset) {
1246 ap_log_perror(APLOG_MARK, APLOG_NOTICE, 0, p2, APLOGNO(01425)
1247 "I18n support in mod_proxy_html requires mod_xml2enc. "
1248 "Without it, non-ASCII characters in proxied pages are "
1249 "likely to display incorrectly.");
1252 /* old_expr only needs to last the life of the config phase */
/* sed-style substitution that turns an old "[!]var[=val]" condition into a
 * reqenv('var') expression; comp_urlmap() applies it */
1253 old_expr = ap_rxplus_compile(p1, "s/^(!)?(\\w+)((=)(.+))?$/reqenv('$2')$1$4'$5'/");
/* insert_filter hook: adds the "proxy-html" output filter to the request and
 * asks mod_xml2enc to perform its input checks.
 *   r  current request
 * NOTE(review): the conditions guarding these calls (presumably cfg->enabled
 * and a non-NULL xml2enc_filter) are not visible in this excerpt - confirm. */
1256 static void proxy_html_insert(request_rec *r)
1258 proxy_html_conf *cfg;
1259 cfg = ap_get_module_config(r->per_dir_config, &proxy_html_module);
1262 xml2enc_filter(r, NULL, ENCIO_INPUT_CHECKS);
1263 ap_add_output_filter("proxy-html", NULL, r, r->connection);
/* Module hook registration.
 *   p  pool supplied by the server core (not used in the visible lines)
 * Registers the "proxy-html" output filter as a resource-level filter that
 * may change the body and its length, plus the pre_config and insert_filter
 * hooks.  aszSucc is passed in the successor position of the insert_filter
 * registration, i.e. this hook is ordered before mod_filter.c. */
1266 static void proxy_html_hooks(apr_pool_t *p)
1268 static const char *aszSucc[] = { "mod_filter.c", NULL };
1269 ap_register_output_filter_protocol("proxy-html", proxy_html_filter,
1270 NULL, AP_FTYPE_RESOURCE,
1271 AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH);
1272 /* move this to pre_config so old_expr is available to interpret
1273 * old-style conditions on URL maps.
1275 ap_hook_pre_config(mod_proxy_html, NULL, NULL, APR_HOOK_MIDDLE);
1276 ap_hook_insert_filter(proxy_html_insert, NULL, aszSucc, APR_HOOK_MIDDLE);
1279 AP_DECLARE_MODULE(proxy_html) = {
1280 STANDARD20_MODULE_STUFF,