1 /* Copyright (c) 2007-11, WebThing Ltd
2 * Copyright (c) 2011-, The Apache Software Foundation
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 #define XML2ENC_DECLARE_EXPORT
27 #include <libxml/encoding.h>
29 #include "http_protocol.h"
30 #include "http_config.h"
32 #include "apr_strings.h"
33 #include "apr_xlate.h"
35 #include "apr_optional.h"
36 #include "mod_xml2enc.h"
38 module AP_MODULE_DECLARE_DATA xml2enc_module;
42 #define APR_BRIGADE_DO(b,bb) for (b = APR_BRIGADE_FIRST(bb); \
43 b != APR_BRIGADE_SENTINEL(bb); \
44 b = APR_BUCKET_NEXT(b))
46 #define ENC_INITIALISED 0x100
47 #define ENC_SEEN_EOS 0x200
48 #define ENC_SKIPTO ENCIO_SKIPTO
50 #define HAVE_ENCODING(enc) \
51 (((enc)!=XML_CHAR_ENCODING_NONE)&&((enc)!=XML_CHAR_ENCODING_ERROR))
54 * XXX: Check all those ap_assert()s and replace those that should not happen
55 * XXX: with AP_DEBUG_ASSERT and those that may happen with proper error
59 xmlCharEncoding xml2enc;
65 apr_bucket_brigade* bbnext;
66 apr_bucket_brigade* bbsave;
71 const char* default_charset;
72 xmlCharEncoding default_encoding;
73 apr_array_header_t* skipto;
80 static ap_regex_t* seek_meta_ctype;
81 static ap_regex_t* seek_charset;
83 static apr_status_t xml2enc_filter(request_rec* r, const char* enc,
86 /* set up a ready-initialised ctx to convert to enc, and insert filter */
89 unsigned int flags = (mode ^ ENCIO);
90 if ((mode & ENCIO) == ENCIO_OUTPUT) {
91 rv = apr_xlate_open(&convset, enc, "UTF-8", r->pool);
92 flags |= ENC_INITIALISED;
94 else if ((mode & ENCIO) == ENCIO_INPUT) {
95 rv = apr_xlate_open(&convset, "UTF-8", enc, r->pool);
96 flags |= ENC_INITIALISED;
98 else if ((mode & ENCIO) == ENCIO_INPUT_CHECKS) {
100 rv = APR_SUCCESS; /* we'll initialise later by sniffing */
104 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(01426)
105 "xml2enc: bad mode %x", mode);
107 if (rv == APR_SUCCESS) {
108 xml2ctx* ctx = apr_pcalloc(r->pool, sizeof(xml2ctx));
110 if (flags & ENC_INITIALISED) {
111 ctx->convset = convset;
113 ctx->buf = apr_palloc(r->pool, (apr_size_t)ctx->bblen);
115 ap_add_output_filter("xml2enc", ctx, r, r->connection);
118 ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r, APLOGNO(01427)
119 "xml2enc: Charset %s not supported.", enc) ;
124 /* This needs to operate only when we're using htmlParser */
125 /* Different modules may apply different rules here. Ho, hum. */
126 static void fix_skipto(request_rec* r, xml2ctx* ctx)
129 xml2cfg* cfg = ap_get_module_config(r->per_dir_config, &xml2enc_module);
130 if ((cfg->skipto != NULL) && (ctx->flags | ENC_SKIPTO)) {
132 char* p = ap_strchr(ctx->buf, '<');
133 tattr* starts = (tattr*) cfg->skipto->elts;
134 while (!found && p && *p) {
136 for (i = 0; i < cfg->skipto->nelts; ++i) {
137 if (!strncasecmp(p+1, starts[i].val, strlen(starts[i].val))) {
138 /* found a starting element. Strip all that comes before. */
141 rv = apr_brigade_partition(ctx->bbsave, (p-ctx->buf),
143 ap_assert(rv == APR_SUCCESS);
144 while (b = APR_BRIGADE_FIRST(ctx->bbsave), b != bstart) {
145 APR_BUCKET_REMOVE(b);
146 apr_bucket_destroy(b);
148 ctx->bytes -= (p-ctx->buf);
151 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(01428)
152 "Skipped to first <%s> element",
157 p = ap_strchr(p+1, '<');
160 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, r, APLOGNO(01429)
161 "Failed to find start of recognised HTML!");
165 static void sniff_encoding(request_rec* r, xml2ctx* ctx)
167 xml2cfg* cfg = NULL; /* initialise to shut compiler warnings up */
172 ap_regmatch_t match[2] ;
174 const char* ctype = r->content_type;
177 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(01430)
178 "Content-Type is %s", ctype) ;
180 /* If we've got it in the HTTP headers, there's nothing to do */
181 if (ctype && (p = ap_strcasestr(ctype, "charset=") , p != NULL)) {
183 if (ctx->encoding = apr_pstrndup(r->pool, p, strcspn(p, " ;") ),
185 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, APLOGNO(01431)
186 "Got charset %s from HTTP headers", ctx->encoding) ;
187 ctx->xml2enc = xmlParseCharEncoding(ctx->encoding);
192 /* to sniff, first we look for BOM */
193 if (ctx->xml2enc == XML_CHAR_ENCODING_NONE) {
194 ctx->xml2enc = xmlDetectCharEncoding((const xmlChar*)ctx->buf,
196 if (HAVE_ENCODING(ctx->xml2enc)) {
197 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, APLOGNO(01432)
198 "Got charset from XML rules.") ;
199 ctx->encoding = xmlGetCharEncodingName(ctx->xml2enc);
203 /* If none of the above, look for a META-thingey */
204 /* also we're probably about to invalidate it, so we remove it. */
205 if (ap_regexec(seek_meta_ctype, ctx->buf, 1, match, 0) == 0 ) {
206 /* get markers on the start and end of the match */
207 rv = apr_brigade_partition(ctx->bbsave, match[0].rm_eo, &cute);
208 ap_assert(rv == APR_SUCCESS);
209 rv = apr_brigade_partition(ctx->bbsave, match[0].rm_so, &cutb);
210 ap_assert(rv == APR_SUCCESS);
211 /* now set length of useful buf for start-of-data hooks */
212 ctx->bytes = match[0].rm_so;
213 if (ctx->encoding == NULL) {
214 p = apr_pstrndup(r->pool, ctx->buf + match[0].rm_so,
215 match[0].rm_eo - match[0].rm_so) ;
216 if (ap_regexec(seek_charset, p, 2, match, 0) == 0) {
217 if (ctx->encoding = apr_pstrndup(r->pool, p+match[1].rm_so,
218 match[1].rm_eo - match[1].rm_so),
220 ctx->xml2enc = xmlParseCharEncoding(ctx->encoding);
221 if (HAVE_ENCODING(ctx->xml2enc))
222 ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r, APLOGNO(01433)
223 "Got charset %s from HTML META", ctx->encoding) ;
228 /* cut out the <meta> we're invalidating */
229 while (cutb != cute) {
230 b = APR_BUCKET_NEXT(cutb);
231 APR_BUCKET_REMOVE(cutb);
232 apr_bucket_destroy(cutb);
235 /* and leave a string */
236 ctx->buf[ctx->bytes] = 0;
239 /* either it's set to something we found or it's still the default */
240 /* Aaargh! libxml2 has undocumented <META-crap> support. So this fails
241 * if metafix is not active. Have to make it conditional.
243 * No, that means no-metafix breaks things. Deal immediately with
244 * this particular instance of metafix.
246 if (!HAVE_ENCODING(ctx->xml2enc)) {
247 cfg = ap_get_module_config(r->per_dir_config, &xml2enc_module);
248 if (!ctx->encoding) {
249 ctx->encoding = cfg->default_charset?cfg->default_charset:"ISO-8859-1";
251 /* Unsupported charset. Can we get (iconv) support through apr_xlate? */
252 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r, APLOGNO(01434)
253 "Charset %s not supported by libxml2; trying apr_xlate",
255 if (apr_xlate_open(&ctx->convset, "UTF-8", ctx->encoding, r->pool)
257 ctx->xml2enc = XML_CHAR_ENCODING_UTF8 ;
259 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r, APLOGNO(01435)
260 "Charset %s not supported. Consider aliasing it?",
265 if (!HAVE_ENCODING(ctx->xml2enc)) {
266 /* Use configuration default as a last resort */
267 ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, r, APLOGNO(01436)
268 "No usable charset information; using configuration default");
269 ctx->xml2enc = (cfg->default_encoding == XML_CHAR_ENCODING_NONE)
270 ? XML_CHAR_ENCODING_8859_1 : cfg->default_encoding ;
272 if (ctype && ctx->encoding) {
273 if (ap_regexec(seek_charset, ctype, 2, match, 0)) {
274 r->content_type = apr_pstrcat(r->pool, ctype, ";charset=utf-8",
277 char* str = apr_palloc(r->pool, strlen(r->content_type) + 13
278 - (match[0].rm_eo - match[0].rm_so) + 1);
279 memcpy(str, r->content_type, match[1].rm_so);
280 memcpy(str + match[1].rm_so, "utf-8", 5);
281 strcpy(str + match[1].rm_so + 5, r->content_type+match[1].rm_eo);
282 r->content_type = str;
287 static apr_status_t xml2enc_filter_init(ap_filter_t* f)
291 xml2cfg* cfg = ap_get_module_config(f->r->per_dir_config,
293 f->ctx = ctx = apr_pcalloc(f->r->pool, sizeof(xml2ctx));
294 ctx->xml2enc = XML_CHAR_ENCODING_NONE;
295 if (cfg->skipto != NULL) {
296 ctx->flags |= ENC_SKIPTO;
301 static apr_status_t xml2enc_ffunc(ap_filter_t* f, apr_bucket_brigade* bb)
303 xml2ctx* ctx = f->ctx;
311 if (!ctx || !f->r->content_type) {
312 /* log error about configuring this */
313 ap_remove_output_filter(f);
314 return ap_pass_brigade(f->next, bb) ;
317 ctype = apr_pstrdup(f->r->pool, f->r->content_type);
318 for (p = ctype; *p; ++p)
322 /* only act if starts-with "text/" or contains "xml" */
323 if (strncmp(ctype, "text/", 5) && !strstr(ctype, "xml")) {
324 ap_remove_output_filter(f);
325 return ap_pass_brigade(f->next, bb) ;
328 if (ctx->bbsave == NULL) {
329 ctx->bbsave = apr_brigade_create(f->r->pool,
330 f->r->connection->bucket_alloc);
332 /* append to any data left over from last time */
333 APR_BRIGADE_CONCAT(ctx->bbsave, bb);
335 if (!(ctx->flags & ENC_INITIALISED)) {
336 /* some kind of initialisation required */
337 /* Turn all this off when post-processing */
339 /* if we don't have enough data to sniff but more's to come, wait */
340 apr_brigade_length(ctx->bbsave, 0, &ctx->bblen);
341 if ((ctx->bblen < BUF_MIN) && (ctx->bblen != -1)) {
342 APR_BRIGADE_DO(b, ctx->bbsave) {
343 if (APR_BUCKET_IS_EOS(b)) {
344 ctx->flags |= ENC_SEEN_EOS;
348 if (!(ctx->flags & ENC_SEEN_EOS)) {
349 /* not enough data to sniff. Wait for more */
350 APR_BRIGADE_DO(b, ctx->bbsave) {
351 rv = apr_bucket_setaside(b, f->r->pool);
352 ap_assert(rv == APR_SUCCESS);
357 if (ctx->bblen == -1) {
358 ctx->bblen = BUFLEN-1;
361 /* flatten it into a NULL-terminated string */
362 ctx->buf = apr_palloc(f->r->pool, (apr_size_t)(ctx->bblen+1));
363 ctx->bytes = (apr_size_t)ctx->bblen;
364 rv = apr_brigade_flatten(ctx->bbsave, ctx->buf, &ctx->bytes);
365 ap_assert(rv == APR_SUCCESS);
366 ctx->buf[ctx->bytes] = 0;
367 sniff_encoding(f->r, ctx);
369 /* FIXME: hook here for rewriting start-of-data? */
370 /* nah, we only have one action here - call it inline */
371 fix_skipto(f->r, ctx);
373 /* consume the data we just sniffed */
374 /* we need to omit any <meta> we just invalidated */
375 ctx->flags |= ENC_INITIALISED;
376 ap_set_module_config(f->r->request_config, &xml2enc_module, ctx);
378 if (ctx->bbnext == NULL) {
379 ctx->bbnext = apr_brigade_create(f->r->pool,
380 f->r->connection->bucket_alloc);
384 rv = ap_pass_brigade(f->next, ctx->bbsave);
385 apr_brigade_cleanup(ctx->bbsave);
386 ap_remove_output_filter(f);
389 /* move the data back to bb */
390 APR_BRIGADE_CONCAT(bb, ctx->bbsave);
392 while (b = APR_BRIGADE_FIRST(bb), b != APR_BRIGADE_SENTINEL(bb)) {
394 if (APR_BUCKET_IS_METADATA(b)) {
395 APR_BUCKET_REMOVE(b);
396 if (APR_BUCKET_IS_EOS(b)) {
397 /* send remaining data */
398 APR_BRIGADE_INSERT_TAIL(ctx->bbnext, b);
399 return ap_fflush(f->next, ctx->bbnext);
400 } else if (APR_BUCKET_IS_FLUSH(b)) {
401 ap_fflush(f->next, ctx->bbnext);
403 apr_bucket_destroy(b);
405 else { /* data bucket */
407 apr_size_t bytes = 0;
409 apr_bucket* bdestroy = NULL;
410 if (insz > 0) { /* we have dangling data. Flatten it. */
413 rv = apr_brigade_flatten(bb, buf, &bytes);
414 ap_assert(rv == APR_SUCCESS);
416 /* this is only what we've already tried to convert.
417 * The brigade is exhausted.
418 * Save remaining data for next time round
421 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, APLOGNO(01437)
422 "xml2enc: Setting aside %" APR_SIZE_T_FMT
423 " unconverted bytes", bytes);
424 rv = ap_fflush(f->next, ctx->bbnext);
425 APR_BRIGADE_CONCAT(ctx->bbsave, bb);
426 APR_BRIGADE_DO(b, ctx->bbsave) {
427 ap_assert(apr_bucket_setaside(b, f->r->pool)
432 /* remove the data we've just read */
433 rv = apr_brigade_partition(bb, bytes, &bstart);
434 while (b = APR_BRIGADE_FIRST(bb), b != bstart) {
435 APR_BUCKET_REMOVE(b);
436 apr_bucket_destroy(b);
438 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, APLOGNO(01438)
439 "xml2enc: consuming %" APR_SIZE_T_FMT
440 " bytes flattened", bytes);
443 rv = apr_bucket_read(b, (const char**)&buf, &bytes,
445 APR_BUCKET_REMOVE(b);
446 bdestroy = b; /* can't destroy until finished with the data */
447 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, APLOGNO(01439)
448 "xml2enc: consuming %" APR_SIZE_T_FMT
449 " bytes from bucket", bytes);
451 /* OK, we've got some input we can use in [buf,bytes] */
452 if (rv == APR_SUCCESS) {
454 xml2enc_run_preprocess(f, &buf, &bytes);
455 consumed = insz = bytes;
458 if (ctx->bytes == ctx->bblen) {
459 /* nothing was converted last time!
460 * break out of this loop!
462 b = apr_bucket_transient_create(buf+(bytes - insz), insz,
464 APR_BRIGADE_INSERT_HEAD(bb, b);
465 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, APLOGNO(01440)
466 "xml2enc: reinserting %" APR_SIZE_T_FMT
467 " unconsumed bytes from bucket", insz);
470 ctx->bytes = (apr_size_t)ctx->bblen;
471 rv = apr_xlate_conv_buffer(ctx->convset, buf+(bytes - insz),
472 &insz, ctx->buf, &ctx->bytes);
473 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, rv, f->r, APLOGNO(01441)
474 "xml2enc: converted %" APR_SIZE_T_FMT
475 "/%" APR_OFF_T_FMT " bytes", consumed - insz,
476 ctx->bblen - ctx->bytes);
478 rv2 = ap_fwrite(f->next, ctx->bbnext, ctx->buf,
479 (apr_size_t)ctx->bblen - ctx->bytes);
480 if (rv2 != APR_SUCCESS) {
481 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, rv2, f->r, APLOGNO(01442)
488 case APR_EINCOMPLETE:
489 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r, APLOGNO(01443)
491 continue; /* If outbuf too small, go round again.
492 * If it was inbuf, we'll break out when
493 * we test ctx->bytes == ctx->bblen
495 case APR_EINVAL: /* try skipping one bad byte */
496 ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r, APLOGNO(01444)
497 "Skipping invalid byte(s) in input stream!");
502 * Bail out, flush, and hope to eat the buf raw
504 ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r, APLOGNO(01445)
505 "Failed to convert input; trying it raw") ;
507 rv = ap_fflush(f->next, ctx->bbnext);
508 if (rv != APR_SUCCESS)
509 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, rv, f->r, APLOGNO(01446)
512 rv = ap_pass_brigade(f->next, ctx->bbnext);
516 ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r, APLOGNO(01447)
517 "xml2enc: error reading data") ;
520 apr_bucket_destroy(bdestroy);
521 if (rv != APR_SUCCESS)
527 static apr_status_t xml2enc_charset(request_rec* r, xmlCharEncoding* encp,
528 const char** encoding)
530 xml2ctx* ctx = ap_get_module_config(r->request_config, &xml2enc_module);
531 if (!ctx || !(ctx->flags & ENC_INITIALISED)) {
534 *encp = ctx->xml2enc;
535 *encoding = ctx->encoding;
536 return HAVE_ENCODING(ctx->xml2enc) ? APR_SUCCESS : APR_EGENERAL;
539 #define PROTO_FLAGS AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH
540 static void xml2enc_hooks(apr_pool_t* pool)
542 ap_register_output_filter_protocol("xml2enc", xml2enc_ffunc,
544 AP_FTYPE_RESOURCE, PROTO_FLAGS);
545 APR_REGISTER_OPTIONAL_FN(xml2enc_filter);
546 APR_REGISTER_OPTIONAL_FN(xml2enc_charset);
547 seek_meta_ctype = ap_pregcomp(pool,
548 "(<meta[^>]*http-equiv[ \t\r\n='\"]*content-type[^>]*>)",
549 AP_REG_EXTENDED|AP_REG_ICASE) ;
550 seek_charset = ap_pregcomp(pool, "charset=([A-Za-z0-9_-]+)",
551 AP_REG_EXTENDED|AP_REG_ICASE) ;
553 static const char* set_alias(cmd_parms* cmd, void* CFG,
554 const char* charset, const char* alias)
556 const char* errmsg = ap_check_cmd_context(cmd, GLOBAL_ONLY);
559 else if (xmlAddEncodingAlias(charset, alias) == 0)
562 return "Error setting charset alias";
565 static const char* set_default(cmd_parms* cmd, void* CFG, const char* charset)
568 cfg->default_charset = charset;
569 cfg->default_encoding = xmlParseCharEncoding(charset);
570 switch(cfg->default_encoding) {
571 case XML_CHAR_ENCODING_NONE:
572 return "Default charset not found";
573 case XML_CHAR_ENCODING_ERROR:
574 return "Invalid or unsupported default charset";
579 static const char* set_skipto(cmd_parms* cmd, void* CFG, const char* arg)
583 if (cfg->skipto == NULL)
584 cfg->skipto = apr_array_make(cmd->pool, 4, sizeof(tattr));
585 attr = apr_array_push(cfg->skipto) ;
590 static const command_rec xml2enc_cmds[] = {
591 AP_INIT_TAKE1("xml2EncDefault", set_default, NULL, OR_ALL,
592 "Usage: xml2EncDefault charset"),
593 AP_INIT_ITERATE2("xml2EncAlias", set_alias, NULL, RSRC_CONF,
594 "EncodingAlias charset alias [more aliases]"),
595 AP_INIT_ITERATE("xml2StartParse", set_skipto, NULL, OR_ALL,
596 "Ignore anything in front of the first of these elements"),
599 static void* xml2enc_config(apr_pool_t* pool, char* x)
601 xml2cfg* ret = apr_pcalloc(pool, sizeof(xml2cfg));
602 ret->default_encoding = XML_CHAR_ENCODING_NONE ;
606 static void* xml2enc_merge(apr_pool_t* pool, void* BASE, void* ADD)
608 xml2cfg* base = BASE;
610 xml2cfg* ret = apr_pcalloc(pool, sizeof(xml2cfg));
611 ret->default_encoding = (add->default_encoding == XML_CHAR_ENCODING_NONE)
612 ? base->default_encoding : add->default_encoding ;
613 ret->default_charset = add->default_charset
614 ? add->default_charset : base->default_charset;
615 ret->skipto = add->skipto ? add->skipto : base->skipto;
619 AP_DECLARE_MODULE(xml2enc) = {
620 STANDARD20_MODULE_STUFF,
629 APR_IMPLEMENT_OPTIONAL_HOOK_RUN_ALL(xml2enc, XML2ENC, int, preprocess,
630 (ap_filter_t *f, char** bufp, apr_size_t* bytesp),
631 (f, bufp, bytesp), OK, DECLINED)