--- /dev/null
+<?xml version="1.0"?>
+<!DOCTYPE modulesynopsis SYSTEM "../style/modulesynopsis.dtd">
+<?xml-stylesheet type="text/xsl" href="../style/manual.en.xsl"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<modulesynopsis metafile="mod_xml2enc.xml.meta">
+
+<name>mod_xml2enc</name>
+<description>Enhanced charset/internationalisation support for libxml2-based
+filter modules</description>
+<status>Base</status>
+<sourcefile>mod_xml2enc.c</sourcefile>
+<identifier>xml2enc_module</identifier>
+<compatibility>Version 2.4 and later. Available as a third-party module
+for 2.2.x versions</compatibility>
+
+<summary>
+ <p>This module provides enhanced internationalisation support for
+ markup-aware filter modules such as <module>mod_proxy_html</module>.
+ It can automatically detect the encoding of input data and ensure
+ they are correctly processed by the <a href="http://xmlsoft.org/"
+ >libxml2</a> parser, including converting to Unicode (UTF-8) where
+ necessary. It can also convert data to an encoding of choice
+ after markup processing, and will ensure the correct <var>charset</var>
+ value is set in the HTTP <var>Content-Type</var> header.</p>
+</summary>
+
+<section id="usage"><title>Usage</title>
+ <p>There are two usage scenarios: with modules programmed to work
+ with mod_xml2enc, and with those that are not aware of it:</p>
+ <dl>
+ <dt>Filter modules enabled for mod_xml2enc</dt><dd>
+ <p>Modules such as <module>mod_proxy_html</module> version 3.1
+ and up use the <code>xml2enc_charset</code> optional function to retrieve
+ the charset argument to pass to the libxml2 parser, and may use the
+ <code>xml2enc_filter</code> optional function to postprocess to another
+ encoding. Using mod_xml2enc with an enabled module, no configuration
+ is necessary: the other module will configure mod_xml2enc for you
+ (though you may still want to customise it using the configuration
+ directives below).</p>
+ </dd>
+ <dt>Non-enabled modules</dt><dd>
+ <p>To use it with a libxml2-based module that isn't explicitly enabled for
+ mod_xml2enc, you will have to configure the filter chain yourself.
+ So to use it with a filter foo provided by a module mod_foo to
+ improve the latter's i18n support with HTML and XML, you could use</p>
+ <pre><code>
+ FilterProvider iconv xml2enc Content-Type $text/html
+ FilterProvider iconv xml2enc Content-Type $xml
+ FilterProvider markup foo Content-Type $text/html
+ FilterProvider markup foo Content-Type $xml
+ FilterChain iconv markup
+ </code></pre>
+ <p>mod_foo will now support any character set supported by either
+ (or both) of libxml2 or apr_xlate/iconv.</p>
+ </dd></dl>
+</section>
+
+<section id="api"><title>Programming API</title>
+ <p>Programmers writing libxml2-based filter modules are encouraged to
+ enable them for mod_xml2enc, to provide strong i18n support for your
+ users without reinventing the wheel. The programming API is exposed in
+ <var>mod_xml2enc.h</var>, and a usage example is
+ <module>mod_proxy_html</module>.</p>
+</section>
+
+<section id="sniffing"><title>Detecting an Encoding</title>
+ <p>Unlike <module>mod_charset_lite</module>, mod_xml2enc is designed
+ to work with data whose encoding cannot be known in advance and thus
+ configured. It therefore uses 'sniffing' techniques to detect the
+ encoding of HTTP data as follows:</p>
+ <ol>
+ <li>If the HTTP <var>Content-Type</var> header includes a
+ <var>charset</var> parameter, that is used.</li>
+ <li>If the data start with an XML Byte Order Mark (BOM) or an
+ XML encoding declaration, that is used.</li>
+ <li>If an encoding is declared in an HTML <code>&lt;META&gt;</code>
+ element, that is used.</li>
+ <li>If none of the above match, the default value set by
+ <directive>xml2EncDefault</directive> is used.</li>
+ </ol>
+ <p>The rules are applied in order. As soon as a match is found,
+ it is used and detection is stopped.</p>
+</section>
+
+<section id="output"><title>Output Encoding</title>
+<p><a href="http://xmlsoft.org/">libxml2</a> always uses UTF-8 (Unicode)
+internally, and libxml2-based filter modules will output that by default.
+mod_xml2enc can change the output encoding through the API, but there
+is currently no way to configure that directly.</p>
+<p>Changing the output encoding should (in theory, at least) never be
+necessary, and is not recommended due to the extra processing load on
+the server of an unnecessary conversion.</p>
+</section>
+
+<section id="alias"><title>Unsupported Encodings</title>
+<p>If you are working with encodings that are not supported by any of
+the conversion methods available on your platform, you can still alias
+them to a supported encoding using <directive>xml2EncAlias</directive>.</p>
+</section>
+
+<directivesynopsis>
+<name>xml2EncDefault</name>
+<description>Sets a default encoding to assume when absolutely no information
+can be <a href="#sniffing">automatically detected</a></description>
+<syntax>xml2EncDefault <var>name</var></syntax>
+<contextlist><context>server config</context>
+<context>virtual host</context><context>directory</context>
+<context>.htaccess</context></contextlist>
+<compatibility>Version 2.4.0 and later; available as a third-party
+module for earlier versions.</compatibility>
+
+<usage>
+ <p>If you are processing data with known encoding but no encoding
+ information, you can set this default to help mod_xml2enc process
+ the data correctly. For example, to work with the default value
+ of Latin1 (<var>iso-8859-1</var>) specified in HTTP/1.0, use</p>
+ <example>xml2EncDefault iso-8859-1</example>
+</usage>
+</directivesynopsis>
+
+<directivesynopsis>
+<name>xml2EncAlias</name>
+<description>Recognise Aliases for encoding values</description>
+<syntax>xml2EncAlias <var>charset alias [alias ...]</var></syntax>
+<contextlist><context>server config</context></contextlist>
+
+<usage>
+ <p>This server-wide directive aliases one or more encodings to another
+ encoding. This enables encodings not recognised by libxml2 to be handled
+ internally by libxml2's encoding support using the translation table for
+ a recognised encoding. This serves two purposes: to support character sets
+ (or names) not recognised either by libxml2 or iconv, and to skip
+ conversion for an encoding where it is known to be unnecessary.</p>
+</usage>
+</directivesynopsis>
+
+<directivesynopsis>
+<name>xml2StartParse</name>
+<description>Advise the parser to skip leading junk.</description>
+<syntax>xml2StartParse <var>element [element ...]</var></syntax>
+<contextlist><context>server config</context><context>virtual host</context>
+<context>directory</context><context>.htaccess</context></contextlist>
+
+<usage>
+ <p>Specify that the markup parser should start at the first instance
+ of any of the elements specified. This can be used as a workaround
+ where a broken backend inserts leading junk that messes up the parser (<a
+ href="http://bahumbug.wordpress.com/2006/10/12/mod_proxy_html-revisited/"
+ >example here</a>).</p>
+ <p>It should never be used for XML, nor well-formed HTML.</p>
+</usage>
+</directivesynopsis>
+
+</modulesynopsis>
+
--- /dev/null
+/* Copyright (c) 2007-11, WebThing Ltd
+ * Copyright (c) 2011-, The Apache Software Foundation
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#if defined(WIN32)
+#define XML2ENC_DECLARE_EXPORT
+#endif
+
+#include <ctype.h>
+
+/* libxml2 */
+#include <libxml/encoding.h>
+
+#include "http_protocol.h"
+#include "http_config.h"
+#include "http_log.h"
+#include "apr_strings.h"
+#include "apr_xlate.h"
+
+#include "apr_optional.h"
+#include "mod_xml2enc.h"
+
+module AP_MODULE_DECLARE_DATA xml2enc_module;
+
+/* Size in bytes of the conversion output buffer set up by xml2enc_filter()
+ * and of the on-stack scratch buffer used by xml2enc_ffunc(). */
+#define BUFLEN 8192
+/* xml2enc_ffunc() keeps buffering until at least this many bytes are
+ * available for sniffing, unless an EOS bucket has already arrived. */
+#define BUF_MIN 4096
+/* Iterate b over every bucket of brigade bb, from first to last. */
+#define APR_BRIGADE_DO(b,bb) for (b = APR_BRIGADE_FIRST(bb); \
+ b != APR_BRIGADE_SENTINEL(bb); \
+ b = APR_BUCKET_NEXT(b))
+
+/* Bits stored in xml2ctx.flags. */
+/* Set once the encoding/converter setup is done; sniffing is skipped. */
+#define ENC_INITIALISED 0x100
+/* Set when an EOS bucket was found among the saved data. */
+#define ENC_SEEN_EOS 0x200
+/* Same bit as ENCIO_SKIPTO; set by xml2enc_filter_init() when a skipto
+ * list is configured. */
+#define ENC_SKIPTO ENCIO_SKIPTO
+
+/* True when enc is neither XML_CHAR_ENCODING_NONE nor
+ * XML_CHAR_ENCODING_ERROR. */
+#define HAVE_ENCODING(enc) \
+ (((enc)!=XML_CHAR_ENCODING_NONE)&&((enc)!=XML_CHAR_ENCODING_ERROR))
+
+/* Per-request filter state, stored in f->ctx and (once initialised) in the
+ * request config so xml2enc_charset() can read it. */
+typedef struct {
+ /* libxml2 encoding id chosen for the input */
+ xmlCharEncoding xml2enc;
+ /* flattened start-of-body data while sniffing; conversion output buffer
+ * while converting */
+ char* buf;
+ /* byte count associated with buf (data length when sniffing; space left
+ * in buf after a conversion call) */
+ apr_size_t bytes;
+ /* apr_xlate converter, or NULL when no conversion is performed */
+ apr_xlate_t* convset;
+ /* ENC_* bits plus whatever mode bits xml2enc_filter() stored */
+ unsigned int flags;
+ /* length of bbsave while sniffing; size of buf while converting */
+ apr_off_t bblen;
+ /* brigade of converted output passed to the next filter */
+ apr_bucket_brigade* bbnext;
+ /* data held over between filter invocations */
+ apr_bucket_brigade* bbsave;
+ /* charset name matching xml2enc, or the name handed to apr_xlate */
+ const char* encoding;
+} xml2ctx;
+
+/* Per-directory configuration. */
+typedef struct {
+ /* charset name given to xml2EncDefault, or NULL */
+ const char* default_charset;
+ /* default_charset parsed by xmlParseCharEncoding() */
+ xmlCharEncoding default_encoding;
+ /* array of tattr filled by xml2StartParse, or NULL */
+ apr_array_header_t* skipto;
+} xml2cfg;
+
+/* One entry of the xml2cfg.skipto array. */
+typedef struct {
+ /* element name as given to xml2StartParse */
+ const char* val;
+} tattr;
+
+/* Compiled in xml2enc_hooks(): matches an HTML <meta> element carrying
+ * http-equiv content-type. */
+static ap_regex_t* seek_meta_ctype;
+/* Compiled in xml2enc_hooks(): matches "charset=<name>" and captures the
+ * name as sub-match 1. */
+static ap_regex_t* seek_charset;
+
+/**
+ * Insert the "xml2enc" output filter on a request with a context prepared
+ * for the requested mode.  Registered as an optional function in
+ * xml2enc_hooks().
+ *
+ * @param r    the request to filter
+ * @param enc  charset name: conversion target for ENCIO_OUTPUT, conversion
+ *             source for ENCIO_INPUT; unused for ENCIO_INPUT_CHECKS
+ * @param mode one of ENCIO_OUTPUT, ENCIO_INPUT or ENCIO_INPUT_CHECKS in the
+ *             ENCIO bits; the remaining bits are carried into ctx->flags
+ * @return APR_SUCCESS when the filter was inserted, APR_EGENERAL for an
+ *         unrecognised mode, otherwise the apr_xlate_open() error
+ */
+static apr_status_t xml2enc_filter(request_rec* r, const char* enc,
+                                   unsigned int mode)
+{
+    /* set up a ready-initialised ctx to convert to enc, and insert filter */
+    apr_xlate_t* convset = NULL;
+    apr_status_t rv;
+    xml2ctx* ctx;
+    unsigned int flags = (mode ^ ENCIO);
+
+    if ((mode & ENCIO) == ENCIO_OUTPUT) {
+        /* postprocessing: UTF-8 -> enc */
+        rv = apr_xlate_open(&convset, enc, "UTF-8", r->pool);
+        flags |= ENC_INITIALISED;
+    }
+    else if ((mode & ENCIO) == ENCIO_INPUT) {
+        /* preprocessing with a known charset: enc -> UTF-8 */
+        rv = apr_xlate_open(&convset, "UTF-8", enc, r->pool);
+        flags |= ENC_INITIALISED;
+    }
+    else if ((mode & ENCIO) == ENCIO_INPUT_CHECKS) {
+        rv = APR_SUCCESS; /* we'll initialise later by sniffing */
+    }
+    else {
+        /* A bad mode is a caller error, not a charset problem: report it
+         * once and stop, instead of also logging "Charset not supported".
+         */
+        ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r,
+                      "xml2enc: bad mode %x", mode);
+        return APR_EGENERAL;
+    }
+
+    if (rv != APR_SUCCESS) {
+        ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r,
+                      "xml2enc: Charset %s not supported.", enc) ;
+        return rv;
+    }
+
+    ctx = apr_pcalloc(r->pool, sizeof(xml2ctx));
+    ctx->flags = flags;
+    if (flags & ENC_INITIALISED) {
+        /* converter is ready: allocate the output buffer it writes into */
+        ctx->convset = convset;
+        ctx->bblen = BUFLEN;
+        ctx->buf = apr_palloc(r->pool, (apr_size_t)ctx->bblen);
+    }
+    ap_add_output_filter("xml2enc", ctx, r, r->connection);
+    return rv;
+}
+
+/* This needs to operate only when we're using htmlParser */
+/* Different modules may apply different rules here.  Ho, hum.  */
+/**
+ * Discard everything in front of the first configured start element.
+ *
+ * Scans the NUL-terminated sniff buffer ctx->buf for a '<' followed
+ * (case-insensitively) by one of the names in cfg->skipto.  On a match the
+ * leading buckets are removed from ctx->bbsave and ctx->buf / ctx->bytes
+ * are advanced to the match.  Does nothing unless a skipto list is
+ * configured and ENC_SKIPTO is set in ctx->flags.
+ *
+ * @param r   the request (for configuration and logging)
+ * @param ctx the filter context holding buf, bytes and bbsave
+ */
+static void fix_skipto(request_rec* r, xml2ctx* ctx)
+{
+    xml2cfg* cfg = ap_get_module_config(r->per_dir_config, &xml2enc_module);
+    tattr* starts;
+    char* p;
+    int found = 0;
+
+    /* test the flag with '&': '|' made this condition always true */
+    if ((cfg->skipto == NULL) || !(ctx->flags & ENC_SKIPTO)) {
+        return;
+    }
+
+    starts = (tattr*) cfg->skipto->elts;
+    p = ap_strchr(ctx->buf, '<');
+    while (!found && p && *p) {
+        int i;
+        for (i = 0; i < cfg->skipto->nelts; ++i) {
+            if (!strncasecmp(p+1, starts[i].val, strlen(starts[i].val))) {
+                /* found a starting element.  Strip all that comes before. */
+                apr_bucket* b;
+                apr_bucket* bstart;
+                apr_status_t rv = apr_brigade_partition(ctx->bbsave,
+                                                        (p-ctx->buf),
+                                                        &bstart);
+                if (rv != APR_SUCCESS) {
+                    /* bstart is not usable: leave the data untouched */
+                    ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r,
+                                  "xml2enc: failed to partition data at "
+                                  "<%s> element", starts[i].val);
+                    return;
+                }
+                while (b = APR_BRIGADE_FIRST(ctx->bbsave), b != bstart) {
+                    APR_BUCKET_REMOVE(b);
+                    apr_bucket_destroy(b);
+                }
+                ctx->bytes -= (p-ctx->buf);
+                ctx->buf = p ;
+                found = 1;
+                ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
+                              "Skipped to first <%s> element",
+                              starts[i].val) ;
+                break;
+            }
+        }
+        /* only move on while still searching, so a successful match is not
+         * followed by a bogus "not found" warning */
+        if (!found) {
+            p = ap_strchr(p+1, '<');
+        }
+    }
+    if (!found) {
+        ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, r,
+                      "Failed to find start of recognised HTML!");
+    }
+}
+/**
+ * Determine the charset of the buffered start-of-body data in ctx->buf and
+ * record it in ctx->xml2enc / ctx->encoding.
+ *
+ * Sources are consulted in this order: the charset parameter of
+ * r->content_type, libxml2's XML detection on the buffer, an HTML
+ * <meta http-equiv content-type> element, and finally the configured
+ * default (falling back to ISO-8859-1).  A matching <meta> element is
+ * always cut out of ctx->bbsave and ctx->buf is truncated at its start.
+ * If libxml2 does not know the charset, an apr_xlate converter to UTF-8 is
+ * opened into ctx->convset.  Finally the charset in r->content_type is
+ * rewritten to (or extended with) utf-8.
+ *
+ * @param r   the request
+ * @param ctx filter context; buf must be NUL-terminated and mirror bbsave
+ */
+static void sniff_encoding(request_rec* r, xml2ctx* ctx)
+{
+    xml2cfg* cfg = NULL; /* initialise to shut compiler warnings up */
+    char* p ;
+    apr_bucket* cutb = NULL;
+    apr_bucket* cute = NULL;
+    apr_bucket* b;
+    ap_regmatch_t match[2] ;
+    apr_status_t rv;
+    const char* ctype = r->content_type;
+
+    if (ctype) {
+        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
+                      "Content-Type is %s", ctype) ;
+
+        /* If we've got it in the HTTP headers, there's nothing to do */
+        p = ap_strcasestr(ctype, "charset=");
+        if (p != NULL) {
+            p += 8 ;
+            ctx->encoding = apr_pstrndup(r->pool, p, strcspn(p, " ;") );
+            if (ctx->encoding) {
+                ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
+                              "Got charset %s from HTTP headers", ctx->encoding) ;
+                ctx->xml2enc = xmlParseCharEncoding(ctx->encoding);
+            }
+        }
+    }
+
+    /* to sniff, first we look for BOM */
+    if (ctx->xml2enc == XML_CHAR_ENCODING_NONE) {
+        ctx->xml2enc = xmlDetectCharEncoding((const xmlChar*)ctx->buf,
+                                             ctx->bytes);
+        if (HAVE_ENCODING(ctx->xml2enc)) {
+            ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
+                          "Got charset from XML rules.") ;
+            ctx->encoding = xmlGetCharEncodingName(ctx->xml2enc);
+        }
+    }
+
+    /* If none of the above, look for a META-thingey */
+    /* also we're probably about to invalidate it, so we remove it. */
+    if (ap_regexec(seek_meta_ctype, ctx->buf, 1, match, 0) == 0 ) {
+        /* get markers on the start and end of the match; the end is
+         * partitioned first and stays valid when the start is split off.
+         * Both results are checked: on failure the markers are not usable
+         * and the element is left in place rather than walking off the
+         * brigade.
+         */
+        rv = apr_brigade_partition(ctx->bbsave, match[0].rm_eo, &cute);
+        if (rv == APR_SUCCESS) {
+            rv = apr_brigade_partition(ctx->bbsave, match[0].rm_so, &cutb);
+        }
+        if (rv != APR_SUCCESS) {
+            ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r,
+                          "xml2enc: failed to isolate META element");
+            cutb = cute = NULL;
+        }
+        /* now set length of useful buf for start-of-data hooks */
+        ctx->bytes = match[0].rm_so;
+        if (ctx->encoding == NULL) {
+            p = apr_pstrndup(r->pool, ctx->buf + match[0].rm_so,
+                             match[0].rm_eo - match[0].rm_so) ;
+            /* match[] is reused from here on; the offsets needed above
+             * have already been consumed */
+            if (ap_regexec(seek_charset, p, 2, match, 0) == 0) {
+                ctx->encoding = apr_pstrndup(r->pool, p+match[1].rm_so,
+                                             match[1].rm_eo - match[1].rm_so);
+                if (ctx->encoding) {
+                    ctx->xml2enc = xmlParseCharEncoding(ctx->encoding);
+                    if (HAVE_ENCODING(ctx->xml2enc))
+                        ap_log_rerror(APLOG_MARK, APLOG_INFO, 0, r,
+                                      "Got charset %s from HTML META", ctx->encoding) ;
+                }
+            }
+        }
+
+        /* cut out the <meta> we're invalidating */
+        while (cutb != cute) {
+            b = APR_BUCKET_NEXT(cutb);
+            APR_BUCKET_REMOVE(cutb);
+            apr_bucket_destroy(cutb);
+            cutb = b;
+        }
+        /* and leave a string */
+        ctx->buf[ctx->bytes] = 0;
+    }
+
+    /* either it's set to something we found or it's still the default */
+    /* Aaargh!  libxml2 has undocumented <META-crap> support.  So this fails
+     * if metafix is not active.  Have to make it conditional.
+     *
+     * No, that means no-metafix breaks things.  Deal immediately with
+     * this particular instance of metafix.
+     */
+    if (!HAVE_ENCODING(ctx->xml2enc)) {
+        cfg = ap_get_module_config(r->per_dir_config, &xml2enc_module);
+        if (!ctx->encoding) {
+            ctx->encoding = cfg->default_charset?cfg->default_charset:"ISO-8859-1";
+        }
+        /* Unsupported charset.  Can we get (iconv) support through apr_xlate? */
+        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
+                      "Charset %s not supported by libxml2; trying apr_xlate",
+                      ctx->encoding);
+        if (apr_xlate_open(&ctx->convset, "UTF-8", ctx->encoding, r->pool)
+            == APR_SUCCESS) {
+            /* data will reach the parser already converted to UTF-8 */
+            ctx->xml2enc = XML_CHAR_ENCODING_UTF8 ;
+        } else {
+            ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, r,
+                          "Charset %s not supported.  Consider aliasing it?",
+                          ctx->encoding) ;
+        }
+    }
+
+    /* Only reachable when the block above ran and failed, so cfg has been
+     * fetched by the time it is dereferenced here.
+     */
+    if (!HAVE_ENCODING(ctx->xml2enc)) {
+        /* Use configuration default as a last resort */
+        ap_log_rerror(APLOG_MARK, APLOG_WARNING, 0, r,
+                      "No usable charset information; using configuration default");
+        ctx->xml2enc = (cfg->default_encoding == XML_CHAR_ENCODING_NONE)
+                       ? XML_CHAR_ENCODING_8859_1 : cfg->default_encoding ;
+    }
+    if (ctype && ctx->encoding) {
+        if (ap_regexec(seek_charset, ctype, 2, match, 0)) {
+            /* no charset parameter yet: append one */
+            r->content_type = apr_pstrcat(r->pool, ctype, ";charset=utf-8",
+                                          NULL);
+        } else {
+            /* replace the existing charset value with "utf-8" in a copy */
+            char* str = apr_palloc(r->pool, strlen(r->content_type) + 13
+                                   - (match[0].rm_eo - match[0].rm_so) + 1);
+            memcpy(str, r->content_type, match[1].rm_so);
+            memcpy(str + match[1].rm_so, "utf-8", 5);
+            strcpy(str + match[1].rm_so + 5, r->content_type+match[1].rm_eo);
+            r->content_type = str;
+        }
+    }
+}
+
+/**
+ * Filter init callback: create a blank context when the filter was added
+ * without one.  A context supplied by xml2enc_filter() is left untouched.
+ *
+ * @param f the filter being initialised
+ * @return always APR_SUCCESS
+ */
+static apr_status_t xml2enc_filter_init(ap_filter_t* f)
+{
+    xml2cfg* conf;
+    xml2ctx* fresh;
+
+    if (f->ctx) {
+        return APR_SUCCESS;
+    }
+
+    conf = ap_get_module_config(f->r->per_dir_config, &xml2enc_module);
+    fresh = apr_pcalloc(f->r->pool, sizeof(xml2ctx));
+    fresh->xml2enc = XML_CHAR_ENCODING_NONE;
+    /* remember that a start-element list is configured for this request */
+    if (conf->skipto != NULL) {
+        fresh->flags |= ENC_SKIPTO;
+    }
+    f->ctx = fresh;
+    return APR_SUCCESS;
+}
+/**
+ * Output filter function for "xml2enc".
+ *
+ * Content that is neither text/... nor contains "xml" in its content type
+ * is passed through and the filter removes itself.  Otherwise, until the
+ * context is initialised, data is buffered in ctx->bbsave until BUF_MIN
+ * bytes or EOS are available, then flattened and handed to
+ * sniff_encoding() and fix_skipto().  If no apr_xlate converter results,
+ * the saved data is passed on unchanged and the filter removes itself.
+ * With a converter, each data bucket is run through the preprocess hook and
+ * apr_xlate_conv_buffer(), and the output is written to ctx->bbnext.
+ *
+ * @param f  this filter
+ * @param bb incoming brigade
+ * @return APR_SUCCESS, or the status of the downstream pass/flush
+ */
+static apr_status_t xml2enc_ffunc(ap_filter_t* f, apr_bucket_brigade* bb)
+{
+    xml2ctx* ctx = f->ctx;
+    apr_status_t rv;
+    apr_bucket* b;
+    apr_bucket* bstart;
+    apr_size_t insz = 0;
+    char *ctype;
+    char *p;
+
+    if (!ctx || !f->r->content_type) {
+        /* log error about configuring this */
+        ap_remove_output_filter(f);
+        return ap_pass_brigade(f->next, bb) ;
+    }
+
+    /* lower-case a private copy of the content type.  <ctype.h> functions
+     * need an unsigned char value; plain char may be negative.
+     */
+    ctype = apr_pstrdup(f->r->pool, f->r->content_type);
+    for (p = ctype; *p; ++p)
+        if (isupper((unsigned char)*p))
+            *p = tolower((unsigned char)*p);
+
+    /* only act if starts-with "text/" or contains "xml" */
+    if (strncmp(ctype, "text/", 5) && !strstr(ctype, "xml")) {
+        ap_remove_output_filter(f);
+        return ap_pass_brigade(f->next, bb) ;
+    }
+
+    if (ctx->bbsave == NULL) {
+        ctx->bbsave = apr_brigade_create(f->r->pool,
+                                         f->r->connection->bucket_alloc);
+    }
+    /* append to any data left over from last time */
+    APR_BRIGADE_CONCAT(ctx->bbsave, bb);
+
+    if (!(ctx->flags & ENC_INITIALISED)) {
+        /* some kind of initialisation required */
+        /* Turn all this off when post-processing */
+
+        /* if we don't have enough data to sniff but more's to come, wait */
+        rv = apr_brigade_length(ctx->bbsave, 0, &ctx->bblen);
+        if ((ctx->bblen < BUF_MIN) && (ctx->bblen != -1)) {
+            APR_BRIGADE_DO(b, ctx->bbsave) {
+                if (APR_BUCKET_IS_EOS(b)) {
+                    ctx->flags |= ENC_SEEN_EOS;
+                    break;
+                }
+            }
+            if (!(ctx->flags & ENC_SEEN_EOS)) {
+                /* not enough data to sniff.  Wait for more */
+                APR_BRIGADE_DO(b, ctx->bbsave) {
+                    apr_bucket_setaside(b, f->r->pool);
+                }
+                return APR_SUCCESS;
+            }
+        }
+        /* length unknown: sniff at most BUFLEN-1 bytes */
+        if (ctx->bblen == -1) {
+            ctx->bblen = BUFLEN-1;
+        }
+
+        /* flatten it into a NULL-terminated string */
+        ctx->buf = apr_palloc(f->r->pool, (apr_size_t)(ctx->bblen+1));
+        ctx->bytes = (apr_size_t)ctx->bblen;
+        rv = apr_brigade_flatten(ctx->bbsave, ctx->buf, &ctx->bytes);
+        ctx->buf[ctx->bytes] = 0;
+        sniff_encoding(f->r, ctx);
+
+        /* FIXME: hook here for rewriting start-of-data? */
+        /* nah, we only have one action here - call it inline */
+        fix_skipto(f->r, ctx);
+
+        /* consume the data we just sniffed */
+        /* we need to omit any <meta> we just invalidated */
+        ctx->flags |= ENC_INITIALISED;
+        /* publish the context so xml2enc_charset() can find it */
+        ap_set_module_config(f->r->request_config, &xml2enc_module, ctx);
+    }
+    if (ctx->bbnext == NULL) {
+        ctx->bbnext = apr_brigade_create(f->r->pool,
+                                         f->r->connection->bucket_alloc);
+    }
+
+    /* no converter needed: hand the data on as-is and step out of the way */
+    if (!ctx->convset) {
+        rv = ap_pass_brigade(f->next, ctx->bbsave);
+        apr_brigade_cleanup(ctx->bbsave);
+        ap_remove_output_filter(f);
+        return rv;
+    }
+    /* move the data back to bb */
+    APR_BRIGADE_CONCAT(bb, ctx->bbsave);
+
+    while (b = APR_BRIGADE_FIRST(bb), b != APR_BRIGADE_SENTINEL(bb)) {
+        ctx->bytes = 0;
+        if (APR_BUCKET_IS_METADATA(b)) {
+            if (APR_BUCKET_IS_EOS(b)) {
+                /* send remaining data, and forward the EOS itself so the
+                 * filters downstream learn that the response is complete
+                 */
+                APR_BUCKET_REMOVE(b);
+                APR_BRIGADE_INSERT_TAIL(ctx->bbnext, b);
+                return ap_pass_brigade(f->next, ctx->bbnext);
+            } else if (APR_BUCKET_IS_FLUSH(b)) {
+                ap_fflush(f->next, ctx->bbnext);
+            }
+            APR_BUCKET_REMOVE(b);
+            apr_bucket_destroy(b);
+        }
+        else {        /* data bucket */
+            char* buf;
+            apr_size_t bytes = 0;
+            char fixbuf[BUFLEN];
+            apr_bucket* bdestroy = NULL;
+            if (insz > 0) { /* we have dangling data.  Flatten it. */
+                buf = fixbuf;
+                bytes = BUFLEN;
+                rv = apr_brigade_flatten(bb, buf, &bytes);
+                if (bytes == insz) {
+                    /* this is only what we've already tried to convert.
+                     * The brigade is exhausted.
+                     * Save remaining data for next time round
+                     */
+
+                    ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r,
+                                  "xml2enc: Setting aside %" APR_SIZE_T_FMT
+                                  " unconverted bytes", bytes);
+                    rv = ap_fflush(f->next, ctx->bbnext);
+                    APR_BRIGADE_CONCAT(ctx->bbsave, bb);
+                    APR_BRIGADE_DO(b, ctx->bbsave) {
+                        apr_bucket_setaside(b, f->r->pool);
+                    }
+                    return rv;
+                }
+                /* remove the data we've just read */
+                rv = apr_brigade_partition(bb, bytes, &bstart);
+                while (b = APR_BRIGADE_FIRST(bb), b != bstart) {
+                    APR_BUCKET_REMOVE(b);
+                    apr_bucket_destroy(b);
+                }
+                ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r,
+                              "xml2enc: consuming %" APR_SIZE_T_FMT
+                              " bytes flattened", bytes);
+            }
+            else {
+                rv = apr_bucket_read(b, (const char**)&buf, &bytes,
+                                     APR_BLOCK_READ);
+                APR_BUCKET_REMOVE(b);
+                bdestroy = b;  /* can't destroy until finished with the data */
+                ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r,
+                              "xml2enc: consuming %" APR_SIZE_T_FMT
+                              " bytes from bucket", bytes);
+            }
+            /* OK, we've got some input we can use in [buf,bytes] */
+            if (rv == APR_SUCCESS) {
+                apr_size_t consumed;
+                xml2enc_run_preprocess(f, &buf, &bytes);
+                consumed = insz = bytes;
+                while (insz > 0) {
+                    if (ctx->bytes == ctx->bblen) {
+                        /* nothing was converted last time!
+                         * break out of this loop!
+                         */
+                        b = apr_bucket_transient_create(buf+(bytes - insz), insz,
+                                                        bb->bucket_alloc);
+                        APR_BRIGADE_INSERT_HEAD(bb, b);
+                        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r,
+                                      "xml2enc: reinserting %" APR_SIZE_T_FMT
+                                      " unconsumed bytes from bucket", insz);
+                        break;
+                    }
+                    /* in: insz bytes left at buf+(bytes-insz);
+                     * out: ctx->bytes is the space left in ctx->buf */
+                    ctx->bytes = (apr_size_t)ctx->bblen;
+                    rv = apr_xlate_conv_buffer(ctx->convset, buf+(bytes - insz),
+                                               &insz, ctx->buf, &ctx->bytes);
+                    ap_log_rerror(APLOG_MARK, APLOG_DEBUG, rv, f->r,
+                                  "xml2enc: converted %" APR_SIZE_T_FMT
+                                  "/%" APR_OFF_T_FMT " bytes", consumed - insz,
+                                  ctx->bblen - ctx->bytes);
+                    consumed = insz;
+                    ap_fwrite(f->next, ctx->bbnext, ctx->buf,
+                              (apr_size_t)ctx->bblen - ctx->bytes);
+                    switch (rv) {
+                    case APR_SUCCESS:
+                        continue;
+                    case APR_EINCOMPLETE:
+                        ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r,
+                                      "INCOMPLETE");
+                        continue;     /* If outbuf too small, go round again.
+                                       * If it was inbuf, we'll break out when
+                                       * we test ctx->bytes == ctx->bblen
+                                       */
+                    case APR_EINVAL: /* try skipping one bad byte */
+                        ap_log_rerror(APLOG_MARK, APLOG_ERR, 0, f->r,
+                                      "Skipping invalid byte(s) in input stream!");
+                        --insz;
+                        continue;
+                    default:
+                        /* Erk!  What's this?
+                         * Bail out, flush, and hope to eat the buf raw
+                         */
+                        /* NOTE(review): this returns without releasing
+                         * bdestroy and passes ctx->bbnext rather than the
+                         * unconverted remainder of bb - confirm intent.
+                         */
+                        ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r,
+                                      "Failed to convert input; trying it raw") ;
+                        ctx->convset = NULL;
+                        ap_fflush(f->next, ctx->bbnext);
+                        return ap_pass_brigade(f->next, ctx->bbnext);
+                    }
+                }
+            } else {
+                ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r,
+                              "xml2enc: error reading data") ;
+            }
+            if (bdestroy) {
+                apr_bucket_destroy(bdestroy);
+            }
+        }
+    }
+    return APR_SUCCESS;
+}
+/**
+ * Report the encoding established for a request.  Registered as an
+ * optional function in xml2enc_hooks().
+ *
+ * @param r        the request
+ * @param encp     receives the libxml2 encoding id
+ * @param encoding receives the charset name
+ * @return APR_EAGAIN when no initialised context is attached to the
+ *         request yet (outputs untouched); APR_SUCCESS when the id is a
+ *         real encoding; APR_EGENERAL otherwise
+ */
+static apr_status_t xml2enc_charset(request_rec* r, xmlCharEncoding* encp,
+                                    const char** encoding)
+{
+    xml2ctx* state = ap_get_module_config(r->request_config, &xml2enc_module);
+
+    if (state == NULL) {
+        return APR_EAGAIN;
+    }
+    if ((state->flags & ENC_INITIALISED) == 0) {
+        return APR_EAGAIN;
+    }
+
+    *encp = state->xml2enc;
+    *encoding = state->encoding;
+    if (HAVE_ENCODING(state->xml2enc)) {
+        return APR_SUCCESS;
+    }
+    return APR_EGENERAL;
+}
+
+#define PROTO_FLAGS AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH
+/**
+ * Register-hooks callback: registers the "xml2enc" output filter and the
+ * two optional functions, and compiles the regular expressions used for
+ * sniffing.
+ *
+ * @param pool pool the compiled expressions are allocated from
+ */
+static void xml2enc_hooks(apr_pool_t* pool)
+{
+    /* both expressions are extended and case-insensitive */
+    const int re_opts = AP_REG_EXTENDED|AP_REG_ICASE;
+
+    ap_register_output_filter_protocol("xml2enc", xml2enc_ffunc,
+                                       xml2enc_filter_init,
+                                       AP_FTYPE_RESOURCE, PROTO_FLAGS);
+
+    APR_REGISTER_OPTIONAL_FN(xml2enc_filter);
+    APR_REGISTER_OPTIONAL_FN(xml2enc_charset);
+
+    seek_meta_ctype = ap_pregcomp(pool,
+                       "(<meta[^>]*http-equiv[ \t\r\n='\"]*content-type[^>]*>)",
+                       re_opts);
+    seek_charset = ap_pregcomp(pool, "charset=([A-Za-z0-9_-]+)", re_opts);
+}
+/**
+ * Handler for xml2EncAlias: register alias as another name for charset
+ * with libxml2.  Only accepted in the global context.
+ *
+ * @return NULL on success, otherwise an error message
+ */
+static const char* set_alias(cmd_parms* cmd, void* CFG,
+                             const char* charset, const char* alias)
+{
+    const char* ctx_err = ap_check_cmd_context(cmd, GLOBAL_ONLY);
+
+    if (ctx_err != NULL) {
+        return ctx_err;
+    }
+    if (xmlAddEncodingAlias(charset, alias) != 0) {
+        return "Error setting charset alias";
+    }
+    return NULL;
+}
+
+/**
+ * Handler for xml2EncDefault: store the charset name and its libxml2
+ * encoding id in the per-directory configuration.
+ *
+ * @return NULL when libxml2 recognises the charset, otherwise an error
+ *         message (the values are stored in either case)
+ */
+static const char* set_default(cmd_parms* cmd, void* CFG, const char* charset)
+{
+    xml2cfg* conf = CFG;
+    xmlCharEncoding parsed = xmlParseCharEncoding(charset);
+
+    conf->default_charset = charset;
+    conf->default_encoding = parsed;
+
+    if (parsed == XML_CHAR_ENCODING_NONE) {
+        return "Default charset not found";
+    }
+    if (parsed == XML_CHAR_ENCODING_ERROR) {
+        return "Invalid or unsupported default charset";
+    }
+    return NULL;
+}
+/**
+ * Handler for xml2StartParse: append one element name to the skipto list
+ * of the per-directory configuration, creating the list on first use.
+ *
+ * @return always NULL
+ */
+static const char* set_skipto(cmd_parms* cmd, void* CFG, const char* arg)
+{
+    xml2cfg* conf = CFG;
+    tattr* slot;
+
+    if (!conf->skipto) {
+        conf->skipto = apr_array_make(cmd->pool, 4, sizeof(tattr));
+    }
+    slot = apr_array_push(conf->skipto);
+    slot->val = arg;
+    return NULL;
+}
+
+/* Configuration directives provided by this module and their handlers. */
+static const command_rec xml2enc_cmds[] = {
+ /* xml2EncDefault <charset>: handled by set_default() */
+ AP_INIT_TAKE1("xml2EncDefault", set_default, NULL, OR_ALL,
+ "Usage: xml2EncDefault charset"),
+ /* xml2EncAlias <charset> <alias>...: handled by set_alias(), which
+ * additionally insists on the global context */
+ AP_INIT_ITERATE2("xml2EncAlias", set_alias, NULL, RSRC_CONF,
+ "EncodingAlias charset alias [more aliases]"),
+ /* xml2StartParse <element>...: handled by set_skipto() */
+ AP_INIT_ITERATE("xml2StartParse", set_skipto, NULL, OR_ALL,
+ "Ignore anything in front of the first of these elements"),
+ /* end-of-table marker */
+ { NULL }
+};
+/**
+ * Create a configuration record with no default charset, no skipto list
+ * and the default encoding marked as unset.
+ *
+ * @param pool pool to allocate the record from
+ * @param x    unused
+ * @return the new xml2cfg
+ */
+static void* xml2enc_config(apr_pool_t* pool, char* x)
+{
+    xml2cfg* conf = apr_pcalloc(pool, sizeof(xml2cfg));
+
+    conf->default_encoding = XML_CHAR_ENCODING_NONE;
+    return conf;
+}
+
+/**
+ * Merge two configuration records: each setting is taken from ADD when it
+ * is set there, otherwise inherited from BASE.
+ *
+ * @param pool pool to allocate the merged record from
+ * @param BASE the inherited xml2cfg
+ * @param ADD  the overriding xml2cfg
+ * @return the merged xml2cfg
+ */
+static void* xml2enc_merge(apr_pool_t* pool, void* BASE, void* ADD)
+{
+    xml2cfg* parent = BASE;
+    xml2cfg* child = ADD;
+    xml2cfg* merged = apr_pcalloc(pool, sizeof(xml2cfg));
+
+    if (child->default_encoding == XML_CHAR_ENCODING_NONE) {
+        merged->default_encoding = parent->default_encoding;
+    }
+    else {
+        merged->default_encoding = child->default_encoding;
+    }
+
+    if (child->default_charset) {
+        merged->default_charset = child->default_charset;
+    }
+    else {
+        merged->default_charset = parent->default_charset;
+    }
+
+    if (child->skipto) {
+        merged->skipto = child->skipto;
+    }
+    else {
+        merged->skipto = parent->skipto;
+    }
+    return merged;
+}
+/* Module record tying the configuration callbacks, the directive table and
+ * the hook registration together. */
+module AP_MODULE_DECLARE_DATA xml2enc_module = {
+ STANDARD20_MODULE_STUFF,
+ /* creates the xml2cfg read through r->per_dir_config */
+ xml2enc_config,
+ /* merges two xml2cfg records */
+ xml2enc_merge,
+ /* NOTE(review): presumably the per-server create/merge slots, unused */
+ NULL,
+ NULL,
+ /* directive table */
+ xml2enc_cmds,
+ /* registers the filter, optional functions and regexes */
+ xml2enc_hooks
+};
+/* Implementation of the optional "preprocess" hook, taking the filter and
+ * pointers to the data buffer and its length.  Presumably this is what
+ * provides xml2enc_run_preprocess() as called from xml2enc_ffunc() -
+ * confirm against mod_xml2enc.h. */
+APR_IMPLEMENT_OPTIONAL_HOOK_RUN_ALL(xml2enc, XML2ENC, int, preprocess,
+ (ap_filter_t *f, char** bufp, apr_size_t* bytesp),
+ (f, bufp, bytesp), OK, DECLINED)