granicus.if.org Git - apache/blob - modules/filters/mod_charset_lite.c

   1 /* Copyright 2000-2006 The Apache Software Foundation or its licensors, as
   2  * applicable.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16
  17 /*
  18  * simple hokey charset recoding configuration module
  19  *
  20  * See mod_ebcdic and mod_charset for more thought-out examples.  This
  21  * one is just so Jeff can learn how a module works and experiment with
  22  * basic character set recoding configuration.
  23  *
  24  * !!!This is an extremely cheap ripoff of mod_charset.c from Russian Apache!!!
  25  */
  26
  27 #include "httpd.h"
  28 #include "http_config.h"
  29 #define CORE_PRIVATE
  30 #include "http_core.h"
  31 #include "http_log.h"
  32 #include "http_main.h"
  33 #include "http_protocol.h"
  34 #include "http_request.h"
  35 #include "util_charset.h"
  36 #include "apr_buckets.h"
  37 #include "util_filter.h"
  38 #include "apr_strings.h"
  39 #include "apr_lib.h"
  40 #include "apr_xlate.h"
  41 #define APR_WANT_STRFUNC
  42 #include "apr_want.h"
  43
/* Tunables for the translation buffers.  INPUT_XLATE_BUF_SIZE is the size of
 * the per-request input scratch buffer allocated in find_code_page();
 * XLATE_MIN_BUFF_LEFT is the low-water mark checked by xlate_brigade();
 * FATTEST_CHAR sizes charset_filter_ctx_t.buf.
 */
#define OUTPUT_XLATE_BUF_SIZE (16*1024) /* size of translation buffer used on output */
#define INPUT_XLATE_BUF_SIZE  (8*1024)  /* size of translation buffer used on input */

#define XLATE_MIN_BUFF_LEFT 128  /* flush once there is no more than this much
                                  * space left in the translation buffer
                                  */

#define FATTEST_CHAR  8          /* we don't handle chars wider than this that straddle
                                  * two buckets
                                  */
  54
/* extended error status codes; this is used in addition to an apr_status_t to
 * track errors in the translation filter.  The value is stored in
 * charset_filter_ctx_t.ees and turned into a log message by
 * log_xlate_error().
 */
typedef enum {
    EES_INIT = 0,   /* no error info yet; value must be 0 for easy init */
    EES_LIMIT,      /* built-in restriction encountered */
    EES_INCOMPLETE_CHAR, /* incomplete multi-byte char at end of content */
    EES_BUCKET_READ, /* apr_bucket_read() failed while fetching input data */
    EES_DOWNSTREAM, /* something bad happened in a filter below xlate */
    EES_BAD_INPUT   /* input data invalid */
} ees_t;
  66
/* registered name of the output translation filter; also the name searched
 * for in r->output_filters before the filter is added implicitly
 */
#define XLATEOUT_FILTER_NAME "XLATEOUT"
/* registered name of input translation filter; also the name searched for
 * in r->input_filters before the filter is added implicitly
 */
#define XLATEIN_FILTER_NAME  "XLATEIN"
  71
/* Per-directory configuration.  Instances are produced by
 * create_charset_dir_conf() and merge_charset_dir_conf() and filled in by
 * the CharsetSourceEnc / CharsetDefault / CharsetOptions directive handlers.
 */
typedef struct charset_dir_t {
    /** debug level; -1 means uninitialized, 0 means no debug */
    int debug;
    const char *charset_source; /* source encoding; NULL until CharsetSourceEnc is seen */
    const char *charset_default; /* how to ship on wire; NULL until CharsetDefault is seen */
    /** module does ap_add_*_filter()?  IA_INIT means the option was not
     *  given in this container (merge then inherits the enclosing value);
     *  IA_IMPADD / IA_NOIMPADD are set by CharsetOptions ImplicitAdd /
     *  NoImplicitAdd.
     */
    enum {IA_INIT, IA_IMPADD, IA_NOIMPADD} implicit_add;
} charset_dir_t;
  80
/* charset_filter_ctx_t is created for each filter instance; because the same
 * filter code is used for translating in both directions, we need this context
 * data to tell the filter which translation handle to use; it also can hold a
 * character which was split between buckets
 */
typedef struct charset_filter_ctx_t {
    apr_xlate_t *xlate;     /* translation handle handed to apr_xlate_conv_buffer() */
    int is_sb;              /* single-byte translation? */
    charset_dir_t *dc;      /* directory config this instance was set up from */
    ees_t ees;              /* extended error status */
    apr_size_t saved;       /* number of bytes of a partial char held in buf;
                             * 0 when nothing is pending
                             */
    char buf[FATTEST_CHAR]; /* we want to be able to build a complete char here */
    int ran;                /* has filter instance run before? */
    int noop;               /* should we pass brigades through unchanged? */
    char *tmp;              /* buffer for input filtering */
    apr_bucket_brigade *bb; /* input buckets we couldn't finish translating */
} charset_filter_ctx_t;
  98
/* charset_req_t is available via r->request_config if any translation is
 * being performed.  It is allocated by find_code_page(); input_ctx stays
 * NULL unless that hook set up request-body translation (PUT/POST).
 */
typedef struct charset_req_t {
    charset_dir_t *dc;                            /* directory config in effect for the request */
    charset_filter_ctx_t *output_ctx, *input_ctx; /* contexts handed to the XLATEOUT / XLATEIN filters */
} charset_req_t;
 106
/* debug level definitions; a message is logged when the configured
 * debug level (charset_dir_t.debug) is >= the level listed here
 */
#define DBGLVL_GORY           9 /* gory details */
#define DBGLVL_FLOW           4 /* enough messages to see what happens on
                                 * each request */
#define DBGLVL_PMC            2 /* messages about possible misconfiguration */
 112
/* Forward declaration of the module record: the hooks below pass
 * &charset_lite_module to ap_get_module_config()/ap_set_module_config().
 */
module AP_MODULE_DECLARE_DATA charset_lite_module;
 114
 115 static void *create_charset_dir_conf(apr_pool_t *p,char *dummy)
 116 {
 117     charset_dir_t *dc = (charset_dir_t *)apr_pcalloc(p,sizeof(charset_dir_t));
 118
 119     dc->debug = -1;
 120     return dc;
 121 }
 122
 123 static void *merge_charset_dir_conf(apr_pool_t *p, void *basev, void *overridesv)
 124 {
 125     charset_dir_t *a = (charset_dir_t *)apr_pcalloc (p, sizeof(charset_dir_t));
 126     charset_dir_t *base = (charset_dir_t *)basev,
 127         *over = (charset_dir_t *)overridesv;
 128
 129     /* If it is defined in the current container, use it.  Otherwise, use the one
 130      * from the enclosing container.
 131      */
 132
 133     a->debug =
 134         over->debug != -1 ? over->debug : base->debug;
 135     a->charset_default =
 136         over->charset_default ? over->charset_default : base->charset_default;
 137     a->charset_source =
 138         over->charset_source ? over->charset_source : base->charset_source;
 139     a->implicit_add =
 140         over->implicit_add != IA_INIT ? over->implicit_add : base->implicit_add;
 141     return a;
 142 }
 143
 144 /* CharsetSourceEnc charset
 145  */
 146 static const char *add_charset_source(cmd_parms *cmd, void *in_dc,
 147                                       const char *name)
 148 {
 149     charset_dir_t *dc = in_dc;
 150
 151     dc->charset_source = name;
 152     return NULL;
 153 }
 154
 155 /* CharsetDefault charset
 156  */
 157 static const char *add_charset_default(cmd_parms *cmd, void *in_dc,
 158                                        const char *name)
 159 {
 160     charset_dir_t *dc = in_dc;
 161
 162     dc->charset_default = name;
 163     return NULL;
 164 }
 165
 166 /* CharsetOptions optionflag...
 167  */
 168 static const char *add_charset_options(cmd_parms *cmd, void *in_dc,
 169                                        const char *flag)
 170 {
 171     charset_dir_t *dc = in_dc;
 172
 173     if (!strcasecmp(flag, "ImplicitAdd")) {
 174         dc->implicit_add = IA_IMPADD;
 175     }
 176     else if (!strcasecmp(flag, "NoImplicitAdd")) {
 177         dc->implicit_add = IA_NOIMPADD;
 178     }
 179     else if (!strncasecmp(flag, "DebugLevel=", 11)) {
 180         dc->debug = atoi(flag + 11);
 181     }
 182     else {
 183         return apr_pstrcat(cmd->temp_pool,
 184                            "Invalid CharsetOptions option: ",
 185                            flag,
 186                            NULL);
 187     }
 188
 189     return NULL;
 190 }
 191
 192 /* find_code_page() is a fixup hook that decides if translation should be
 193  * enabled; if so, it sets up request data for use by the filter registration
 194  * hook so that it knows what to do
 195  */
 196 static int find_code_page(request_rec *r)
 197 {
 198     charset_dir_t *dc = ap_get_module_config(r->per_dir_config,
 199                                              &charset_lite_module);
 200     charset_req_t *reqinfo;
 201     charset_filter_ctx_t *input_ctx, *output_ctx;
 202     apr_status_t rv;
 203     const char *mime_type;
 204
 205     if (dc->debug >= DBGLVL_FLOW) {
 206         ap_log_rerror(APLOG_MARK,APLOG_DEBUG, 0, r,
 207                       "uri: %s file: %s method: %d "
 208                       "imt: %s flags: %s%s%s %s->%s",
 209                       r->uri, r->filename, r->method_number,
 210                       r->content_type ? r->content_type : "(unknown)",
 211                       r->main     ? "S" : "",    /* S if subrequest */
 212                       r->prev     ? "R" : "",    /* R if redirect */
 213                       r->proxyreq ? "P" : "",    /* P if proxy */
 214                       dc->charset_source, dc->charset_default);
 215     }
 216
 217     /* If we don't have a full directory configuration, bail out.
 218      */
 219     if (!dc->charset_source || !dc->charset_default) {
 220         if (dc->debug >= DBGLVL_PMC) {
 221             ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
 222                           "incomplete configuration: src %s, dst %s",
 223                           dc->charset_source ? dc->charset_source : "unspecified",
 224                           dc->charset_default ? dc->charset_default : "unspecified");
 225         }
 226         return DECLINED;
 227     }
 228
 229     /* catch proxy requests */
 230     if (r->proxyreq) return DECLINED;
 231     /* mod_rewrite indicators */
 232     if (!strncmp(r->filename, "redirect:", 9)) return DECLINED;
 233     if (!strncmp(r->filename, "gone:", 5)) return DECLINED;
 234     if (!strncmp(r->filename, "passthrough:", 12)) return DECLINED;
 235     if (!strncmp(r->filename, "forbidden:", 10)) return DECLINED;
 236
 237     mime_type = r->content_type ? r->content_type : ap_default_type(r);
 238
 239     /* If mime type isn't text or message, bail out.
 240      */
 241
 242 /* XXX When we handle translation of the request body, watch out here as
 243  *     1.3 allowed additional mime types: multipart and
 244  *     application/x-www-form-urlencoded
 245  */
 246
 247     if (strncasecmp(mime_type, "text/", 5) &&
 248 #if APR_CHARSET_EBCDIC || AP_WANT_DIR_TRANSLATION
 249         /* On an EBCDIC machine, be willing to translate mod_autoindex-
 250          * generated output.  Otherwise, it doesn't look too cool.
 251          *
 252          * XXX This isn't a perfect fix because this doesn't trigger us
 253          * to convert from the charset of the source code to ASCII.  The
 254          * general solution seems to be to allow a generator to set an
 255          * indicator in the r specifying that the body is coded in the
 256          * implementation character set (i.e., the charset of the source
 257          * code).  This would get several different types of documents
 258          * translated properly: mod_autoindex output, mod_status output,
 259          * mod_info output, hard-coded error documents, etc.
 260          */
 261         strcmp(mime_type, DIR_MAGIC_TYPE) &&
 262 #endif
 263         strncasecmp(mime_type, "message/", 8)) {
 264         if (dc->debug >= DBGLVL_GORY) {
 265             ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
 266                           "mime type is %s; no translation selected",
 267                           mime_type);
 268         }
 269         /* We must not bail out here (i.e., the MIME test must be in the filter
 270          * itself, not in the fixup, because only then is the final MIME type known.
 271          * Examples for late changes to the MIME type include CGI handling (MIME
 272          * type is set in the Content-Type header produced by the CGI script), or
 273          * PHP (until PHP runs, the MIME type is set to application/x-httpd-php)
 274          */
 275     }
 276
 277     if (dc->debug >= DBGLVL_GORY) {
 278         ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
 279                       "charset_source: %s charset_default: %s",
 280                       dc && dc->charset_source ? dc->charset_source : "(none)",
 281                       dc && dc->charset_default ? dc->charset_default : "(none)");
 282     }
 283
 284     /* Get storage for the request data and the output filter context.
 285      * We rarely need the input filter context, so allocate that separately.
 286      */
 287     reqinfo = (charset_req_t *)apr_pcalloc(r->pool,
 288                                            sizeof(charset_req_t) +
 289                                            sizeof(charset_filter_ctx_t));
 290     output_ctx = (charset_filter_ctx_t *)(reqinfo + 1);
 291
 292     reqinfo->dc = dc;
 293     output_ctx->dc = dc;
 294     ap_set_module_config(r->request_config, &charset_lite_module, reqinfo);
 295
 296     reqinfo->output_ctx = output_ctx;
 297
 298     /* We must not open the xlation table here yet, because the final MIME
 299      * type is not known until we are actually called in the output filter.
 300      * With POST or PUT request, the case is different, because their MIME
 301      * type is set in the request headers, and their data are prerequisites
 302      * for actually calling, e.g., the CGI handler later on.
 303      */
 304     output_ctx->xlate = NULL;
 305
 306     switch (r->method_number) {
 307     case M_PUT:
 308     case M_POST:
 309         /* Set up input translation.  Note: A request body can be included
 310          * with the OPTIONS method, but for now we don't set up translation
 311          * of it.
 312          */
 313         input_ctx = apr_pcalloc(r->pool, sizeof(charset_filter_ctx_t));
 314         input_ctx->bb = apr_brigade_create(r->pool,
 315                                            r->connection->bucket_alloc);
 316         input_ctx->tmp = apr_palloc(r->pool, INPUT_XLATE_BUF_SIZE);
 317         input_ctx->dc = dc;
 318         reqinfo->input_ctx = input_ctx;
 319         rv = apr_xlate_open(&input_ctx->xlate, dc->charset_source,
 320                             dc->charset_default, r->pool);
 321         if (rv != APR_SUCCESS) {
 322             ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, r,
 323                           "can't open translation %s->%s",
 324                           dc->charset_default, dc->charset_source);
 325             return HTTP_INTERNAL_SERVER_ERROR;
 326         }
 327         if (apr_xlate_sb_get(input_ctx->xlate, &input_ctx->is_sb) != APR_SUCCESS) {
 328             input_ctx->is_sb = 0;
 329         }
 330     }
 331
 332     return DECLINED;
 333 }
 334
 335 static int configured_in_list(request_rec *r, const char *filter_name,
 336                               struct ap_filter_t *filter_list)
 337 {
 338     struct ap_filter_t *filter = filter_list;
 339
 340     while (filter) {
 341         if (!strcasecmp(filter_name, filter->frec->name)) {
 342             return 1;
 343         }
 344         filter = filter->next;
 345     }
 346     return 0;
 347 }
 348
 349 static int configured_on_input(request_rec *r, const char *filter_name)
 350 {
 351     return configured_in_list(r, filter_name, r->input_filters);
 352 }
 353
 354 static int configured_on_output(request_rec *r, const char *filter_name)
 355 {
 356     return configured_in_list(r, filter_name, r->output_filters);
 357 }
 358
 359 /* xlate_insert_filter() is a filter hook which decides whether or not
 360  * to insert a translation filter for the current request.
 361  */
 362 static void xlate_insert_filter(request_rec *r)
 363 {
 364     /* Hey... don't be so quick to use reqinfo->dc here; reqinfo may be NULL */
 365     charset_req_t *reqinfo = ap_get_module_config(r->request_config,
 366                                                   &charset_lite_module);
 367     charset_dir_t *dc = ap_get_module_config(r->per_dir_config,
 368                                              &charset_lite_module);
 369
 370     if (reqinfo) {
 371         if (reqinfo->output_ctx && !configured_on_output(r, XLATEOUT_FILTER_NAME)) {
 372             ap_add_output_filter(XLATEOUT_FILTER_NAME, reqinfo->output_ctx, r,
 373                                  r->connection);
 374         }
 375         else if (dc->debug >= DBGLVL_FLOW) {
 376             ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
 377                           "xlate output filter not added implicitly because %s",
 378                           !reqinfo->output_ctx ?
 379                           "no output configuration available" :
 380                           "another module added the filter");
 381         }
 382
 383         if (reqinfo->input_ctx && !configured_on_input(r, XLATEIN_FILTER_NAME)) {
 384             ap_add_input_filter(XLATEIN_FILTER_NAME, reqinfo->input_ctx, r,
 385                                 r->connection);
 386         }
 387         else if (dc->debug >= DBGLVL_FLOW) {
 388             ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, r,
 389                           "xlate input filter not added implicitly because %s",
 390                           !reqinfo->input_ctx ?
 391                           "no input configuration available" :
 392                           "another module added the filter");
 393         }
 394     }
 395 }
 396
 397 /* stuff that sucks that I know of:
 398  *
 399  * bucket handling:
 400  *  why create an eos bucket when we see it come down the stream?  just send the one
 401  *  passed as input...  news flash: this will be fixed when xlate_out_filter() starts
 402  *  using the more generic xlate_brigade()
 403  *
 404  * translation mechanics:
 405  *   we don't handle characters that straddle more than two buckets; an error
 406  *   will be generated
 407  */
 408
 409 /* send_downstream() is passed the translated data; it puts it in a single-
 410  * bucket brigade and passes the brigade to the next filter
 411  */
 412 static apr_status_t send_downstream(ap_filter_t *f, const char *tmp, apr_size_t len)
 413 {
 414     request_rec *r = f->r;
 415     conn_rec *c = r->connection;
 416     apr_bucket_brigade *bb;
 417     apr_bucket *b;
 418     charset_filter_ctx_t *ctx = f->ctx;
 419     apr_status_t rv;
 420
 421     bb = apr_brigade_create(r->pool, c->bucket_alloc);
 422     b = apr_bucket_transient_create(tmp, len, c->bucket_alloc);
 423     APR_BRIGADE_INSERT_TAIL(bb, b);
 424     rv = ap_pass_brigade(f->next, bb);
 425     if (rv != APR_SUCCESS) {
 426         ctx->ees = EES_DOWNSTREAM;
 427     }
 428     return rv;
 429 }
 430
 431 static apr_status_t send_eos(ap_filter_t *f)
 432 {
 433     request_rec *r = f->r;
 434     conn_rec *c = r->connection;
 435     apr_bucket_brigade *bb;
 436     apr_bucket *b;
 437     charset_filter_ctx_t *ctx = f->ctx;
 438     apr_status_t rv;
 439
 440     bb = apr_brigade_create(r->pool, c->bucket_alloc);
 441     b = apr_bucket_eos_create(c->bucket_alloc);
 442     APR_BRIGADE_INSERT_TAIL(bb, b);
 443     rv = ap_pass_brigade(f->next, bb);
 444     if (rv != APR_SUCCESS) {
 445         ctx->ees = EES_DOWNSTREAM;
 446     }
 447     return rv;
 448 }
 449
 450 static apr_status_t set_aside_partial_char(charset_filter_ctx_t *ctx,
 451                                            const char *partial,
 452                                            apr_size_t partial_len)
 453 {
 454     apr_status_t rv;
 455
 456     if (sizeof(ctx->buf) > partial_len) {
 457         ctx->saved = partial_len;
 458         memcpy(ctx->buf, partial, partial_len);
 459         rv = APR_SUCCESS;
 460     }
 461     else {
 462         rv = APR_INCOMPLETE;
 463         ctx->ees = EES_LIMIT; /* we don't handle chars this wide which straddle
 464                                * buckets
 465                                */
 466     }
 467     return rv;
 468 }
 469
 470 static apr_status_t finish_partial_char(charset_filter_ctx_t *ctx,
 471                                         /* input buffer: */
 472                                         const char **cur_str,
 473                                         apr_size_t *cur_len,
 474                                         /* output buffer: */
 475                                         char **out_str,
 476                                         apr_size_t *out_len)
 477 {
 478     apr_status_t rv;
 479     apr_size_t tmp_input_len;
 480
 481     /* Keep adding bytes from the input string to the saved string until we
 482      *    1) finish the input char
 483      *    2) get an error
 484      * or 3) run out of bytes to add
 485      */
 486
 487     do {
 488         ctx->buf[ctx->saved] = **cur_str;
 489         ++ctx->saved;
 490         ++*cur_str;
 491         --*cur_len;
 492         tmp_input_len = ctx->saved;
 493         rv = apr_xlate_conv_buffer(ctx->xlate,
 494                                    ctx->buf,
 495                                    &tmp_input_len,
 496                                    *out_str,
 497                                    out_len);
 498     } while (rv == APR_INCOMPLETE && *cur_len);
 499
 500     if (rv == APR_SUCCESS) {
 501         ctx->saved = 0;
 502     }
 503     else {
 504         ctx->ees = EES_LIMIT; /* code isn't smart enough to handle chars
 505                                * straddling more than two buckets
 506                                */
 507     }
 508
 509     return rv;
 510 }
 511
 512 static void log_xlate_error(ap_filter_t *f, apr_status_t rv)
 513 {
 514     charset_filter_ctx_t *ctx = f->ctx;
 515     const char *msg;
 516     char msgbuf[100];
 517     int cur;
 518
 519     switch(ctx->ees) {
 520     case EES_LIMIT:
 521         rv = 0;
 522         msg = "xlate filter - a built-in restriction was encountered";
 523         break;
 524     case EES_BAD_INPUT:
 525         rv = 0;
 526         msg = "xlate filter - an input character was invalid";
 527         break;
 528     case EES_BUCKET_READ:
 529         rv = 0;
 530         msg = "xlate filter - bucket read routine failed";
 531         break;
 532     case EES_INCOMPLETE_CHAR:
 533         rv = 0;
 534         strcpy(msgbuf, "xlate filter - incomplete char at end of input - ");
 535         cur = 0;
 536         while ((apr_size_t)cur < ctx->saved) {
 537             apr_snprintf(msgbuf + strlen(msgbuf), sizeof(msgbuf) - strlen(msgbuf),
 538                          "%02X", (unsigned)ctx->buf[cur]);
 539             ++cur;
 540         }
 541         msg = msgbuf;
 542         break;
 543     case EES_DOWNSTREAM:
 544         msg = "xlate filter - an error occurred in a lower filter";
 545         break;
 546     default:
 547         msg = "xlate filter - returning error";
 548     }
 549     ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r,
 550                   "%s", msg);
 551 }
 552
 553 /* chk_filter_chain() is called once per filter instance; it tries to
 554  * determine if the current filter instance should be disabled because
 555  * its translation is incompatible with the translation of an existing
 556  * instance of the translate filter
 557  *
 558  * Example bad scenario:
 559  *
 560  *   configured filter chain for the request:
 561  *     INCLUDES XLATEOUT(8859-1->UTS-16)
 562  *   configured filter chain for the subrequest:
 563  *     XLATEOUT(8859-1->UTS-16)
 564  *
 565  *   When the subrequest is processed, the filter chain will be
 566  *     XLATEOUT(8859-1->UTS-16) XLATEOUT(8859-1->UTS-16)
 567  *   This makes no sense, so the instance of XLATEOUT added for the
 568  *   subrequest will be noop-ed.
 569  *
 570  * Example good scenario:
 571  *
 572  *   configured filter chain for the request:
 573  *     INCLUDES XLATEOUT(8859-1->UTS-16)
 574  *   configured filter chain for the subrequest:
 575  *     XLATEOUT(IBM-1047->8859-1)
 576  *
 577  *   When the subrequest is processed, the filter chain will be
 578  *     XLATEOUT(IBM-1047->8859-1) XLATEOUT(8859-1->UTS-16)
 579  *   This makes sense, so the instance of XLATEOUT added for the
 580  *   subrequest will be left alone and it will translate from
 581  *   IBM-1047->8859-1.
 582  */
 583 static void chk_filter_chain(ap_filter_t *f)
 584 {
 585     ap_filter_t *curf;
 586     charset_filter_ctx_t *curctx, *last_xlate_ctx = NULL,
 587         *ctx = f->ctx;
 588     int debug = ctx->dc->debug;
 589     int output = !strcasecmp(f->frec->name, XLATEOUT_FILTER_NAME);
 590
 591     if (ctx->noop) {
 592         return;
 593     }
 594
 595     /* walk the filter chain; see if it makes sense for our filter to
 596      * do any translation
 597      */
 598     curf = output ? f->r->output_filters : f->r->input_filters;
 599     while (curf) {
 600         if (!strcasecmp(curf->frec->name, f->frec->name) &&
 601             curf->ctx) {
 602             curctx = (charset_filter_ctx_t *)curf->ctx;
 603             if (!last_xlate_ctx) {
 604                 last_xlate_ctx = curctx;
 605             }
 606             else {
 607                 if (strcmp(last_xlate_ctx->dc->charset_default,
 608                            curctx->dc->charset_source)) {
 609                     /* incompatible translation
 610                      * if our filter instance is incompatible with an instance
 611                      * already in place, noop our instance
 612                      * Notes:
 613                      * . We are only willing to noop our own instance.
 614                      * . It is possible to noop another instance which has not
 615                      *   yet run, but this is not currently implemented.
 616                      *   Hopefully it will not be needed.
 617                      * . It is not possible to noop an instance which has
 618                      *   already run.
 619                      */
 620                     if (last_xlate_ctx == f->ctx) {
 621                         last_xlate_ctx->noop = 1;
 622                         if (debug >= DBGLVL_PMC) {
 623                             const char *symbol = output ? "->" : "<-";
 624
 625                             ap_log_rerror(APLOG_MARK, APLOG_DEBUG,
 626                                           0, f->r,
 627                                           "%s %s - disabling "
 628                                           "translation %s%s%s; existing "
 629                                           "translation %s%s%s",
 630                                           f->r->uri ? "uri" : "file",
 631                                           f->r->uri ? f->r->uri : f->r->filename,
 632                                           last_xlate_ctx->dc->charset_source,
 633                                           symbol,
 634                                           last_xlate_ctx->dc->charset_default,
 635                                           curctx->dc->charset_source,
 636                                           symbol,
 637                                           curctx->dc->charset_default);
 638                         }
 639                     }
 640                     else {
 641                         const char *symbol = output ? "->" : "<-";
 642
 643                         ap_log_rerror(APLOG_MARK, APLOG_ERR,
 644                                       0, f->r,
 645                                       "chk_filter_chain() - can't disable "
 646                                       "translation %s%s%s; existing "
 647                                       "translation %s%s%s",
 648                                       last_xlate_ctx->dc->charset_source,
 649                                       symbol,
 650                                       last_xlate_ctx->dc->charset_default,
 651                                       curctx->dc->charset_source,
 652                                       symbol,
 653                                       curctx->dc->charset_default);
 654                     }
 655                     break;
 656                 }
 657             }
 658         }
 659         curf = curf->next;
 660     }
 661 }
 662
/* xlate_brigade() is used to filter request and response bodies
 *
 * It reads buckets from the front of bb, translates their contents with
 * ctx->xlate and appends the result to the caller's buffer.
 *
 * we'll stop when one of the following occurs:
 * . we run out of buckets (or reach an EOS bucket)
 * . fewer than XLATE_MIN_BUFF_LEFT bytes remain in the output buffer
 * . we hit an error
 *
 * inputs:
 *   ctx:              filter context (translation handle, saved partial
 *                     character, extended error status)
 *   bb:               brigade to process
 *   buffer:           storage to hold the translated characters
 *   buffer_avail:     number of bytes available at buffer
 *
 * outputs:
 *   return value:     APR_SUCCESS or some error code; APR_INCOMPLETE with
 *                     ctx->ees == EES_INCOMPLETE_CHAR if EOS was reached
 *                     while a partial character was still pending
 *   bb:               we've removed any buckets representing the
 *                     translated characters; the eos bucket, if
 *                     present, will be left in the brigade
 *   buffer:           filled in with translated characters
 *   buffer_avail:     updated with the bytes remaining
 *   hit_eos:          did we hit an EOS bucket?  (set to 1 only when the
 *                     EOS bucket is first in bb once we stop)
 */
static apr_status_t xlate_brigade(charset_filter_ctx_t *ctx,
                                  apr_bucket_brigade *bb,
                                  char *buffer,
                                  apr_size_t *buffer_avail,
                                  int *hit_eos)
{
    apr_bucket *b = NULL; /* set to NULL only to quiet some gcc */
    apr_bucket *consumed_bucket;
    const char *bucket;
    apr_size_t bytes_in_bucket; /* total bytes read from current bucket */
    apr_size_t bucket_avail;    /* bytes left in current bucket */
    apr_status_t rv = APR_SUCCESS;

    *hit_eos = 0;
    bucket_avail = 0;
    consumed_bucket = NULL;
    while (1) {
        if (!bucket_avail) { /* no bytes left to process in the current bucket... */
            /* ...so drop the bucket we just finished and fetch the next one */
            if (consumed_bucket) {
                apr_bucket_delete(consumed_bucket);
                consumed_bucket = NULL;
            }
            b = APR_BRIGADE_FIRST(bb);
            if (b == APR_BRIGADE_SENTINEL(bb) ||
                APR_BUCKET_IS_EOS(b)) {
                break;
            }
            rv = apr_bucket_read(b, &bucket, &bytes_in_bucket, APR_BLOCK_READ);
            if (rv != APR_SUCCESS) {
                ctx->ees = EES_BUCKET_READ;
                break;
            }
            bucket_avail = bytes_in_bucket;
            consumed_bucket = b;   /* for axing when we're done reading it */
        }
        /* bucket_avail can still be 0 here if the bucket read was empty;
         * in that case we simply loop around to delete it and move on
         */
        if (bucket_avail) {
            /* We've got data, so translate it. */
            if (ctx->saved) {
                /* Rats... we need to finish a partial character from the previous
                 * bucket.
                 *
                 * Strangely, finish_partial_char() increments the input buffer
                 * pointer but does not increment the output buffer pointer.
                 */
                apr_size_t old_buffer_avail = *buffer_avail;
                rv = finish_partial_char(ctx,
                                         &bucket, &bucket_avail,
                                         &buffer, buffer_avail);
                /* advance the output pointer by however much was written */
                buffer += old_buffer_avail - *buffer_avail;
            }
            else {
                apr_size_t old_buffer_avail = *buffer_avail;
                apr_size_t old_bucket_avail = bucket_avail;
                rv = apr_xlate_conv_buffer(ctx->xlate,
                                           bucket, &bucket_avail,
                                           buffer,
                                           buffer_avail);
                /* the converter only updates the two lengths; move both
                 * pointers forward by the amounts consumed/produced
                 */
                buffer  += old_buffer_avail - *buffer_avail;
                bucket  += old_bucket_avail - bucket_avail;

                if (rv == APR_INCOMPLETE) { /* partial character at end of input */
                    /* We need to save the final byte(s) for next time; we can't
                     * convert it until we look at the next bucket.
                     */
                    rv = set_aside_partial_char(ctx, bucket, bucket_avail);
                    bucket_avail = 0;
                }
            }
            if (rv != APR_SUCCESS) {
                /* bad input byte or partial char too big to store */
                break;
            }
            if (*buffer_avail < XLATE_MIN_BUFF_LEFT) {
                /* Output buffer is nearly full: stop here so the caller can
                 * flush it.  Only the translated part of the current bucket
                 * may be removed from the brigade.
                 */
                /* if any data remains in the current bucket, split there */
                if (bucket_avail) {
                    apr_bucket_split(b, bytes_in_bucket - bucket_avail);
                }
                /* after the split (if any), b holds only bytes already translated */
                apr_bucket_delete(b);
                break;
            }
        }
    }

    if (!APR_BRIGADE_EMPTY(bb)) {
        b = APR_BRIGADE_FIRST(bb);
        if (APR_BUCKET_IS_EOS(b)) {
            /* Leave the eos bucket in the brigade for reporting to
             * subsequent filters.
             */
            *hit_eos = 1;
            if (ctx->saved) {
                /* Oops... we have a partial char from the previous bucket
                 * that won't be completed because there's no more data.
                 */
                rv = APR_INCOMPLETE;
                ctx->ees = EES_INCOMPLETE_CHAR;
            }
        }
    }

    return rv;
}
 787
 788 /* xlate_out_filter() handles (almost) arbitrary conversions from one charset
 789  * to another...
 790  * translation is determined in the fixup hook (find_code_page), which is
 791  * where the filter's context data is set up... the context data gives us
 792  * the translation handle
 793  */
 794 static apr_status_t xlate_out_filter(ap_filter_t *f, apr_bucket_brigade *bb)
 795 {
 796     charset_req_t *reqinfo = ap_get_module_config(f->r->request_config,
 797                                                   &charset_lite_module);
 798     charset_dir_t *dc = ap_get_module_config(f->r->per_dir_config,
 799                                              &charset_lite_module);
 800     charset_filter_ctx_t *ctx = f->ctx;
 801     apr_bucket *dptr, *consumed_bucket;
 802     const char *cur_str;
 803     apr_size_t cur_len, cur_avail;
 804     char tmp[OUTPUT_XLATE_BUF_SIZE];
 805     apr_size_t space_avail;
 806     int done;
 807     apr_status_t rv = APR_SUCCESS;
 808
 809     if (!ctx) {
 810         /* this is SetOutputFilter path; grab the preallocated context,
 811          * if any; note that if we decided not to do anything in an earlier
 812          * handler, we won't even have a reqinfo
 813          */
 814         if (reqinfo) {
 815             ctx = f->ctx = reqinfo->output_ctx;
 816             reqinfo->output_ctx = NULL; /* prevent SNAFU if user coded us twice
 817                                          * in the filter chain; we can't have two
 818                                          * instances using the same context
 819                                          */
 820         }
 821         if (!ctx) {                   /* no idea how to translate; don't do anything */
 822             ctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(charset_filter_ctx_t));
 823             ctx->dc = dc;
 824             ctx->noop = 1;
 825         }
 826     }
 827
 828     /* Opening the output translation (this used to be done in the fixup hook,
 829      * but that was too early: a subsequent type modification, e.g., by a
 830      * CGI script, would go unnoticed. Now we do it in the filter itself.)
 831      */
 832     if (!ctx->noop && ctx->xlate == NULL)
 833     {
 834         const char *mime_type = f->r->content_type ? f->r->content_type : ap_default_type(f->r);
 835
 836         /* XXX When we handle translation of the request body, watch out here as
 837          *     1.3 allowed additional mime types: multipart and
 838          *     application/x-www-form-urlencoded
 839          */
 840         if (strncasecmp(mime_type, "text/", 5) == 0 ||
 841 #if APR_CHARSET_EBCDIC
 842         /* On an EBCDIC machine, be willing to translate mod_autoindex-
 843          * generated output.  Otherwise, it doesn't look too cool.
 844          *
 845          * XXX This isn't a perfect fix because this doesn't trigger us
 846          * to convert from the charset of the source code to ASCII.  The
 847          * general solution seems to be to allow a generator to set an
 848          * indicator in the r specifying that the body is coded in the
 849          * implementation character set (i.e., the charset of the source
 850          * code).  This would get several different types of documents
 851          * translated properly: mod_autoindex output, mod_status output,
 852          * mod_info output, hard-coded error documents, etc.
 853          */
 854         strcmp(mime_type, DIR_MAGIC_TYPE) == 0 ||
 855 #endif
 856         strncasecmp(mime_type, "message/", 8) == 0) {
 857
 858             rv = apr_xlate_open(&ctx->xlate,
 859                         dc->charset_default, dc->charset_source, f->r->pool);
 860             if (rv != APR_SUCCESS) {
 861                 ap_log_rerror(APLOG_MARK, APLOG_ERR, rv, f->r,
 862                               "can't open translation %s->%s",
 863                               dc->charset_source, dc->charset_default);
 864                 ctx->noop = 1;
 865             }
 866             else {
 867                 if (apr_xlate_sb_get(ctx->xlate, &ctx->is_sb) != APR_SUCCESS) {
 868                     ctx->is_sb = 0;
 869                 }
 870             }
 871         }
 872         else {
 873                 ctx->noop = 1;
 874                 if (dc->debug >= DBGLVL_GORY)
 875                     ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r,
 876                                   "mime type is %s; no translation selected",
 877                                   mime_type);
 878             }
 879     }
 880
 881     if (dc->debug >= DBGLVL_GORY) {
 882         ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r,
 883                      "xlate_out_filter() - "
 884                      "charset_source: %s charset_default: %s",
 885                      dc && dc->charset_source ? dc->charset_source : "(none)",
 886                      dc && dc->charset_default ? dc->charset_default : "(none)");
 887     }
 888
 889     if (!ctx->ran) {  /* filter never ran before */
 890         chk_filter_chain(f);
 891         ctx->ran = 1;
 892         if (!ctx->noop && !ctx->is_sb) {
 893             /* We're not converting between two single-byte charsets, so unset
 894              * Content-Length since it is unlikely to remain the same.
 895              */
 896             apr_table_unset(f->r->headers_out, "Content-Length");
 897         }
 898     }
 899
 900     if (ctx->noop) {
 901         return ap_pass_brigade(f->next, bb);
 902     }
 903
 904     dptr = APR_BRIGADE_FIRST(bb);
 905     done = 0;
 906     cur_len = 0;
 907     space_avail = sizeof(tmp);
 908     consumed_bucket = NULL;
 909     while (!done) {
 910         if (!cur_len) { /* no bytes left to process in the current bucket... */
 911             if (consumed_bucket) {
 912                 apr_bucket_delete(consumed_bucket);
 913                 consumed_bucket = NULL;
 914             }
 915             if (dptr == APR_BRIGADE_SENTINEL(bb)) {
 916                 done = 1;
 917                 break;
 918             }
 919             if (APR_BUCKET_IS_EOS(dptr)) {
 920                 done = 1;
 921                 cur_len = -1; /* XXX yuck, but that tells us to send
 922                                  * eos down; when we minimize our bb construction
 923                                  * we'll fix this crap */
 924                 if (ctx->saved) {
 925                     /* Oops... we have a partial char from the previous bucket
 926                      * that won't be completed because there's no more data.
 927                      */
 928                     rv = APR_INCOMPLETE;
 929                     ctx->ees = EES_INCOMPLETE_CHAR;
 930                 }
 931                 break;
 932             }
 933             rv = apr_bucket_read(dptr, &cur_str, &cur_len, APR_BLOCK_READ);
 934             if (rv != APR_SUCCESS) {
 935                 done = 1;
 936                 ctx->ees = EES_BUCKET_READ;
 937                 break;
 938             }
 939             consumed_bucket = dptr; /* for axing when we're done reading it */
 940             dptr = APR_BUCKET_NEXT(dptr); /* get ready for when we access the
 941                                           * next bucket */
 942         }
 943         /* Try to fill up our tmp buffer with translated data. */
 944         cur_avail = cur_len;
 945
 946         if (cur_len) { /* maybe we just hit the end of a pipe (len = 0) ? */
 947             if (ctx->saved) {
 948                 /* Rats... we need to finish a partial character from the previous
 949                  * bucket.
 950                  */
 951                 char *tmp_tmp;
 952
 953                 tmp_tmp = tmp + sizeof(tmp) - space_avail;
 954                 rv = finish_partial_char(ctx,
 955                                          &cur_str, &cur_len,
 956                                          &tmp_tmp, &space_avail);
 957             }
 958             else {
 959                 rv = apr_xlate_conv_buffer(ctx->xlate,
 960                                            cur_str, &cur_avail,
 961                                            tmp + sizeof(tmp) - space_avail, &space_avail);
 962
 963                 /* Update input ptr and len after consuming some bytes */
 964                 cur_str += cur_len - cur_avail;
 965                 cur_len = cur_avail;
 966
 967                 if (rv == APR_INCOMPLETE) { /* partial character at end of input */
 968                     /* We need to save the final byte(s) for next time; we can't
 969                      * convert it until we look at the next bucket.
 970                      */
 971                     rv = set_aside_partial_char(ctx, cur_str, cur_len);
 972                     cur_len = 0;
 973                 }
 974             }
 975         }
 976
 977         if (rv != APR_SUCCESS) {
 978             /* bad input byte or partial char too big to store */
 979             done = 1;
 980         }
 981
 982         if (space_avail < XLATE_MIN_BUFF_LEFT) {
 983             /* It is time to flush, as there is not enough space left in the
 984              * current output buffer to bother with converting more data.
 985              */
 986             rv = send_downstream(f, tmp, sizeof(tmp) - space_avail);
 987             if (rv != APR_SUCCESS) {
 988                 done = 1;
 989             }
 990
 991             /* tmp is now empty */
 992             space_avail = sizeof(tmp);
 993         }
 994     }
 995
 996     if (rv == APR_SUCCESS) {
 997         if (space_avail < sizeof(tmp)) { /* gotta write out what we converted */
 998             rv = send_downstream(f, tmp, sizeof(tmp) - space_avail);
 999         }
1000     }
1001     if (rv == APR_SUCCESS) {
1002         if (cur_len == -1) {
1003             rv = send_eos(f);
1004         }
1005     }
1006     else {
1007         log_xlate_error(f, rv);
1008     }
1009
1010     return rv;
1011 }
1012
1013 static int xlate_in_filter(ap_filter_t *f, apr_bucket_brigade *bb,
1014                            ap_input_mode_t mode, apr_read_type_e block,
1015                            apr_off_t readbytes)
1016 {
1017     apr_status_t rv;
1018     charset_req_t *reqinfo = ap_get_module_config(f->r->request_config,
1019                                                   &charset_lite_module);
1020     charset_dir_t *dc = ap_get_module_config(f->r->per_dir_config,
1021                                              &charset_lite_module);
1022     charset_filter_ctx_t *ctx = f->ctx;
1023     apr_size_t buffer_size;
1024     int hit_eos;
1025
1026     if (!ctx) {
1027         /* this is SetInputFilter path; grab the preallocated context,
1028          * if any; note that if we decided not to do anything in an earlier
1029          * handler, we won't even have a reqinfo
1030          */
1031         if (reqinfo) {
1032             ctx = f->ctx = reqinfo->input_ctx;
1033             reqinfo->input_ctx = NULL; /* prevent SNAFU if user coded us twice
1034                                         * in the filter chain; we can't have two
1035                                         * instances using the same context
1036                                         */
1037         }
1038         if (!ctx) {                   /* no idea how to translate; don't do anything */
1039             ctx = f->ctx = apr_pcalloc(f->r->pool, sizeof(charset_filter_ctx_t));
1040             ctx->dc = dc;
1041             ctx->noop = 1;
1042         }
1043     }
1044
1045     if (dc->debug >= DBGLVL_GORY) {
1046         ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r,
1047                      "xlate_in_filter() - "
1048                      "charset_source: %s charset_default: %s",
1049                      dc && dc->charset_source ? dc->charset_source : "(none)",
1050                      dc && dc->charset_default ? dc->charset_default : "(none)");
1051     }
1052
1053     if (!ctx->ran) {  /* filter never ran before */
1054         chk_filter_chain(f);
1055         ctx->ran = 1;
1056         if (!ctx->noop && !ctx->is_sb) {
1057             /* We're not converting between two single-byte charsets, so note
1058              * that some handlers can't deal with it.
1059              * It doesn't help to unset Content-Length in the input header
1060              * table since in all likelihood the handler has already seen it.
1061              */
1062             if (dc->debug >= DBGLVL_PMC) {
1063                 ap_log_rerror(APLOG_MARK, APLOG_DEBUG, 0, f->r,
1064                               "Request body length may change, breaking some requests");
1065             }
1066         }
1067     }
1068
1069     if (ctx->noop) {
1070         return ap_get_brigade(f->next, bb, mode, block, readbytes);
1071     }
1072
1073     if (APR_BRIGADE_EMPTY(ctx->bb)) {
1074         if ((rv = ap_get_brigade(f->next, bb, mode, block,
1075                                  readbytes)) != APR_SUCCESS) {
1076             return rv;
1077         }
1078     }
1079     else {
1080         APR_BRIGADE_PREPEND(bb, ctx->bb); /* first use the leftovers */
1081     }
1082
1083     buffer_size = INPUT_XLATE_BUF_SIZE;
1084     rv = xlate_brigade(ctx, bb, ctx->tmp, &buffer_size, &hit_eos);
1085     if (rv == APR_SUCCESS) {
1086         if (!hit_eos) {
1087             /* move anything leftover into our context for next time;
1088              * we don't currently "set aside" since the data came from
1089              * down below, but I suspect that for long-term we need to
1090              * do that
1091              */
1092             APR_BRIGADE_CONCAT(ctx->bb, bb);
1093         }
1094         if (buffer_size < INPUT_XLATE_BUF_SIZE) { /* do we have output? */
1095             apr_bucket *e;
1096
1097             e = apr_bucket_heap_create(ctx->tmp,
1098                                        INPUT_XLATE_BUF_SIZE - buffer_size,
1099                                        NULL, f->r->connection->bucket_alloc);
1100             /* make sure we insert at the head, because there may be
1101              * an eos bucket already there, and the eos bucket should
1102              * come after the data
1103              */
1104             APR_BRIGADE_INSERT_HEAD(bb, e);
1105         }
1106         else {
1107             /* XXX need to get some more data... what if the last brigade
1108              * we got had only the first byte of a multibyte char?  we need
1109              * to grab more data from the network instead of returning an
1110              * empty brigade
1111              */
1112         }
1113     }
1114     else {
1115         log_xlate_error(f, rv);
1116     }
1117
1118     return rv;
1119 }
1120
1121 static const command_rec cmds[] =
1122 {
1123     AP_INIT_TAKE1("CharsetSourceEnc",
1124                   add_charset_source,
1125                   NULL,
1126                   OR_FILEINFO,
1127                   "source (html,cgi,ssi) file charset"),
1128     AP_INIT_TAKE1("CharsetDefault",
1129                   add_charset_default,
1130                   NULL,
1131                   OR_FILEINFO,
1132                   "name of default charset"),
1133     AP_INIT_ITERATE("CharsetOptions",
1134                     add_charset_options,
1135                     NULL,
1136                     OR_FILEINFO,
1137                     "valid options: ImplicitAdd, NoImplicitAdd, DebugLevel=n"),
1138     {NULL}
1139 };
1140
1141 static void charset_register_hooks(apr_pool_t *p)
1142 {
1143     ap_hook_fixups(find_code_page, NULL, NULL, APR_HOOK_MIDDLE);
1144     ap_hook_insert_filter(xlate_insert_filter, NULL, NULL, APR_HOOK_REALLY_LAST);
1145     ap_register_output_filter(XLATEOUT_FILTER_NAME, xlate_out_filter, NULL,
1146                               AP_FTYPE_RESOURCE);
1147     ap_register_input_filter(XLATEIN_FILTER_NAME, xlate_in_filter, NULL,
1148                              AP_FTYPE_RESOURCE);
1149 }
1150
1151 module AP_MODULE_DECLARE_DATA charset_lite_module =
1152 {
1153     STANDARD20_MODULE_STUFF,
1154     create_charset_dir_conf,
1155     merge_charset_dir_conf,
1156     NULL,
1157     NULL,
1158     cmds,
1159     charset_register_hooks
1160 };
1161