1 /* ====================================================================
2 * The Apache Software License, Version 1.1
4 * Copyright (c) 2000-2001 The Apache Software Foundation. All rights
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
19 * 3. The end-user documentation included with the redistribution,
20 * if any, must include the following acknowledgment:
21 * "This product includes software developed by the
22 * Apache Software Foundation (http://www.apache.org/)."
23 * Alternately, this acknowledgment may appear in the software itself,
24 * if and wherever such third-party acknowledgments normally appear.
26 * 4. The names "Apache" and "Apache Software Foundation" must
27 * not be used to endorse or promote products derived from this
28 * software without prior written permission. For written
29 * permission, please contact apache@apache.org.
31 * 5. Products derived from this software may not be called "Apache",
32 * nor may "Apache" appear in their name, without prior written
33 * permission of the Apache Software Foundation.
35 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
36 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
37 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
38 * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
41 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
42 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
43 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
44 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
45 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
47 * ====================================================================
49 * This software consists of voluntary contributions made by many
50 * individuals on behalf of the Apache Software Foundation. For more
51 * information on the Apache Software Foundation, please see
52 * <http://www.apache.org/>.
54 * Portions of this software are based upon public domain software
55 * originally written at the National Center for Supercomputing Applications,
56 * University of Illinois, Urbana-Champaign.
60 #include "apr_file_io.h"
61 #include "apr_strings.h"
64 #define APR_WANT_STRFUNC
67 #define WANT_BASENAME_MATCH
70 #include "http_core.h"
71 #include "http_config.h"
72 #include "http_request.h"
/* mod_speling.c - by Alexei Kosut <akosut@organic.com> June, 1996
 *
 * This module is transparent, and simple. It attempts to correct
 * misspellings of URLs that users might have entered, namely by checking
 * capitalizations. If it finds a match, it sends a redirect.
 *
 * 08-Aug-1997 <Martin.Kraemer@Mch.SNI.De>
 * o Upgraded module interface to apache_1.3a2-dev API (more NULL's in
 *   speling_module).
 * o Integrated tcsh's "spelling correction" routine which allows one
 *   misspelling (character insertion/omission/typo/transposition).
 *   Rewrote it to ignore case as well. This ought to catch the majority
 *   of misspelled requests.
 * o Commented out the second pass where files' suffixes are stripped.
 *   Given the better hit rate of the first pass, this rather ugly
 *   (request index.html, receive index.db ?!?!) solution can be
 *   omitted.
 * o wrote a "kind of" html page for mod_speling
 *
 * Activate it with "CheckSpelling On"
 */
97 AP_MODULE_DECLARE_DATA module speling_module;
104 * Create a configuration specific to this module for a server or directory
105 * location, and fill it with the default settings.
107 * The API says that in the absence of a merge function, the record for the
108 * closest ancestor is used exclusively. That's what we want, so we don't
109 * bother to have such a function.
112 static void *mkconfig(apr_pool_t *p)
114 spconfig *cfg = apr_pcalloc(p, sizeof(spconfig));
121 * Respond to a callback to create configuration record for a server or
124 static void *create_mconfig_for_server(apr_pool_t *p, server_rec *s)
130 * Respond to a callback to create a config record for a specific directory.
132 static void *create_mconfig_for_directory(apr_pool_t *p, char *dir)
138 * Handler for the CheckSpelling directive, which is FLAG.
140 static const char *set_speling(cmd_parms *cmd, void *mconfig, int arg)
142 spconfig *cfg = (spconfig *) mconfig;
149 * Define the directives specific to this module. This structure is referenced
150 * later by the 'module' structure.
152 static const command_rec speling_cmds[] =
154 AP_INIT_FLAG("CheckSpelling", set_speling, NULL, OR_OPTIONS,
155 "whether or not to fix miscapitalized/misspelled requests"),
/*
 * Classification of how a directory entry differs from the requested name.
 * The numeric value doubles as the sort key (lower sorts first, see
 * sort_by_quality()) and as the index into sp_reason_str[].
 *
 * NOTE(review): only SP_MISCAPITALIZED and SP_TRANSPOSITION are visible with
 * their values in the damaged listing; the remaining enumerators are the ones
 * the code references, numbered to follow the gaps in the listing -- confirm
 * against the pristine file.
 */
typedef enum {
    SP_IDENTICAL = 0,
    SP_MISCAPITALIZED = 1,
    SP_TRANSPOSITION = 2,
    SP_MISSINGCHAR = 3,
    SP_EXTRACHAR = 4,
    SP_SIMPLETYPO = 5,
    SP_VERYDIFFERENT = 6
} sp_reason;

/* One candidate replacement found while scanning the directory. */
typedef struct {
    const char *name;           /* directory entry name */
    sp_reason quality;          /* how it differs from the requested name */
} misspelled_file;
/*
 * Human-readable reason for each match quality, indexed by the quality
 * value (check_speling() does sp_reason_str[(int) quality]).
 *
 * NOTE(review): only the entries at index 2 and 5 are visible in the damaged
 * listing; the others are restored to fill the gaps -- confirm the exact
 * wording against the pristine file.
 */
static const char *sp_reason_str[] =
{
    "identical",
    "miscapitalized",
    "transposed characters",
    "character missing",
    "extra character",
    "mistyped character",
    "common basename",
};
186 * spdist() is taken from Kernighan & Pike,
187 * _The_UNIX_Programming_Environment_
188 * and adapted somewhat to correspond better to psychological reality.
189 * (Note the changes to the return values)
191 * According to Pollock and Zamora, CACM April 1984 (V. 27, No. 4),
192 * page 363, the correct order for this is:
193 * OMISSION = TRANSPOSITION > INSERTION > SUBSTITUTION
194 * thus, it was exactly backwards in the old version. -- PWP
196 * This routine was taken out of tcsh's spelling correction code
197 * (tcsh-6.07.04) and re-converted to apache data types ("char" type
198 * instead of tcsh's NLS'ed "Char"). Plus it now ignores the case
199 * during comparisons, so is a "approximate strcasecmp()".
200 * NOTE that is still allows only _one_ real "typo",
201 * it does NOT try to correct multiple errors.
204 static sp_reason spdist(const char *s, const char *t)
206 for (; apr_tolower(*s) == apr_tolower(*t); t++, s++) {
208 return SP_MISCAPITALIZED; /* exact match (sans case) */
213 if (s[1] && t[1] && apr_tolower(*s) == apr_tolower(t[1])
214 && apr_tolower(*t) == apr_tolower(s[1])
215 && strcasecmp(s + 2, t + 2) == 0) {
216 return SP_TRANSPOSITION; /* transposition */
218 if (strcasecmp(s + 1, t + 1) == 0) {
219 return SP_SIMPLETYPO; /* 1 char mismatch */
222 if (strcasecmp(s + 1, t) == 0) {
223 return SP_EXTRACHAR; /* extra character */
226 if (*t && strcasecmp(s, t + 1) == 0) {
227 return SP_MISSINGCHAR; /* missing character */
229 return SP_VERYDIFFERENT; /* distance too large to fix. */
232 static int sort_by_quality(const void *left, const void *rite)
234 return (int) (((misspelled_file *) left)->quality)
235 - (int) (((misspelled_file *) rite)->quality);
238 static int check_speling(request_rec *r)
241 char *good, *bad, *postgood, *url;
243 int filoc, dotloc, urlen, pglen;
244 apr_array_header_t *candidates = NULL;
247 cfg = ap_get_module_config(r->per_dir_config, &speling_module);
252 /* We only want to worry about GETs */
253 if (r->method_number != M_GET) {
257 /* We've already got a file of some kind or another */
258 if (r->proxyreq || (r->finfo.filetype != 0)) {
262 /* This is a sub request - don't mess with it */
268 * The request should end up looking like this:
269 * r->uri: /correct-url/mispelling/more
270 * r->filename: /correct-file/mispelling r->path_info: /more
272 * So we do this in steps. First break r->filename into two pieces
275 filoc = ap_rind(r->filename, '/');
277 * Don't do anything if the request doesn't contain a slash, or
280 if (filoc == -1 || strcmp(r->uri, "/") == 0) {
284 /* good = /correct-file */
285 good = apr_pstrndup(r->pool, r->filename, filoc);
286 /* bad = mispelling */
287 bad = apr_pstrdup(r->pool, r->filename + filoc + 1);
288 /* postgood = mispelling/more */
289 postgood = apr_pstrcat(r->pool, bad, r->path_info, NULL);
291 urlen = strlen(r->uri);
292 pglen = strlen(postgood);
294 /* Check to see if the URL pieces add up */
295 if (strcmp(postgood, r->uri + (urlen - pglen))) {
299 /* url = /correct-url */
300 url = apr_pstrndup(r->pool, r->uri, (urlen - pglen));
302 /* Now open the directory and do ourselves a check... */
303 if (apr_dir_open(&dir, good, r->pool) != APR_SUCCESS) {
304 /* Oops, not a directory... */
308 candidates = apr_array_make(r->pool, 2, sizeof(misspelled_file));
310 dotloc = ap_ind(bad, '.');
312 dotloc = strlen(bad);
315 while (apr_dir_read(&dirent, APR_FINFO_DIRENT, dir) == APR_SUCCESS) {
319 * If we end up with a "fixed" URL which is identical to the
320 * requested one, we must have found a broken symlink or some such.
321 * Do _not_ try to redirect this, it causes a loop!
323 if (strcmp(bad, dirent.name) == 0) {
329 * miscapitalization errors are checked first (like, e.g., lower case
330 * file, upper case request)
332 else if (strcasecmp(bad, dirent.name) == 0) {
333 misspelled_file *sp_new;
335 sp_new = (misspelled_file *) apr_array_push(candidates);
336 sp_new->name = apr_pstrdup(r->pool, dirent.name);
337 sp_new->quality = SP_MISCAPITALIZED;
341 * simple typing errors are checked next (like, e.g.,
342 * missing/extra/transposed char)
344 else if ((q = spdist(bad, dirent.name)) != SP_VERYDIFFERENT) {
345 misspelled_file *sp_new;
347 sp_new = (misspelled_file *) apr_array_push(candidates);
348 sp_new->name = apr_pstrdup(r->pool, dirent.name);
353 * The spdist() should have found the majority of the misspelled
354 * requests. It is of questionable use to continue looking for
355 * files with the same base name, but potentially of totally wrong
356 * type (index.html <-> index.db).
357 * I would propose to not set the WANT_BASENAME_MATCH define.
358 * 08-Aug-1997 <Martin.Kraemer@Mch.SNI.De>
360 * However, Alexei replied giving some reasons to add it anyway:
361 * > Oh, by the way, I remembered why having the
362 * > extension-stripping-and-matching stuff is a good idea:
364 * > If you're using MultiViews, and have a file named foobar.html,
365 * > which you refer to as "foobar", and someone tried to access
366 * > "Foobar", mod_speling won't find it, because it won't find
367 * > anything matching that spelling. With the extension-munging,
368 * > it would locate "foobar.html". Not perfect, but I ran into
369 * > that problem when I first wrote the module.
372 #ifdef WANT_BASENAME_MATCH
374 * Okay... we didn't find anything. Now we take out the hard-core
375 * power tools. There are several cases here. Someone might have
376 * entered a wrong extension (.htm instead of .html or vice
377 * versa) or the document could be negotiated. At any rate, now
378 * we just compare stuff before the first dot. If it matches, we
379 * figure we got us a match. This can result in wrong things if
380 * there are files of different content types but the same prefix
381 * (e.g. foo.gif and foo.html) This code will pick the first one
382 * it finds. Better than a Not Found, though.
384 int entloc = ap_ind(dirent.name, '.');
386 entloc = strlen(dirent.name);
389 if ((dotloc == entloc)
390 && !strncasecmp(bad, dirent.name, dotloc)) {
391 misspelled_file *sp_new;
393 sp_new = (misspelled_file *) apr_array_push(candidates);
394 sp_new->name = apr_pstrdup(r->pool, dirent.name);
395 sp_new->quality = SP_VERYDIFFERENT;
402 if (candidates->nelts != 0) {
403 /* Wow... we found us a mispelling. Construct a fixed url */
406 misspelled_file *variant = (misspelled_file *) candidates->elts;
409 ref = apr_table_get(r->headers_in, "Referer");
411 qsort((void *) candidates->elts, candidates->nelts,
412 sizeof(misspelled_file), sort_by_quality);
415 * Conditions for immediate redirection:
416 * a) the first candidate was not found by stripping the suffix
417 * AND b) there exists only one candidate OR the best match is not
419 * then return a redirection right away.
421 if (variant[0].quality != SP_VERYDIFFERENT
422 && (candidates->nelts == 1
423 || variant[0].quality != variant[1].quality)) {
425 nuri = ap_escape_uri(r->pool, apr_pstrcat(r->pool, url,
427 r->path_info, NULL));
428 if (r->parsed_uri.query)
429 nuri = apr_pstrcat(r->pool, nuri, "?", r->parsed_uri.query, NULL);
431 apr_table_setn(r->headers_out, "Location",
432 ap_construct_url(r->pool, nuri, r));
434 ap_log_rerror(APLOG_MARK, APLOG_NOERRNO | APLOG_INFO, APR_SUCCESS,
436 ref ? "Fixed spelling: %s to %s from %s"
437 : "Fixed spelling: %s to %s",
440 return HTTP_MOVED_PERMANENTLY;
443 * Otherwise, a "[300] Multiple Choices" list with the variants is
449 apr_pool_t *sub_pool;
450 apr_array_header_t *t;
451 apr_array_header_t *v;
454 if (r->main == NULL) {
460 notes = r->main->notes;
463 if (apr_pool_create(&sub_pool, p) != APR_SUCCESS)
466 t = apr_array_make(sub_pool, candidates->nelts * 8 + 8,
468 v = apr_array_make(sub_pool, candidates->nelts * 5,
471 /* Generate the response text. */
473 *(const char **)apr_array_push(t) =
474 "The document name you requested (<code>";
475 *(const char **)apr_array_push(t) = ap_escape_html(sub_pool, r->uri);
476 *(const char **)apr_array_push(t) =
477 "</code>) could not be found on this server.\n"
478 "However, we found documents with names similar "
479 "to the one you requested.<p>"
480 "Available documents:\n<ul>\n";
482 for (i = 0; i < candidates->nelts; ++i) {
486 reason = sp_reason_str[(int) (variant[i].quality)];
487 /* The format isn't very neat... */
488 vuri = apr_pstrcat(sub_pool, url, variant[i].name, r->path_info,
489 (r->parsed_uri.query != NULL) ? "?" : "",
490 (r->parsed_uri.query != NULL)
491 ? r->parsed_uri.query : "",
493 *(const char **)apr_array_push(v) = "\"";
494 *(const char **)apr_array_push(v) = ap_escape_uri(sub_pool, vuri);
495 *(const char **)apr_array_push(v) = "\";\"";
496 *(const char **)apr_array_push(v) = reason;
497 *(const char **)apr_array_push(v) = "\"";
499 *(const char **)apr_array_push(t) = "<li><a href=\"";
500 *(const char **)apr_array_push(t) = ap_escape_uri(sub_pool, vuri);
501 *(const char **)apr_array_push(t) = "\">";
502 *(const char **)apr_array_push(t) = ap_escape_html(sub_pool, vuri);
503 *(const char **)apr_array_push(t) = "</a> (";
504 *(const char **)apr_array_push(t) = reason;
505 *(const char **)apr_array_push(t) = ")\n";
508 * when we have printed the "close matches" and there are
509 * more "distant matches" (matched by stripping the suffix),
510 * then we insert an additional separator text to suggest
511 * that the user LOOK CLOSELY whether these are really the
514 if (i > 0 && i < candidates->nelts - 1
515 && variant[i].quality != SP_VERYDIFFERENT
516 && variant[i + 1].quality == SP_VERYDIFFERENT) {
517 *(const char **)apr_array_push(t) =
518 "</ul>\nFurthermore, the following related "
519 "documents were found:\n<ul>\n";
522 *(const char **)apr_array_push(t) = "</ul>\n";
524 /* If we know there was a referring page, add a note: */
526 *(const char **)apr_array_push(t) =
527 "Please consider informing the owner of the "
529 *(const char **)apr_array_push(t) = ap_escape_uri(sub_pool, ref);
530 *(const char **)apr_array_push(t) = "\">referring page</a> "
531 "about the broken link.\n";
535 /* Pass our apr_table_t to http_protocol.c (see mod_negotiation): */
536 apr_table_setn(notes, "variant-list", apr_array_pstrcat(p, t, 0));
538 apr_table_mergen(r->subprocess_env, "VARIANTS",
539 apr_array_pstrcat(p, v, ','));
541 apr_pool_destroy(sub_pool);
543 ap_log_rerror(APLOG_MARK, APLOG_NOERRNO | APLOG_INFO, 0, r,
544 ref ? "Spelling fix: %s: %d candidates from %s"
545 : "Spelling fix: %s: %d candidates",
546 r->uri, candidates->nelts, ref);
548 return HTTP_MULTIPLE_CHOICES;
555 static void register_hooks(apr_pool_t *p)
557 ap_hook_fixups(check_speling,NULL,NULL,APR_HOOK_LAST);
560 module AP_MODULE_DECLARE_DATA speling_module =
562 STANDARD20_MODULE_STUFF,
563 create_mconfig_for_directory, /* create per-dir config */
564 NULL, /* merge per-dir config */
565 create_mconfig_for_server, /* server config */
566 NULL, /* merge server config */
567 speling_cmds, /* command apr_table_t */
568 register_hooks /* register hooks */