From d0024844306c7300460b5dc3ffa525052ad96c29 Mon Sep 17 00:00:00 2001 From: David Cramer Date: Tue, 10 Jan 2012 18:36:53 +0000 Subject: [PATCH] Handle exceptional cases listed in the Porter 2 stemming algo --- .../content/search/stemmers/en_stemmer.js | 67 ++++++++++++++++--- 1 file changed, 57 insertions(+), 10 deletions(-) diff --git a/xsl/webhelp/template/content/search/stemmers/en_stemmer.js b/xsl/webhelp/template/content/search/stemmers/en_stemmer.js index 105a6b338..8533b24e2 100644 --- a/xsl/webhelp/template/content/search/stemmers/en_stemmer.js +++ b/xsl/webhelp/template/content/search/stemmers/en_stemmer.js @@ -6,9 +6,8 @@ // // see also http://www.tartarus.org/~martin/PorterStemmer -// Release 1 be 'andargor', Jul 2004 -// Release 2 (substantially revised) by Christopher McKenzie, Aug 2009 - +// Release 1 +// Derived from (http://tartarus.org/~martin/PorterStemmer/js.txt) - cjm (iizuu) Aug 24, 2009 var stemmer = (function(){ var step2list = { @@ -98,19 +97,17 @@ var stemmer = (function(){ re2 = /(at|bl|iz)$/; re3 = new RegExp("([^aeiouylsz])\\1$"); re4 = new RegExp("^" + C + v + "[^aeiouwxy]$"); - if (re2.test(w)) { w = w + "e"; } + if (re2.test(w)) { w = w + "e"; } else if (re3.test(w)) { re = /.$/; w = w.replace(re,""); } else if (re4.test(w)) { w = w + "e"; } } } // Step 1c - re = new RegExp("^(.+?" + C + ")y$"); - if (re.test(w) && w != "say") { + re = new RegExp("^(.+" + c + ")y$"); + if (re.test(w)) { var fp = re.exec(w); stem = fp[1]; - //re = new RegExp(s_v); - //if (re.test(stem)) { w = stem + "i"; } w = stem + "i"; } @@ -183,6 +180,56 @@ var stemmer = (function(){ w = firstch.toLowerCase() + w.substr(1); } - return w; + // See http://snowball.tartarus.org/algorithms/english/stemmer.html + // "Exceptional forms in general" + var specialWords = { + "skis" : "ski", + "skies" : "sky", + "dying" : "die", + "lying" : "lie", + "tying" : "tie", + "idly" : "idl", + "gently" : "gentl", + "ugly" : "ugli", + "early": "earli", + "only": "onli", + "singly": "singl" + }; + + if(specialWords[origword]){ + w = specialWords[origword]; + } + + if( "sky news howe atlas cosmos bias \ + andes inning outing canning herring \ + earring proceed exceed succeed".indexOf(origword) !== -1 ){ + w = origword; + } + + // These are all overstemmed as gener- + // What about commun- words? + re = /.*generate?s?d?(ing)?$/; + if( re.test(origword) ){ + w = w + 'at'; + } + re = /.*general(ly)?$/; + if( re.test(origword) ){ + w = w + 'al'; + } + re = /.*generic(ally)?$/; + if( re.test(origword) ){ + w = w + 'ic'; + } + re = /.*generous(ly)?$/; + if( re.test(origword) ){ + w = w + 'ous'; + } + // These are overstemmed as commun- + re = /.*communit(ies)?y?/; + if( re.test(origword) ){ + w = w + 'iti'; + } + + return w; } -})(); \ No newline at end of file +})(); -- 2.40.0