From ceaed7d4c68a4f4e2a03a1a81a19a0df9bf15837 Mon Sep 17 00:00:00 2001 From: Kasun Gajasinghe Date: Tue, 10 Aug 2010 15:57:31 +0000 Subject: [PATCH] Added stemming support for French(fr). Wrote the stemmer for client-side using french language algorithm in Snowball. Committing files necessary for UI, which were missing in the previous commit. --- .../com/nexwave/nquindexer/IndexerTask.java | 4 +- .../com/nexwave/nquindexer/SaxHTMLIndex.java | 3 + .../com/nexwave/stemmer/snowball/TestApp.java | 5 +- ...{frenchStemmer.java => FrenchStemmer.java} | 298 ++++++++--------- .../{gov-header-bg.gif => header-bg.gif} | Bin .../template/common/images/highlight-blue.gif | Bin 0 -> 471 bytes .../common/images/highlight-yellow.gif | Bin 0 -> 331 bytes .../content/search/stemmers/fr_stemmer.js | 299 ++++++++++++++++++ 8 files changed, 455 insertions(+), 154 deletions(-) rename xsl/webhelp/indexer/src/com/nexwave/stemmer/snowball/ext/{frenchStemmer.java => FrenchStemmer.java} (85%) rename xsl/webhelp/template/common/images/{gov-header-bg.gif => header-bg.gif} (100%) create mode 100644 xsl/webhelp/template/common/images/highlight-blue.gif create mode 100644 xsl/webhelp/template/common/images/highlight-yellow.gif create mode 100644 xsl/webhelp/template/content/search/stemmers/fr_stemmer.js diff --git a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java index cfde68342..b926bad19 100755 --- a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java +++ b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java @@ -55,8 +55,8 @@ public class IndexerTask{ //supported languages: add new additions to this. don't include country codes to the end such as en_US or en_UK, // as stemmers doesn't find a difference between them. - private String[] supportedLanguages= {"en", "de", "cn", "ja", "ko"}; //currently extended support available for - // English, German, and CJK (Chinese, Japanese, Korean) languages only. + private String[] supportedLanguages= {"en", "de", "fr", "cn", "ja", "ko"}; //currently extended support available for + // English, German, French and CJK (Chinese, Japanese, Korean) languages only. // Indexing features: words to remove private ArrayList cleanUpStrings = null; diff --git a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java index a83453d96..d9538aed5 100755 --- a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java +++ b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java @@ -12,6 +12,7 @@ import com.nexwave.nsidita.DocFileInfo; //Stemmers import com.nexwave.stemmer.snowball.SnowballStemmer; import com.nexwave.stemmer.snowball.ext.EnglishStemmer; +import com.nexwave.stemmer.snowball.ext.FrenchStemmer; import com.nexwave.stemmer.snowball.ext.GermanStemmer; //CJK Tokenizing @@ -128,6 +129,8 @@ public class SaxHTMLIndex extends SaxDocFileParser{ stemmer = new EnglishStemmer(); } else if (indexerLanguage.equalsIgnoreCase("de")){ stemmer= new GermanStemmer(); + } else if (indexerLanguage.equalsIgnoreCase("fr")){ + stemmer= new FrenchStemmer(); } else { stemmer = null;//Languages which stemming is not yet supproted.So, No stemmers will be used. } diff --git a/xsl/webhelp/indexer/src/com/nexwave/stemmer/snowball/TestApp.java b/xsl/webhelp/indexer/src/com/nexwave/stemmer/snowball/TestApp.java index 0a251a46a..e83d6f499 100755 --- a/xsl/webhelp/indexer/src/com/nexwave/stemmer/snowball/TestApp.java +++ b/xsl/webhelp/indexer/src/com/nexwave/stemmer/snowball/TestApp.java @@ -1,8 +1,7 @@ package com.nexwave.stemmer.snowball; -import com.nexwave.stemmer.snowball.ext.EnglishStemmer; -import com.nexwave.stemmer.snowball.ext.frenchStemmer; +import com.nexwave.stemmer.snowball.ext.FrenchStemmer; public class TestApp { private static void usage() @@ -19,7 +18,7 @@ public class TestApp { Class stemClass = Class.forName("com.nexwave.stemmer.snowball.ext." + args[0] + "Stemmer");*/ // SnowballStemmer stemmer = (SnowballStemmer) stemClass.newInstance(); - SnowballStemmer stemmer = new frenchStemmer();//new EnglishStemmer();//= new GermanStemmer(); + SnowballStemmer stemmer = new FrenchStemmer();//new EnglishStemmer();//= new GermanStemmer(); StringBuffer input = new StringBuffer(); /*Reader reader; diff --git a/xsl/webhelp/indexer/src/com/nexwave/stemmer/snowball/ext/frenchStemmer.java b/xsl/webhelp/indexer/src/com/nexwave/stemmer/snowball/ext/FrenchStemmer.java similarity index 85% rename from xsl/webhelp/indexer/src/com/nexwave/stemmer/snowball/ext/frenchStemmer.java rename to xsl/webhelp/indexer/src/com/nexwave/stemmer/snowball/ext/FrenchStemmer.java index f60770ed2..27e8001a5 100755 --- a/xsl/webhelp/indexer/src/com/nexwave/stemmer/snowball/ext/frenchStemmer.java +++ b/xsl/webhelp/indexer/src/com/nexwave/stemmer/snowball/ext/FrenchStemmer.java @@ -9,181 +9,181 @@ import com.nexwave.stemmer.snowball.Among; * It implements the stemming algorithm defined by a snowball script. */ -public class frenchStemmer extends com.nexwave.stemmer.snowball.SnowballStemmer { +public class FrenchStemmer extends com.nexwave.stemmer.snowball.SnowballStemmer { private static final long serialVersionUID = 1L; - private final static frenchStemmer methodObject = new frenchStemmer (); + private final static FrenchStemmer METHOD_OBJECT = new FrenchStemmer(); private final static Among a_0[] = { - new Among ( "col", -1, -1, "", methodObject ), - new Among ( "par", -1, -1, "", methodObject ), - new Among ( "tap", -1, -1, "", methodObject ) + new Among ( "col", -1, -1, "", METHOD_OBJECT), + new Among ( "par", -1, -1, "", METHOD_OBJECT), + new Among ( "tap", -1, -1, "", METHOD_OBJECT) }; private final static Among a_1[] = { - new Among ( "", -1, 4, "", methodObject ), - new Among ( "I", 0, 1, "", methodObject ), - new Among ( "U", 0, 2, "", methodObject ), - new Among ( "Y", 0, 3, "", methodObject ) + new Among ( "", -1, 4, "", METHOD_OBJECT), + new Among ( "I", 0, 1, "", METHOD_OBJECT), + new Among ( "U", 0, 2, "", METHOD_OBJECT), + new Among ( "Y", 0, 3, "", METHOD_OBJECT) }; private final static Among a_2[] = { - new Among ( "iqU", -1, 3, "", methodObject ), - new Among ( "abl", -1, 3, "", methodObject ), - new Among ( "I\u00E8r", -1, 4, "", methodObject ), - new Among ( "i\u00E8r", -1, 4, "", methodObject ), - new Among ( "eus", -1, 2, "", methodObject ), - new Among ( "iv", -1, 1, "", methodObject ) + new Among ( "iqU", -1, 3, "", METHOD_OBJECT), + new Among ( "abl", -1, 3, "", METHOD_OBJECT), + new Among ( "I\u00E8r", -1, 4, "", METHOD_OBJECT), + new Among ( "i\u00E8r", -1, 4, "", METHOD_OBJECT), + new Among ( "eus", -1, 2, "", METHOD_OBJECT), + new Among ( "iv", -1, 1, "", METHOD_OBJECT) }; private final static Among a_3[] = { - new Among ( "ic", -1, 2, "", methodObject ), - new Among ( "abil", -1, 1, "", methodObject ), - new Among ( "iv", -1, 3, "", methodObject ) + new Among ( "ic", -1, 2, "", METHOD_OBJECT), + new Among ( "abil", -1, 1, "", METHOD_OBJECT), + new Among ( "iv", -1, 3, "", METHOD_OBJECT) }; private final static Among a_4[] = { - new Among ( "iqUe", -1, 1, "", methodObject ), - new Among ( "atrice", -1, 2, "", methodObject ), - new Among ( "ance", -1, 1, "", methodObject ), - new Among ( "ence", -1, 5, "", methodObject ), - new Among ( "logie", -1, 3, "", methodObject ), - new Among ( "able", -1, 1, "", methodObject ), - new Among ( "isme", -1, 1, "", methodObject ), - new Among ( "euse", -1, 11, "", methodObject ), - new Among ( "iste", -1, 1, "", methodObject ), - new Among ( "ive", -1, 8, "", methodObject ), - new Among ( "if", -1, 8, "", methodObject ), - new Among ( "usion", -1, 4, "", methodObject ), - new Among ( "ation", -1, 2, "", methodObject ), - new Among ( "ution", -1, 4, "", methodObject ), - new Among ( "ateur", -1, 2, "", methodObject ), - new Among ( "iqUes", -1, 1, "", methodObject ), - new Among ( "atrices", -1, 2, "", methodObject ), - new Among ( "ances", -1, 1, "", methodObject ), - new Among ( "ences", -1, 5, "", methodObject ), - new Among ( "logies", -1, 3, "", methodObject ), - new Among ( "ables", -1, 1, "", methodObject ), - new Among ( "ismes", -1, 1, "", methodObject ), - new Among ( "euses", -1, 11, "", methodObject ), - new Among ( "istes", -1, 1, "", methodObject ), - new Among ( "ives", -1, 8, "", methodObject ), - new Among ( "ifs", -1, 8, "", methodObject ), - new Among ( "usions", -1, 4, "", methodObject ), - new Among ( "ations", -1, 2, "", methodObject ), - new Among ( "utions", -1, 4, "", methodObject ), - new Among ( "ateurs", -1, 2, "", methodObject ), - new Among ( "ments", -1, 15, "", methodObject ), - new Among ( "ements", 30, 6, "", methodObject ), - new Among ( "issements", 31, 12, "", methodObject ), - new Among ( "it\u00E9s", -1, 7, "", methodObject ), - new Among ( "ment", -1, 15, "", methodObject ), - new Among ( "ement", 34, 6, "", methodObject ), - new Among ( "issement", 35, 12, "", methodObject ), - new Among ( "amment", 34, 13, "", methodObject ), - new Among ( "emment", 34, 14, "", methodObject ), - new Among ( "aux", -1, 10, "", methodObject ), - new Among ( "eaux", 39, 9, "", methodObject ), - new Among ( "eux", -1, 1, "", methodObject ), - new Among ( "it\u00E9", -1, 7, "", methodObject ) + new Among ( "iqUe", -1, 1, "", METHOD_OBJECT), + new Among ( "atrice", -1, 2, "", METHOD_OBJECT), + new Among ( "ance", -1, 1, "", METHOD_OBJECT), + new Among ( "ence", -1, 5, "", METHOD_OBJECT), + new Among ( "logie", -1, 3, "", METHOD_OBJECT), + new Among ( "able", -1, 1, "", METHOD_OBJECT), + new Among ( "isme", -1, 1, "", METHOD_OBJECT), + new Among ( "euse", -1, 11, "", METHOD_OBJECT), + new Among ( "iste", -1, 1, "", METHOD_OBJECT), + new Among ( "ive", -1, 8, "", METHOD_OBJECT), + new Among ( "if", -1, 8, "", METHOD_OBJECT), + new Among ( "usion", -1, 4, "", METHOD_OBJECT), + new Among ( "ation", -1, 2, "", METHOD_OBJECT), + new Among ( "ution", -1, 4, "", METHOD_OBJECT), + new Among ( "ateur", -1, 2, "", METHOD_OBJECT), + new Among ( "iqUes", -1, 1, "", METHOD_OBJECT), + new Among ( "atrices", -1, 2, "", METHOD_OBJECT), + new Among ( "ances", -1, 1, "", METHOD_OBJECT), + new Among ( "ences", -1, 5, "", METHOD_OBJECT), + new Among ( "logies", -1, 3, "", METHOD_OBJECT), + new Among ( "ables", -1, 1, "", METHOD_OBJECT), + new Among ( "ismes", -1, 1, "", METHOD_OBJECT), + new Among ( "euses", -1, 11, "", METHOD_OBJECT), + new Among ( "istes", -1, 1, "", METHOD_OBJECT), + new Among ( "ives", -1, 8, "", METHOD_OBJECT), + new Among ( "ifs", -1, 8, "", METHOD_OBJECT), + new Among ( "usions", -1, 4, "", METHOD_OBJECT), + new Among ( "ations", -1, 2, "", METHOD_OBJECT), + new Among ( "utions", -1, 4, "", METHOD_OBJECT), + new Among ( "ateurs", -1, 2, "", METHOD_OBJECT), + new Among ( "ments", -1, 15, "", METHOD_OBJECT), + new Among ( "ements", 30, 6, "", METHOD_OBJECT), + new Among ( "issements", 31, 12, "", METHOD_OBJECT), + new Among ( "it\u00E9s", -1, 7, "", METHOD_OBJECT), + new Among ( "ment", -1, 15, "", METHOD_OBJECT), + new Among ( "ement", 34, 6, "", METHOD_OBJECT), + new Among ( "issement", 35, 12, "", METHOD_OBJECT), + new Among ( "amment", 34, 13, "", METHOD_OBJECT), + new Among ( "emment", 34, 14, "", METHOD_OBJECT), + new Among ( "aux", -1, 10, "", METHOD_OBJECT), + new Among ( "eaux", 39, 9, "", METHOD_OBJECT), + new Among ( "eux", -1, 1, "", METHOD_OBJECT), + new Among ( "it\u00E9", -1, 7, "", METHOD_OBJECT) }; private final static Among a_5[] = { - new Among ( "ira", -1, 1, "", methodObject ), - new Among ( "ie", -1, 1, "", methodObject ), - new Among ( "isse", -1, 1, "", methodObject ), - new Among ( "issante", -1, 1, "", methodObject ), - new Among ( "i", -1, 1, "", methodObject ), - new Among ( "irai", 4, 1, "", methodObject ), - new Among ( "ir", -1, 1, "", methodObject ), - new Among ( "iras", -1, 1, "", methodObject ), - new Among ( "ies", -1, 1, "", methodObject ), - new Among ( "\u00EEmes", -1, 1, "", methodObject ), - new Among ( "isses", -1, 1, "", methodObject ), - new Among ( "issantes", -1, 1, "", methodObject ), - new Among ( "\u00EEtes", -1, 1, "", methodObject ), - new Among ( "is", -1, 1, "", methodObject ), - new Among ( "irais", 13, 1, "", methodObject ), - new Among ( "issais", 13, 1, "", methodObject ), - new Among ( "irions", -1, 1, "", methodObject ), - new Among ( "issions", -1, 1, "", methodObject ), - new Among ( "irons", -1, 1, "", methodObject ), - new Among ( "issons", -1, 1, "", methodObject ), - new Among ( "issants", -1, 1, "", methodObject ), - new Among ( "it", -1, 1, "", methodObject ), - new Among ( "irait", 21, 1, "", methodObject ), - new Among ( "issait", 21, 1, "", methodObject ), - new Among ( "issant", -1, 1, "", methodObject ), - new Among ( "iraIent", -1, 1, "", methodObject ), - new Among ( "issaIent", -1, 1, "", methodObject ), - new Among ( "irent", -1, 1, "", methodObject ), - new Among ( "issent", -1, 1, "", methodObject ), - new Among ( "iront", -1, 1, "", methodObject ), - new Among ( "\u00EEt", -1, 1, "", methodObject ), - new Among ( "iriez", -1, 1, "", methodObject ), - new Among ( "issiez", -1, 1, "", methodObject ), - new Among ( "irez", -1, 1, "", methodObject ), - new Among ( "issez", -1, 1, "", methodObject ) + new Among ( "ira", -1, 1, "", METHOD_OBJECT), + new Among ( "ie", -1, 1, "", METHOD_OBJECT), + new Among ( "isse", -1, 1, "", METHOD_OBJECT), + new Among ( "issante", -1, 1, "", METHOD_OBJECT), + new Among ( "i", -1, 1, "", METHOD_OBJECT), + new Among ( "irai", 4, 1, "", METHOD_OBJECT), + new Among ( "ir", -1, 1, "", METHOD_OBJECT), + new Among ( "iras", -1, 1, "", METHOD_OBJECT), + new Among ( "ies", -1, 1, "", METHOD_OBJECT), + new Among ( "\u00EEmes", -1, 1, "", METHOD_OBJECT), + new Among ( "isses", -1, 1, "", METHOD_OBJECT), + new Among ( "issantes", -1, 1, "", METHOD_OBJECT), + new Among ( "\u00EEtes", -1, 1, "", METHOD_OBJECT), + new Among ( "is", -1, 1, "", METHOD_OBJECT), + new Among ( "irais", 13, 1, "", METHOD_OBJECT), + new Among ( "issais", 13, 1, "", METHOD_OBJECT), + new Among ( "irions", -1, 1, "", METHOD_OBJECT), + new Among ( "issions", -1, 1, "", METHOD_OBJECT), + new Among ( "irons", -1, 1, "", METHOD_OBJECT), + new Among ( "issons", -1, 1, "", METHOD_OBJECT), + new Among ( "issants", -1, 1, "", METHOD_OBJECT), + new Among ( "it", -1, 1, "", METHOD_OBJECT), + new Among ( "irait", 21, 1, "", METHOD_OBJECT), + new Among ( "issait", 21, 1, "", METHOD_OBJECT), + new Among ( "issant", -1, 1, "", METHOD_OBJECT), + new Among ( "iraIent", -1, 1, "", METHOD_OBJECT), + new Among ( "issaIent", -1, 1, "", METHOD_OBJECT), + new Among ( "irent", -1, 1, "", METHOD_OBJECT), + new Among ( "issent", -1, 1, "", METHOD_OBJECT), + new Among ( "iront", -1, 1, "", METHOD_OBJECT), + new Among ( "\u00EEt", -1, 1, "", METHOD_OBJECT), + new Among ( "iriez", -1, 1, "", METHOD_OBJECT), + new Among ( "issiez", -1, 1, "", METHOD_OBJECT), + new Among ( "irez", -1, 1, "", METHOD_OBJECT), + new Among ( "issez", -1, 1, "", METHOD_OBJECT) }; private final static Among a_6[] = { - new Among ( "a", -1, 3, "", methodObject ), - new Among ( "era", 0, 2, "", methodObject ), - new Among ( "asse", -1, 3, "", methodObject ), - new Among ( "ante", -1, 3, "", methodObject ), - new Among ( "\u00E9e", -1, 2, "", methodObject ), - new Among ( "ai", -1, 3, "", methodObject ), - new Among ( "erai", 5, 2, "", methodObject ), - new Among ( "er", -1, 2, "", methodObject ), - new Among ( "as", -1, 3, "", methodObject ), - new Among ( "eras", 8, 2, "", methodObject ), - new Among ( "\u00E2mes", -1, 3, "", methodObject ), - new Among ( "asses", -1, 3, "", methodObject ), - new Among ( "antes", -1, 3, "", methodObject ), - new Among ( "\u00E2tes", -1, 3, "", methodObject ), - new Among ( "\u00E9es", -1, 2, "", methodObject ), - new Among ( "ais", -1, 3, "", methodObject ), - new Among ( "erais", 15, 2, "", methodObject ), - new Among ( "ions", -1, 1, "", methodObject ), - new Among ( "erions", 17, 2, "", methodObject ), - new Among ( "assions", 17, 3, "", methodObject ), - new Among ( "erons", -1, 2, "", methodObject ), - new Among ( "ants", -1, 3, "", methodObject ), - new Among ( "\u00E9s", -1, 2, "", methodObject ), - new Among ( "ait", -1, 3, "", methodObject ), - new Among ( "erait", 23, 2, "", methodObject ), - new Among ( "ant", -1, 3, "", methodObject ), - new Among ( "aIent", -1, 3, "", methodObject ), - new Among ( "eraIent", 26, 2, "", methodObject ), - new Among ( "\u00E8rent", -1, 2, "", methodObject ), - new Among ( "assent", -1, 3, "", methodObject ), - new Among ( "eront", -1, 2, "", methodObject ), - new Among ( "\u00E2t", -1, 3, "", methodObject ), - new Among ( "ez", -1, 2, "", methodObject ), - new Among ( "iez", 32, 2, "", methodObject ), - new Among ( "eriez", 33, 2, "", methodObject ), - new Among ( "assiez", 33, 3, "", methodObject ), - new Among ( "erez", 32, 2, "", methodObject ), - new Among ( "\u00E9", -1, 2, "", methodObject ) + new Among ( "a", -1, 3, "", METHOD_OBJECT), + new Among ( "era", 0, 2, "", METHOD_OBJECT), + new Among ( "asse", -1, 3, "", METHOD_OBJECT), + new Among ( "ante", -1, 3, "", METHOD_OBJECT), + new Among ( "\u00E9e", -1, 2, "", METHOD_OBJECT), + new Among ( "ai", -1, 3, "", METHOD_OBJECT), + new Among ( "erai", 5, 2, "", METHOD_OBJECT), + new Among ( "er", -1, 2, "", METHOD_OBJECT), + new Among ( "as", -1, 3, "", METHOD_OBJECT), + new Among ( "eras", 8, 2, "", METHOD_OBJECT), + new Among ( "\u00E2mes", -1, 3, "", METHOD_OBJECT), + new Among ( "asses", -1, 3, "", METHOD_OBJECT), + new Among ( "antes", -1, 3, "", METHOD_OBJECT), + new Among ( "\u00E2tes", -1, 3, "", METHOD_OBJECT), + new Among ( "\u00E9es", -1, 2, "", METHOD_OBJECT), + new Among ( "ais", -1, 3, "", METHOD_OBJECT), + new Among ( "erais", 15, 2, "", METHOD_OBJECT), + new Among ( "ions", -1, 1, "", METHOD_OBJECT), + new Among ( "erions", 17, 2, "", METHOD_OBJECT), + new Among ( "assions", 17, 3, "", METHOD_OBJECT), + new Among ( "erons", -1, 2, "", METHOD_OBJECT), + new Among ( "ants", -1, 3, "", METHOD_OBJECT), + new Among ( "\u00E9s", -1, 2, "", METHOD_OBJECT), + new Among ( "ait", -1, 3, "", METHOD_OBJECT), + new Among ( "erait", 23, 2, "", METHOD_OBJECT), + new Among ( "ant", -1, 3, "", METHOD_OBJECT), + new Among ( "aIent", -1, 3, "", METHOD_OBJECT), + new Among ( "eraIent", 26, 2, "", METHOD_OBJECT), + new Among ( "\u00E8rent", -1, 2, "", METHOD_OBJECT), + new Among ( "assent", -1, 3, "", METHOD_OBJECT), + new Among ( "eront", -1, 2, "", METHOD_OBJECT), + new Among ( "\u00E2t", -1, 3, "", METHOD_OBJECT), + new Among ( "ez", -1, 2, "", METHOD_OBJECT), + new Among ( "iez", 32, 2, "", METHOD_OBJECT), + new Among ( "eriez", 33, 2, "", METHOD_OBJECT), + new Among ( "assiez", 33, 3, "", METHOD_OBJECT), + new Among ( "erez", 32, 2, "", METHOD_OBJECT), + new Among ( "\u00E9", -1, 2, "", METHOD_OBJECT) }; private final static Among a_7[] = { - new Among ( "e", -1, 3, "", methodObject ), - new Among ( "I\u00E8re", 0, 2, "", methodObject ), - new Among ( "i\u00E8re", 0, 2, "", methodObject ), - new Among ( "ion", -1, 1, "", methodObject ), - new Among ( "Ier", -1, 2, "", methodObject ), - new Among ( "ier", -1, 2, "", methodObject ), - new Among ( "\u00EB", -1, 4, "", methodObject ) + new Among ( "e", -1, 3, "", METHOD_OBJECT), + new Among ( "I\u00E8re", 0, 2, "", METHOD_OBJECT), + new Among ( "i\u00E8re", 0, 2, "", METHOD_OBJECT), + new Among ( "ion", -1, 1, "", METHOD_OBJECT), + new Among ( "Ier", -1, 2, "", METHOD_OBJECT), + new Among ( "ier", -1, 2, "", METHOD_OBJECT), + new Among ( "\u00EB", -1, 4, "", METHOD_OBJECT) }; private final static Among a_8[] = { - new Among ( "ell", -1, -1, "", methodObject ), - new Among ( "eill", -1, -1, "", methodObject ), - new Among ( "enn", -1, -1, "", methodObject ), - new Among ( "onn", -1, -1, "", methodObject ), - new Among ( "ett", -1, -1, "", methodObject ) + new Among ( "ell", -1, -1, "", METHOD_OBJECT), + new Among ( "eill", -1, -1, "", METHOD_OBJECT), + new Among ( "enn", -1, -1, "", METHOD_OBJECT), + new Among ( "onn", -1, -1, "", METHOD_OBJECT), + new Among ( "ett", -1, -1, "", METHOD_OBJECT) }; private static final char g_v[] = {17, 65, 16, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 128, 130, 103, 8, 5 }; @@ -194,7 +194,7 @@ private static final long serialVersionUID = 1L; private int I_p1; private int I_pV; - private void copy_from(frenchStemmer other) { + private void copy_from(FrenchStemmer other) { I_p2 = other.I_p2; I_p1 = other.I_p1; I_pV = other.I_pV; @@ -1507,11 +1507,11 @@ private static final long serialVersionUID = 1L; } public boolean equals( Object o ) { - return o instanceof frenchStemmer; + return o instanceof FrenchStemmer; } public int hashCode() { - return frenchStemmer.class.getName().hashCode(); + return FrenchStemmer.class.getName().hashCode(); } diff --git a/xsl/webhelp/template/common/images/gov-header-bg.gif b/xsl/webhelp/template/common/images/header-bg.gif similarity index 100% rename from xsl/webhelp/template/common/images/gov-header-bg.gif rename to xsl/webhelp/template/common/images/header-bg.gif diff --git a/xsl/webhelp/template/common/images/highlight-blue.gif b/xsl/webhelp/template/common/images/highlight-blue.gif new file mode 100644 index 0000000000000000000000000000000000000000..4fdabde69252ba43e8b658c74a72775caaf304f4 GIT binary patch literal 471 zcmZ?wbhEHb)L_tNI3mh0b>*rn_bz?>^5NvQ+|z#z(?15yRD zlY#Yrf@+_NCr7*Rnu7}!QgzNx-55}@F4$S|f`i~ggBz0NjqwI*2{Q9{vnE;=IjrB+ zDCD7$u;0?zfI%<#i1GTJdsHRM1Uh5*=AKsEeQ?SYjaMqdjg0|KjY4kRs^-=Wtu3wX zZqXd>y-od16RmlavwB*EXSDg-v-nSI_Fpi|n$>jqfB1PxT?^`t?fQfg*PWROT z?4=7Bj-NPj$bb8e1?SIiu}ay0;o4&VD^suExnz0EZN;6_?hnqjm6bhlW2}3)ao5bl z&nzEpx@W<5 literal 0 HcmV?d00001 diff --git a/xsl/webhelp/template/common/images/highlight-yellow.gif b/xsl/webhelp/template/common/images/highlight-yellow.gif new file mode 100644 index 0000000000000000000000000000000000000000..3e847e7e01623b08e9a7e7d823fcb97104e76194 GIT binary patch literal 331 zcmV-R0kr-{Nk%w1VJrYF0HOc@`{bSf|Nr^cZTH48_|8oJ^wtd=`2yXqN@dAW)FA{*9 z0^lo>f-$0TSwVV~Os3NTjY>HMo^?sUEofV-pivzGA!zUjgNA5FQ#EN!ZcP>Pwa!Lg z*&TUsLwr0%X>V dfG2i6;P)aBqN^Dbjd|Rq0Yr!k5L_hy06YHIk5T{t literal 0 HcmV?d00001 diff --git a/xsl/webhelp/template/content/search/stemmers/fr_stemmer.js b/xsl/webhelp/template/content/search/stemmers/fr_stemmer.js new file mode 100644 index 000000000..26b89c8f6 --- /dev/null +++ b/xsl/webhelp/template/content/search/stemmers/fr_stemmer.js @@ -0,0 +1,299 @@ +/* + * Author: Kasun Gajasinghe + * E-Mail: kasunbg@gmail.com + * Date: 09.08.2010 + * + * usage: stemmer(word); + * ex: var stem = stemmer(foobar); + * Implementation of the stemming algorithm from http://snowball.tartarus.org/algorithms/french/stemmer.html + * + * LICENSE: + * + * Copyright (c) 2010, Kasun Gajasinghe. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * + * THIS SOFTWARE IS PROVIDED BY KASUN GAJASINGHE ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL KASUN GAJASINGHE BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR + * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE + * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +var stemmer = function(word){ +// Letters in French include the following accented forms, +// â à ç ë é ê è ï î ô û ù +// The following letters are vowels: +// a e i o u y â à ë é ê è ï î ô û ù + + word = word.toLowerCase(); + var oriWord = word; + word = word.replace(/qu/g, 'qU'); //have to perform first, as after the operation, capital U is not treated as a vowel + word = word.replace(/([aeiouyâàëéêèïîôûù])u([aeiouyâàëéêèïîôûù])/g, '$1U$2'); + word = word.replace(/([aeiouyâàëéêèïîôûù])i([aeiouyâàëéêèïîôûù])/g, '$1I$2'); + word = word.replace(/([aeiouyâàëéêèïîôûù])y/g, '$1Y'); + word = word.replace(/y([aeiouyâàëéêèïîôûù])/g, 'Y$1'); + + var rv=''; + var rvIndex = -1; + if(word.search(/^(par|col|tap)/) != -1 || word.search(/^[aeiouyâàëéêèïîôûù]{2}/) != -1){ + rv = word.substring(3); + rvIndex = 3; + } else { + rvIndex = word.substring(1).search(/[aeiouyâàëéêèïîôûù]/); + if(rvIndex != -1){ + rvIndex +=2; //+2 is to supplement the substring(1) used to find rvIndex + rv = word.substring(rvIndex); + } else { + rvIndex = word.length; + } + } + +// R1 is the region after the first non-vowel following a vowel, or the end of the word if there is no such non-vowel. +// R2 is the region after the first non-vowel following a vowel in R1, or the end of the word if there is no such non-vowel + var r1Index = word.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/); + var r1 = ''; + if (r1Index != -1) { + r1Index += 2; + r1 = word.substring(r1Index); + } else { + r1Index = word.length; + } + + var r2Index = -1; + var r2 = ''; + if (r1Index != -1) { + r2Index = r1.search(/[aeiouyâàëéêèïîôûù][^aeiouyâàëéêèïîôûù]/); + if (r2Index != -1) { + r2Index += 2; + r2 = r1.substring(r2Index); + r2Index += r1Index; + } else { + r2 = ''; + r2Index = word.length; + } + } + if (r1Index != -1 && r1Index < 3) { + r1Index = 3; + r1 = word.substring(r1Index); + } + + /* + Step 1: Standard suffix removal + */ + var a1Index = word.search(/(ance|iqUe|isme|able|iste|eux|ances|iqUes|ismes|ables|istes)$/); + var a2Index = word.search(/(atrice|ateur|ation|atrices|ateurs|ations)$/); + var a3Index = word.search(/(logie|logies)$/); + var a4Index = word.search(/(usion|ution|usions|utions)$/); + var a5Index = word.search(/(ence|ences)$/); + var a6Index = word.search(/(ement|ements)$/); + var a7Index = word.search(/(ité|ités)$/); + var a8Index = word.search(/(if|ive|ifs|ives)$/); + var a9Index = word.search(/(eaux)$/); + var a10Index = word.search(/(aux)$/); + var a11Index = word.search(/(euse|euses)$/); + var a12Index = word.search(/[^aeiouyâàëéêèïîôûù](issement|issements)$/); + var a13Index = word.search(/(amment)$/); + var a14Index = word.search(/(emment)$/); + var a15Index = word.search(/[aeiouyâàëéêèïîôûù](ment|ments)$/); + + if(a1Index != -1 && a1Index >= r2Index){ + word = word.substring(0,a1Index); + } else if(a2Index != -1 && a2Index >= r2Index){ + word = word.substring(0,a2Index); + var a2Index2 = word.search(/(ic)$/); + if(a2Index2 != -1 && a2Index2 >= r2Index){ + word = word.substring(0, a2Index2); //if preceded by ic, delete if in R2, + } else { //else replace by iqU + word = word.replace(/(ic)$/,'iqU'); + } + } else if(a3Index != -1 && a3Index >= r2Index){ + word = word.replace(/(logie|logies)$/,'log'); //replace with log if in R2 + } else if(a4Index != -1 && a4Index >= r2Index){ + word = word.replace(/(usion|ution|usions|utions)$/,'u'); //replace with u if in R2 + } else if(a5Index != -1 && a5Index >= r2Index){ + word = word.replace(/(ence|ences)$/,'ent'); //replace with ent if in R2 + } else if(a6Index != -1 && a6Index >= rvIndex){ + word = word.substring(0,a6Index); + if(word.search(/(iv)$/) >= r2Index){ + word = word.replace(/(iv)$/, ''); + if(word.search(/(at)$/) >= r2Index){ + word = word.replace(/(at)$/, ''); + } + } else if(word.search(/(eus)$/) != -1){ + var a6Index2 = word.search(/(eus)$/); + if(a6Index2 >=r2Index){ + word = word.substring(0, a6Index2); + } else if(a6Index2 >= r1Index){ + word = word.substring(0,a6Index2)+"eux"; + } + } else if(word.search(/(abl|iqU)$/) >= r2Index){ + word = word.replace(/(abl|iqU)$/,''); //if preceded by abl or iqU, delete if in R2, + } else if(word.search(/(ièr|Ièr)$/) >= rvIndex){ + word = word.replace(/(ièr|Ièr)$/,'i'); //if preceded by abl or iqU, delete if in R2, + } + } else if(a7Index != -1 && a7Index >= r2Index){ + word = word.substring(0,a7Index); //delete if in R2 + if(word.search(/(abil)$/) != -1){ //if preceded by abil, delete if in R2, else replace by abl, otherwise, + var a7Index2 = word.search(/(abil)$/); + if(a7Index2 >=r2Index){ + word = word.substring(0, a7Index2); + } else { + word = word.substring(0,a7Index2)+"abl"; + } + } else if(word.search(/(ic)$/) != -1){ + var a7Index3 = word.search(/(ic)$/); + if(a7Index3 != -1 && a7Index3 >= r2Index){ + word = word.substring(0, a7Index3); //if preceded by ic, delete if in R2, + } else { //else replace by iqU + word = word.replace(/(ic)$/,'iqU'); + } + } else if(word.search(/(iv)$/) != r2Index){ + word = word.replace(/(iv)$/,''); + } + } else if(a8Index != -1 && a8Index >= r2Index){ + word = word.substring(0,a8Index); + if(word.search(/(at)$/) >= r2Index){ + word = word.replace(/(at)$/, ''); + if(word.search(/(ic)$/) >= r2Index){ + word = word.replace(/(ic)$/, ''); + } else { word = word.replace(/(ic)$/, 'iqU'); } + } + } else if(a9Index != -1){ word = word.replace(/(eaux)/,'eau') + } else if(a10Index >= r1Index){ word = word.replace(/(aux)/,'al') + } else if(a11Index != -1 ){ + var a11Index2 = word.search(/(euse|euses)$/); + if(a11Index2 >=r2Index){ + word = word.substring(0, a11Index2); + } else if(a11Index2 >= r1Index){ + word = word.substring(0, a11Index2)+"eux"; + } + } else if(a12Index!=-1 && a12Index>=r1Index){ + word = word.substring(0,a12Index+1); //+1- amendment to non-vowel + } else if(a13Index!=-1 && a13Index>=rvIndex){ + word = word.replace(/(amment)$/,'ant'); + } else if(a14Index!=-1 && a14Index>=rvIndex){ + word = word.replace(/(emment)$/,'ent'); + } else if(a15Index!=-1 && a15Index>=rvIndex){ + word = word.substring(0,a15Index+1); + } + + /* Step 2a: Verb suffixes beginning i*/ + var wordStep1 = word; + var step2aDone = false; + if(oriWord == word.toLowerCase() || oriWord.search(/(amment|emment|ment|ments)$/) != -1){ + step2aDone = true; + var b1Regex = /([^aeiouyâàëéêèïîôûù])(îmes|ît|îtes|i|ie|ies|ir|ira|irai|iraIent|irais|irait|iras|irent|irez|iriez|irions|irons|iront|is|issaIent|issais|issait|issant|issante|issantes|issants|isse|issent|isses|issez|issiez|issions|issons|it)$/i; + if(word.search(b1Regex) >= rvIndex){ + word = word.replace(b1Regex,'$1'); + } + } + + /* Step 2b: Other verb suffixes*/ + if (step2aDone && wordStep1 == word) { + if (word.search(/(ions)$/) >= r2Index) { + word = word.replace(/(ions)$/, ''); + } else { + var b2Regex = /(é|ée|ées|és|èrent|er|era|erai|eraIent|erais|erait|eras|erez|eriez|erions|erons|eront|ez|iez)$/i; + if (word.search(b2Regex) >= rvIndex) { + word = word.replace(b2Regex, ''); + } else { + var b3Regex = /e(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i; + if (word.search(b3Regex) >= rvIndex) { + word = word.replace(b3Regex, ''); + } else { + var b3Regex2 = /(âmes|ât|âtes|a|ai|aIent|ais|ait|ant|ante|antes|ants|as|asse|assent|asses|assiez|assions)$/i; + if (word.search(b3Regex2) >= rvIndex) { + word = word.replace(b3Regex2, ''); + } + } + } + } + } + + if(oriWord != word.toLowerCase()){ + /* Step 3 */ + var rep = ''; + if(word.search(/Y$/) != -1) { + word = word.replace(/Y$/, 'i'); + } else if(word.search(/ç$/) != -1){ + word = word.replace(/ç$/, 'c'); + } + } else { + /* Step 4 */ + //If the word ends s, not preceded by a, i, o, u, è or s, delete it. + if (word.search(/([^aiouès])s$/) >= rvIndex) { + word = word.replace(/([^aiouès])s$/, '$1'); + } + var e1Index = word.search(/ion$/); + if (e1Index >= r2Index && word.search(/[st]ion$/) >= rvIndex) { + word = word.substring(0, e1Index); + } else { + var e2Index = word.search(/(ier|ière|Ier|Ière)$/); + if (e2Index != -1 && e2Index >= rvIndex) { + word = word.substring(0, e2Index) + "i"; + } else { + if (word.search(/e$/) >= rvIndex) { + word = word.replace(/e$/, ''); //delete last e + } else if (word.search(/guë$/) >= rvIndex) { + word = word.replace(/guë$/, 'gu'); + } + } + } + } + + /* Step 5: Undouble */ + //word = word.replace(/(en|on|et|el|eil)(n|t|l)$/,'$1'); + word = word.replace(/(en|on)(n)$/,'$1'); + word = word.replace(/(ett)$/,'et'); + word = word.replace(/(el|eil)(l)$/,'$1'); + + /* Step 6: Un-accent */ + word = word.replace(/[éè]([^aeiouyâàëéêèïîôûù]+)$/,'e$1'); + word = word.toLowerCase(); + return word; +}; + +var eqOut = new Array(); +var noteqOut = new Array(); +var eqCount = 0; +/* +To test the stemming, create two arrays named "voc" and "COut" which are for vocabualary and the stemmed output. +Then add the vocabulary strings and output strings. This method will generate the stemmed output for "voc" and will +compare the output with COut. + (I used porter's voc and out files and did a regex to convert them to js objects. regex: /");\nvoc.push("/g . This + will add strings to voc array such that output would look like: voc.push("foobar"); ) drop me an email for any help. + */ +function testFr(){ + var start = new Date().getTime(); //execution time + eqCount = 0; + eqOut = new Array(); + noteqOut = new Array(); + for(var k=0;k