]> granicus.if.org Git - docbook-dsssl/commitdiff
Added Chinese, Japanese, Korean (CJK) support for better searching. Uses Lucene's...
authorKasun Gajasinghe <kasunbg@gmail.com>
Sat, 7 Aug 2010 09:08:00 +0000 (09:08 +0000)
committerKasun Gajasinghe <kasunbg@gmail.com>
Sat, 7 Aug 2010 09:08:00 +0000 (09:08 +0000)
New classpath attr in build.xml to reflect new additions.
A few fixes to client-side JS files for better handling of the new languages (no-stem languages)

xsl/webhelp/build.xml
xsl/webhelp/indexer/lib/lucene-analyzers-3.0.0.jar [new file with mode: 0644]
xsl/webhelp/indexer/lib/lucene-core-3.0.0.jar [new file with mode: 0644]
xsl/webhelp/indexer/lib/nw-cms.jar
xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java
xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java
xsl/webhelp/template/common/main.js
xsl/webhelp/template/content/search/nwSearchFnt.js

index ac067b22b185897f58e58d88771067b0ace7dd94..6d689702ac2bb9bd38c7c0176c381a4753394dfa 100755 (executable)
   <property environment="env"/>
   <property name="ant.jar" value="${env.ANT_HOME}/lib/ant.jar"/>
 
+    <path id="classpath">
+        <pathelement location="${ant.file.dir}/indexer/lib/nw-cms.jar"/>
+        <pathelement location="${ant.file.dir}/indexer/lib/lucene-analyzers-3.0.0.jar"/>
+        <pathelement location="${ant.file.dir}/indexer/lib/lucene-core-3.0.0.jar"/>
+        <pathelement path="${ant.jar}"/>  
+    </path>
+
   <condition property="perform-validation">
        <equals arg1="${validate}" arg2="true"/>
   </condition>
                <include name="**/content/search/stemmers/${indexer-language}_stemmer.js"/>
          </fileset>
        </copy>
-
-       <path id="nw-cms.jar.path">
-         <pathelement location="${ant.file.dir}/indexer/lib/nw-cms.jar"/>
-       </path>
-
        <taskdef name="indexertask"
          classname="com.nexwave.nquindexer.IndexerTask">
-         <classpath refid="nw-cms.jar.path"/>
+         <classpath refid="classpath"/>
        </taskdef>
 
        <echo>Indexing html files in ${output-dir}/content</echo>
 
   <target name="build-indexer">
 
+    <mkdir dir="indexer/lib/htmlsearch"/>  
        <javac
          srcdir="indexer/src"
-         destdir="indexer/lib"
-         includes="com/nexwave/nsidita/*.java com/nexwave/nquindexer/*.java"
-         classpath="${ant.jar}"/>
+         destdir="indexer/lib/htmlsearch"
+         includes="com/nexwave/nsidita/*.java com/nexwave/nquindexer/*.java">
+        <classpath refid="classpath"/>    
+    </javac>
 
        <jar
          destfile="indexer/lib/nw-cms.jar"
-         basedir="indexer/lib"
+         basedir="indexer/lib/htmlsearch"
          includes="com/**"/>
 
-       <delete dir="indexer/lib/com"/>
+       <delete dir="indexer/lib/htmlsearch"/>
 
   </target>
 
diff --git a/xsl/webhelp/indexer/lib/lucene-analyzers-3.0.0.jar b/xsl/webhelp/indexer/lib/lucene-analyzers-3.0.0.jar
new file mode 100644 (file)
index 0000000..9f26ecf
Binary files /dev/null and b/xsl/webhelp/indexer/lib/lucene-analyzers-3.0.0.jar differ
diff --git a/xsl/webhelp/indexer/lib/lucene-core-3.0.0.jar b/xsl/webhelp/indexer/lib/lucene-core-3.0.0.jar
new file mode 100644 (file)
index 0000000..38d78c3
Binary files /dev/null and b/xsl/webhelp/indexer/lib/lucene-core-3.0.0.jar differ
index fb52693bb53917e8191954dc917af92af9f7a8f8..886f2eb30dad8d58b9b94770b062d19b7fa2404b 100755 (executable)
Binary files a/xsl/webhelp/indexer/lib/nw-cms.jar and b/xsl/webhelp/indexer/lib/nw-cms.jar differ
index 416e27fd3010e9a2c3c86a98a7299724b0960055..2a347d3f5fb619242952c96cd6cf211764825e74 100755 (executable)
@@ -55,7 +55,8 @@ public class IndexerTask{
 \r
     //supported languages: add new additions to this. don't include country codes to the end such as en_US or en_UK,\r
     // as stemmers doesn't find a difference between them.\r
-    private String[] supportedLanguages= {"en", "de"}; //currently English & German only. Chinese(cn) to be added shortly.\r
+    private String[] supportedLanguages= {"en", "de", "cn", "ja", "ko"}; //currently extended support available for\r
+                                                    // English, German, and CJK (Chinese, Japanese, Korean) languages only.\r
 \r
        // Indexing features: words to remove\r
        private ArrayList<String> cleanUpStrings = null;        \r
@@ -96,8 +97,9 @@ public class IndexerTask{
             \r
             //if not in supported language list,\r
             if(i>=supportedLanguages.length){\r
-                System.out.println("The given language, \""+indexerLanguage+"\", is not supported or specified in a bad format. " +\r
-                        "Check documentation for details. Language now defaults to english.");\r
+                System.out.println("The given language, \""+indexerLanguage+"\", does not have extensive support for " +\r
+                        "searching or language code is specified in a bad format. Check documentation for details. " +\r
+                        "Language now defaults to english.");\r
                 this.indexerLanguage = "en";\r
             } \r
         } else {\r
@@ -198,7 +200,7 @@ public class IndexerTask{
                \r
                // Retrieve the clean-up properties for indexing\r
                RetrieveCleanUpProps();\r
-           System.out.print("clean"+" " +cleanUpStrings);\r
+          // System.out.print("clean"+" " +cleanUpStrings);\r
            \r
                //create a default handler\r
                //SaxHTMLIndex spe = new SaxHTMLIndex (); // do not use clean-up props files\r
index 606dac5a27657014b39537c53d28a2a2d4faefdf..a83453d9615ae77d55e6343b8d2e8260b17b52e1 100755 (executable)
@@ -1,7 +1,10 @@
 package com.nexwave.nquindexer;\r
 \r
 import java.io.File;\r
+import java.io.IOException;\r
+import java.io.Reader;\r
 import java.util.*;\r
+import java.io.StringReader;\r
 \r
 // specific dita ot\r
 import com.nexwave.nsidita.DocFileInfo;\r
@@ -11,13 +14,21 @@ import com.nexwave.stemmer.snowball.SnowballStemmer;
 import com.nexwave.stemmer.snowball.ext.EnglishStemmer;\r
 import com.nexwave.stemmer.snowball.ext.GermanStemmer;\r
 \r
+//CJK Tokenizing\r
+import org.apache.lucene.analysis.Token;\r
+import org.apache.lucene.analysis.TokenStream;\r
+import org.apache.lucene.analysis.cjk.CJKAnalyzer;\r
+import org.apache.lucene.analysis.cjk.CJKTokenizer;\r
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;\r
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;\r
+\r
 \r
 /**\r
  * Parser for the html files generated by DITA-OT.\r
  * Extracts the title, the shortdesc and the text within the "content" div tag. <div id="content">\r
- * \r
+ *\r
  * @version 1.1 2010\r
- * \r
+ *\r
  * @author N. Quaine\r
  * @author Kasun Gajasinghe <http://kasunbg.blogspot.com>\r
  */\r
@@ -52,18 +63,18 @@ public class SaxHTMLIndex extends SaxDocFileParser{
                cleanUpList = cleanUpStrings;\r
                cleanUpPunctuation = cleanUpChars;\r
        }\r
-       \r
+\r
        /**\r
         * Initializer\r
         */\r
        public int init(Map<String,String> tempMap){\r
                tempDico = tempMap;\r
-               return 0;       \r
+               return 0;\r
        }\r
 \r
        /**\r
-        * Parses the file to extract all the words for indexing and \r
-        * some data characterizing the file. \r
+        * Parses the file to extract all the words for indexing and\r
+        * some data characterizing the file.\r
         * @param file contains the fullpath of the document to parse\r
      * @param indexerLanguage this will be used to tell the program which stemmer to be used.\r
         * @return a DitaFileInfo object filled with data describing the file\r
@@ -72,10 +83,10 @@ public class SaxHTMLIndex extends SaxDocFileParser{
                //initialization\r
                fileDesc = new DocFileInfo(file);\r
                strbf = new StringBuffer("");\r
-               \r
+\r
                // Fill strbf by parsing the file\r
                parseDocument(file);\r
-               \r
+\r
                String str = cleanBuffer(strbf);\r
         str = str.replaceAll("\\s+"," ");   //there's still redundant spaces in the middle\r
 //             System.out.println(file.toString()+" "+ str +"\n");\r
@@ -84,34 +95,60 @@ public class SaxHTMLIndex extends SaxDocFileParser{
         //get items one-by-one, tunnel through the stemmer, and get the stem.\r
         //Then, add them to tempSet\r
         //Do Stemming for words in items\r
-        //TODO currently, stemming support is for english only. Add support for other languages as well.\r
\r
-        SnowballStemmer stemmer;\r
-        if(indexerLanguage.equals("en")){\r
-             stemmer = new EnglishStemmer();\r
-        } else if (indexerLanguage.equals("de")){\r
-            stemmer= new GermanStemmer();\r
+        //TODO currently, stemming support is for english and german only. Add support for other languages as well.\r
+\r
+        String[] tokenizedItems;\r
+        if(indexerLanguage.equalsIgnoreCase("jp") || indexerLanguage.equalsIgnoreCase("cn")\r
+                || indexerLanguage.equalsIgnoreCase("ko")){\r
+                LinkedList<String> tokens = new LinkedList<String>();\r
+            try{\r
+                CJKAnalyzer analyzer = new CJKAnalyzer(org.apache.lucene.util.Version.LUCENE_30);\r
+                Reader reader = new StringReader(str);\r
+                TokenStream stream = analyzer.tokenStream("", reader);\r
+                TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);\r
+                OffsetAttribute offAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);\r
+\r
+                while (stream.incrementToken()) {\r
+                    String term = termAtt.term();\r
+                    tokens.add(term);\r
+//                    System.out.println(term + " " + offAtt.startOffset() + " " + offAtt.endOffset());\r
+                }\r
+\r
+                tokenizedItems = tokens.toArray(new String[tokens.size()]);\r
+\r
+            }catch (IOException ex){\r
+                tokenizedItems = items;\r
+                System.out.println("Error tokenizing content using CJK Analyzer. IOException");\r
+                ex.printStackTrace();\r
+            }\r
+\r
         } else {\r
-            stemmer = null;//Languages which stemming is not yet supproted.So, No stemmers will be used.\r
+            SnowballStemmer stemmer;\r
+            if(indexerLanguage.equalsIgnoreCase("en")){\r
+                 stemmer = new EnglishStemmer();\r
+            } else if (indexerLanguage.equalsIgnoreCase("de")){\r
+                stemmer= new GermanStemmer();\r
+            } else {\r
+                stemmer = null;//Languages which stemming is not yet supproted.So, No stemmers will be used.\r
+            }\r
+            if(stemmer != null)             //If a stemmer available\r
+                tokenizedItems = stemmer.doStem(items);\r
+            else                            //if no stemmer available for the particular language\r
+                tokenizedItems = items;\r
+\r
         }\r
 \r
-        String[] stemmedItems;\r
-        if(stemmer != null)             //If a stemmer available\r
-            stemmedItems  = stemmer.doStem(items);\r
-        else                            //if no stemmer available for the particular language\r
-            stemmedItems = items;\r
\r
-       /* for(String stemmedItem: stemmedItems){\r
+       /* for(String stemmedItem: tokenizedItems){\r
             System.out.print(stemmedItem+"| ");\r
         }*/\r
-        \r
+\r
                //items: remove the duplicated strings first\r
                HashSet <String> tempSet = new HashSet<String>();\r
-        tempSet.addAll(Arrays.asList(stemmedItems));\r
+        tempSet.addAll(Arrays.asList(tokenizedItems));\r
                Iterator it = tempSet.iterator();\r
                String s;\r
         while (it.hasNext()) {\r
-               s = (String)it.next(); \r
+               s = (String)it.next();\r
                if (tempDico.containsKey(s)) {\r
                        String temp = tempDico.get(s);\r
                        temp = temp.concat(",").concat(Integer.toString(i));\r
@@ -121,22 +158,22 @@ public class SaxHTMLIndex extends SaxDocFileParser{
                        tempDico.put(s, Integer.toString(i));\r
                }\r
         }\r
-        \r
+\r
         i++;\r
                return fileDesc;\r
        }\r
 \r
        /**\r
-        * Cleans the string buffer containing all the text retrieved from \r
-        * the html file:  remove punctuation, clean white spaces, remove the words \r
+        * Cleans the string buffer containing all the text retrieved from\r
+        * the html file:  remove punctuation, clean white spaces, remove the words\r
         * which you do not want to index.\r
         * NOTE: You may customize this function:\r
-        * This version takes into account english and japanese. Depending on your \r
-        * needs, \r
-        * you may have to add/remove some characters/words through props files \r
+        * This version takes into account english and japanese. Depending on your\r
+        * needs,\r
+        * you may have to add/remove some characters/words through props files\r
         *    or by modifying tte default code,\r
-        * you may want to separate the language processing (doc only in japanese, \r
-        * doc only in english, check the language metadata ...). \r
+        * you may want to separate the language processing (doc only in japanese,\r
+        * doc only in english, check the language metadata ...).\r
         */\r
        private String cleanBuffer (StringBuffer strbf) {\r
                String str = strbf.toString().toLowerCase();\r
@@ -144,7 +181,7 @@ public class SaxHTMLIndex extends SaxDocFileParser{
                StringBuffer tempCharBuf = new StringBuffer("");\r
                if ((cleanUpList == null) || (cleanUpList.isEmpty())){\r
                        // Default clean-up\r
-                       \r
+\r
                        // Should perhaps eliminate the words at the end of the table?\r
                        tempStrBuf.append("(?i)\\bthe\\b|\\ba\\b|\\ban\\b|\\bto\\b|\\band\\b|\\bor\\b");//(?i) ignores the case\r
                        tempStrBuf.append("|\\bis\\b|\\bare\\b|\\bin\\b|\\bwith\\b|\\bbe\\b|\\bcan\\b");\r
@@ -155,11 +192,11 @@ public class SaxHTMLIndex extends SaxDocFileParser{
             tempStrBuf.append("|\\bI\\b|\\bme\\b|\\bmy\\b");\r
 \r
                        str = str.replaceFirst("Copyright � 1998-2007 NexWave Solutions.", " ");\r
-                       \r
+\r
 \r
                        //nqu 25.01.2008 str = str.replaceAll("\\b.\\b|\\\\", " ");\r
                        // remove contiguous white charaters\r
-                       //nqu 25.01.2008 str = str.replaceAll("\\s+", " ");                     \r
+                       //nqu 25.01.2008 str = str.replaceAll("\\s+", " ");\r
                }else {\r
                        // Clean-up using the props files\r
                        tempStrBuf.append("\\ba\\b");\r
@@ -179,10 +216,10 @@ public class SaxHTMLIndex extends SaxDocFileParser{
                str = minimalClean(str, tempStrBuf, tempCharBuf);\r
                return str;\r
        }\r
-       \r
+\r
        private String minimalClean(String str, StringBuffer tempStrBuf, StringBuffer tempCharBuf) {\r
                String tempPunctuation = new String(tempCharBuf);\r
-               \r
+\r
                str = str.replaceAll("\\s+", " ");\r
                str = str.replaceAll("->", " ");\r
                str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " ");\r
@@ -197,13 +234,13 @@ public class SaxHTMLIndex extends SaxDocFileParser{
 \r
                //remove useless words\r
                str = str.replaceAll(tempStrBuf.toString(), " ");\r
-               \r
+\r
                // Redo punctuation after removing some words: (TODO: useful?)\r
                str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " ");\r
                str = str.replaceAll(IndexerConstants.EUPUNCTUATION2, " ");\r
                str = str.replaceAll(IndexerConstants.JPPUNCTUATION1, " ");\r
                str = str.replaceAll(IndexerConstants.JPPUNCTUATION2, " ");\r
-               str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " ");             \r
+               str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " ");\r
                if (tempPunctuation.length() > 0)\r
                {\r
                        str = str.replaceAll(tempPunctuation, " ");\r
index 45e306b99fe67b53266f47258343e1b77c3ebb11..c7ecfde971d4c2823d13c790a3eb417a22b12d6f 100755 (executable)
@@ -131,10 +131,14 @@ function searchHighlight(searchText) {
         wList = searchText.split(" ");
         $("#content").highlight(wList); //Highlight the search input
 
-        //Highlight the stems
-        for (var i = 0; i < wList.length; i++) {
-            var stemW = stemmer(wList[i]);
-            sList.push(stemW);
+        if(typeof stemmer != "undefined" ){
+            //Highlight the stems
+            for (var i = 0; i < wList.length; i++) {
+                var stemW = stemmer(wList[i]);
+                sList.push(stemW);
+            }
+        } else {
+            sList = wList;
         }
         $("#content").highlight(sList); //Highlight the search input's all stems
     } 
index ebcb53cb361d26c52b9e0d676cef83bb0f422c68..18206946c632501bc25fb3c287b0b0b35538cc54 100755 (executable)
@@ -82,7 +82,11 @@ function Effectuer_recherche(expressionInput) {
 \r
     for(var j in wordsList){\r
         var word = wordsList[j];\r
-        stemQueryMap[stemmer(word)] = word;\r
+        if(typeof stemmer != "undefined" ){\r
+            stemQueryMap[stemmer(word)] = word;\r
+        } else {\r
+            stemQueryMap[word] = word;\r
+        }\r
     }\r
 \r
      //stemmedWordsList is the stemmed list of words separated by spaces.\r
@@ -94,10 +98,14 @@ function Effectuer_recherche(expressionInput) {
         }\r
     }\r
 \r
-    //Do the stemming using Porter's stemming algorithm\r
-    for (var i = 0; i < cleanwordsList.length; i++) {\r
-        var stemWord = stemmer(cleanwordsList[i]);\r
-        stemmedWordsList.push(stemWord);\r
+    if(typeof stemmer != "undefined" ){\r
+        //Do the stemming using Porter's stemming algorithm\r
+        for (var i = 0; i < cleanwordsList.length; i++) {\r
+            var stemWord = stemmer(cleanwordsList[i]);\r
+            stemmedWordsList.push(stemWord);\r
+        }\r
+    } else {\r
+        stemmedWordsList = cleanwordsList;\r
     }\r
 \r
     //load the scripts with the indices: the following lines do not work on the server. To be corrected\r