package com.nexwave.nquindexer;\r
\r
import java.io.File;\r
+import java.io.IOException;\r
+import java.io.Reader;\r
import java.util.*;\r
+import java.io.StringReader;\r
\r
// specific to DITA-OT
import com.nexwave.nsidita.DocFileInfo;\r
import com.nexwave.stemmer.snowball.ext.EnglishStemmer;\r
import com.nexwave.stemmer.snowball.ext.GermanStemmer;\r
\r
+// CJK tokenizing
+import org.apache.lucene.analysis.Token;\r
+import org.apache.lucene.analysis.TokenStream;\r
+import org.apache.lucene.analysis.cjk.CJKAnalyzer;\r
+import org.apache.lucene.analysis.cjk.CJKTokenizer;\r
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;\r
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;\r
+\r
\r
/**\r
* Parser for the html files generated by DITA-OT.\r
 * Extracts the title, the shortdesc and the text within the "content" div tag, i.e. &lt;div id="content"&gt;.
- * \r
+ *\r
* @version 1.1 2010\r
- * \r
+ *\r
* @author N. Quaine\r
* @author Kasun Gajasinghe <http://kasunbg.blogspot.com>\r
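+ *
+ * <p>Typical usage, as a sketch only: the class name SaxHTMLIndex and the
+ * method name runExtractData are assumed from the surrounding code and may
+ * differ in your sources.</p>
+ * <pre>
+ *   Map&lt;String,String&gt; dico = new HashMap&lt;String,String&gt;();
+ *   SaxHTMLIndex spider = new SaxHTMLIndex(cleanUpStrings, cleanUpChars);
+ *   spider.init(dico);
+ *   DocFileInfo info = spider.runExtractData(new File("topic1.html"), "en");
+ * </pre>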
*/\r
cleanUpList = cleanUpStrings;\r
cleanUpPunctuation = cleanUpChars;\r
}\r
- \r
+\r
/**
 * Initializer: stores the shared temporary dictionary that accumulates
 * the word-to-files mappings across parsed documents.
 */
public int init(Map<String,String> tempMap){\r
tempDico = tempMap;\r
- return 0; \r
+ return 0;\r
}\r
\r
/**\r
- * Parses the file to extract all the words for indexing and \r
- * some data characterizing the file. \r
+ * Parses the file to extract all the words for indexing and\r
+ * some data characterizing the file.\r
 * @param file contains the full path of the document to parse
 * @param indexerLanguage tells the program which stemmer (or CJK tokenizer) to use
 * @return a DocFileInfo object filled with data describing the file
//initialization\r
fileDesc = new DocFileInfo(file);\r
strbf = new StringBuffer("");\r
- \r
+\r
// Fill strbf by parsing the file\r
parseDocument(file);\r
- \r
+\r
String str = cleanBuffer(strbf);\r
        str = str.replaceAll("\\s+"," "); //collapse the redundant spaces still left in the middle
// System.out.println(file.toString()+" "+ str +"\n");\r
//get items one-by-one, tunnel through the stemmer, and get the stem.\r
//Then, add them to tempSet\r
//Do Stemming for words in items\r
- //TODO currently, stemming support is for english only. Add support for other languages as well.\r
- \r
- SnowballStemmer stemmer;\r
- if(indexerLanguage.equals("en")){\r
- stemmer = new EnglishStemmer();\r
- } else if (indexerLanguage.equals("de")){\r
- stemmer= new GermanStemmer();\r
+    //TODO currently, stemming support is for English and German only. Add support for other languages as well.
+\r
+ String[] tokenizedItems;\r
+    // "jp"/"cn" are tolerated alongside the ISO-639 language codes "ja"/"zh"
+    if(indexerLanguage.equalsIgnoreCase("ja") || indexerLanguage.equalsIgnoreCase("jp")
+            || indexerLanguage.equalsIgnoreCase("zh") || indexerLanguage.equalsIgnoreCase("cn")
+            || indexerLanguage.equalsIgnoreCase("ko")){
+ LinkedList<String> tokens = new LinkedList<String>();\r
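+        // CJKAnalyzer splits CJK text into overlapping character bigrams
+        // (e.g. "東京都" yields "東京" and "京都"), so indexing needs no
+        // dictionary-based word segmentation.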
+ try{\r
+ CJKAnalyzer analyzer = new CJKAnalyzer(org.apache.lucene.util.Version.LUCENE_30);\r
+ Reader reader = new StringReader(str);\r
+ TokenStream stream = analyzer.tokenStream("", reader);\r
+ TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);\r
+ OffsetAttribute offAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);\r
+\r
+ while (stream.incrementToken()) {\r
+ String term = termAtt.term();\r
+ tokens.add(term);\r
+// System.out.println(term + " " + offAtt.startOffset() + " " + offAtt.endOffset());\r
+ }\r
+\r
+ tokenizedItems = tokens.toArray(new String[tokens.size()]);\r
+\r
+ }catch (IOException ex){\r
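+            // fall back to the raw whitespace-split tokens so this file
+            // can still be indexed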
+ tokenizedItems = items;\r
+            System.err.println("Error tokenizing content using the CJK Analyzer:");
+ ex.printStackTrace();\r
+ }\r
+\r
} else {\r
- stemmer = null;//Languages which stemming is not yet supproted.So, No stemmers will be used.\r
+        SnowballStemmer stemmer;
+        if(indexerLanguage.equalsIgnoreCase("en")){
+            stemmer = new EnglishStemmer();
+        } else if (indexerLanguage.equalsIgnoreCase("de")){
+            stemmer = new GermanStemmer();
+        } else {
+            stemmer = null; //language for which stemming is not yet supported: no stemmer is used
+        }
+        if(stemmer != null) //a stemmer is available for this language
+            tokenizedItems = stemmer.doStem(items);
+        else //no stemmer available: fall back to the raw words
+            tokenizedItems = items;
+\r
}\r
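+    // Whichever path ran, tokenizedItems now holds this file's index terms:
+    // CJK bigrams, stemmed words, or the raw words as a fallback.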
\r
- String[] stemmedItems;\r
- if(stemmer != null) //If a stemmer available\r
- stemmedItems = stemmer.doStem(items);\r
- else //if no stemmer available for the particular language\r
- stemmedItems = items;\r
- \r
- /* for(String stemmedItem: stemmedItems){\r
+ /* for(String stemmedItem: tokenizedItems){\r
System.out.print(stemmedItem+"| ");\r
}*/\r
- \r
+\r
//items: remove the duplicated strings first\r
HashSet <String> tempSet = new HashSet<String>();\r
- tempSet.addAll(Arrays.asList(stemmedItems));\r
+ tempSet.addAll(Arrays.asList(tokenizedItems));\r
Iterator it = tempSet.iterator();\r
String s;\r
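+    // tempDico is the shared inverted index: each token maps to a
+    // comma-separated list of file numbers, e.g. "search" -> "0,3,7"
+    // (example values for illustration only).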
while (it.hasNext()) {\r
- s = (String)it.next(); \r
+ s = (String)it.next();\r
        if (tempDico.containsKey(s)) {
            String temp = tempDico.get(s);
            temp = temp.concat(",").concat(Integer.toString(i));
            tempDico.put(s, temp);
        } else {
            tempDico.put(s, Integer.toString(i));
        }
}\r
- \r
+\r
i++;\r
return fileDesc;\r
}\r
\r
/**\r
- * Cleans the string buffer containing all the text retrieved from \r
- * the html file: remove punctuation, clean white spaces, remove the words \r
+ * Cleans the string buffer containing all the text retrieved from\r
+ * the html file: remove punctuation, clean white spaces, remove the words\r
* which you do not want to index.\r
* NOTE: You may customize this function:\r
- * This version takes into account english and japanese. Depending on your \r
- * needs, \r
- * you may have to add/remove some characters/words through props files \r
+ * This version takes into account English and Japanese. Depending on your
+ * needs,\r
+ * you may have to add/remove some characters/words through props files\r
 * or by modifying the default code,
- * you may want to separate the language processing (doc only in japanese, \r
- * doc only in english, check the language metadata ...). \r
+ * you may want to separate the language processing (doc only in Japanese,
+ * doc only in English, check the language metadata ...).
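+ *
+ * <p>For example, with the default clean-up the buffer "The files are
+ * indexed with a stemmer" becomes " files  indexed  stemmer ": the stop
+ * words turn into spaces, and the redundant whitespace is collapsed by the
+ * caller afterwards (illustration only).</p>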
*/\r
private String cleanBuffer (StringBuffer strbf) {\r
String str = strbf.toString().toLowerCase();\r
        StringBuffer tempStrBuf = new StringBuffer("");
        StringBuffer tempCharBuf = new StringBuffer("");
if ((cleanUpList == null) || (cleanUpList.isEmpty())){\r
// Default clean-up\r
- \r
+\r
// Should perhaps eliminate the words at the end of the table?\r
tempStrBuf.append("(?i)\\bthe\\b|\\ba\\b|\\ban\\b|\\bto\\b|\\band\\b|\\bor\\b");//(?i) ignores the case\r
tempStrBuf.append("|\\bis\\b|\\bare\\b|\\bin\\b|\\bwith\\b|\\bbe\\b|\\bcan\\b");\r
tempStrBuf.append("|\\bI\\b|\\bme\\b|\\bmy\\b");\r
\r
            str = str.replaceFirst("Copyright © 1998-2007 NexWave Solutions.", " ");
- \r
+\r
\r
//nqu 25.01.2008 str = str.replaceAll("\\b.\\b|\\\\", " ");\r
// remove contiguous white charaters\r
- //nqu 25.01.2008 str = str.replaceAll("\\s+", " "); \r
+ //nqu 25.01.2008 str = str.replaceAll("\\s+", " ");\r
        } else {
            // Clean-up using the props files: build the stop-word alternation
            tempStrBuf.append("\\ba\\b");
            Iterator itCleanUp = cleanUpList.iterator();
            while (itCleanUp.hasNext()) {
                tempStrBuf.append("|\\b").append(itCleanUp.next()).append("\\b");
            }
        }

        str = minimalClean(str, tempStrBuf, tempCharBuf);
        return str;
    }
- \r
+\r
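+    /**
+     * Always-on clean-up: collapses whitespace, strips "->" and the
+     * European/Japanese punctuation classes from IndexerConstants, removes
+     * the stop words accumulated in tempStrBuf, then applies any extra
+     * punctuation pattern collected in tempCharBuf.
+     */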
private String minimalClean(String str, StringBuffer tempStrBuf, StringBuffer tempCharBuf) {\r
String tempPunctuation = new String(tempCharBuf);\r
- \r
+\r
str = str.replaceAll("\\s+", " ");\r
str = str.replaceAll("->", " ");\r
str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " ");\r
\r
//remove useless words\r
str = str.replaceAll(tempStrBuf.toString(), " ");\r
- \r
+\r
// Redo punctuation after removing some words: (TODO: useful?)\r
str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " ");\r
str = str.replaceAll(IndexerConstants.EUPUNCTUATION2, " ");\r
str = str.replaceAll(IndexerConstants.JPPUNCTUATION1, " ");\r
str = str.replaceAll(IndexerConstants.JPPUNCTUATION2, " ");\r
- str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " "); \r
+ str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " ");\r
if (tempPunctuation.length() > 0)\r
{\r
str = str.replaceAll(tempPunctuation, " ");\r