Added client-side support for cjk searching. By default, for cjk, 2-gram tokenizing...

author Kasun Gajasinghe <kasunbg@gmail.com>

Sun, 8 Aug 2010 16:43:13 +0000 (16:43 +0000)

committer Kasun Gajasinghe <kasunbg@gmail.com>

Sun, 8 Aug 2010 16:43:13 +0000 (16:43 +0000)
author Kasun Gajasinghe <kasunbg@gmail.com>
Sun, 8 Aug 2010 16:43:13 +0000 (16:43 +0000)
committer Kasun Gajasinghe <kasunbg@gmail.com>
Sun, 8 Aug 2010 16:43:13 +0000 (16:43 +0000)
diff --git a/xsl/webhelp/indexer/lib/nw-cms.jar b/xsl/webhelp/indexer/lib/nw-cms.jar

index 886f2eb30dad8d58b9b94770b062d19b7fa2404b..95bedc392491336af3144534d43976e8a7132723 100755 (executable)

Binary files a/xsl/webhelp/indexer/lib/nw-cms.jar and b/xsl/webhelp/indexer/lib/nw-cms.jar differ
diff --git a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java

index 2a347d3f5fb619242952c96cd6cf211764825e74..cfde683424f90575951a332a43e135a1be49404f 100755 (executable)
--- a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java
+++ b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java
@@ -51,7 +51,7 @@ public class IndexerTask{
  \r
         // ANT parameters\r
         private String htmldir=null;\r
-    private String indexerLanguage="en";\r
+    public static String indexerLanguage="en";\r
  \r
      //supported languages: add new additions to this. don't include country codes to the end such as en_US or en_UK,\r
      // as stemmers doesn't find a difference between them.\r
@@ -90,7 +90,7 @@ public class IndexerTask{
              int i=0;\r
              for (;i<supportedLanguages.length;i++) {\r
                  if(indexerLanguage.equals(supportedLanguages[i])){\r
-                    this.indexerLanguage = supportedLanguages[i];\r
+                    IndexerTask.indexerLanguage = supportedLanguages[i];\r
                      break;\r
                  }\r
              }\r
@@ -100,10 +100,10 @@ public class IndexerTask{
                  System.out.println("The given language, \""+indexerLanguage+"\", does not have extensive support for " +\r
                          "searching or language code is specified in a bad format. Check documentation for details. " +\r
                          "Language now defaults to english.");\r
-                this.indexerLanguage = "en";\r
+                IndexerTask.indexerLanguage = "en";\r
              } \r
          } else {\r
-            this.indexerLanguage = "en";\r
+            IndexerTask.indexerLanguage = "en";\r
          } \r
      }\r
  \r
diff --git a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/WriteJSFiles.java b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/WriteJSFiles.java

index 5c6c45166e4d611491f3594b4cb209173e56d0b1..985d2080dc64bc2f728773c129c0171a6a887ae1 100755 (executable)
--- a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/WriteJSFiles.java
+++ b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/WriteJSFiles.java
@@ -182,6 +182,8 @@ public class WriteJSFiles {
                        The value is the numbers of the files in which the word exists.\r
                        Example: w["key"]="file1,file2,file3";*/\r
                      int count = 0;\r
+                    if(i==1)\r
+                        out.write("var indexerLanguage=\""+IndexerTask.indexerLanguage+"\";\n");\r
                      out.write("//Auto generated index for searching.\n");\r
                      while (keyIt.hasNext()) {        //&& (tempLetter == tstr.charAt(0)) \r
                          out.write("w[\"" + tstr + "\"]" + "=\"" + indexMap.get(tstr) + "\";\n");\r
diff --git a/xsl/webhelp/template/content/search/nwSearchFnt.js b/xsl/webhelp/template/content/search/nwSearchFnt.js

index 18206946c632501bc25fb3c287b0b0b35538cc54..fe01240ef9247f3cce593d0b8f9c319cc4589836 100755 (executable)
--- a/xsl/webhelp/template/content/search/nwSearchFnt.js
+++ b/xsl/webhelp/template/content/search/nwSearchFnt.js
@@ -8,11 +8,9 @@
   */\r
  \r
  //string initialization\r
-\r
-\r
-htmlfileList = "htmlFileList.js";\r
-htmlfileinfoList = "htmlFileInfoList.js";\r
-\r
+var htmlfileList = "htmlFileList.js";\r
+var htmlfileinfoList = "htmlFileInfoList.js";\r
+var useCJKTokenizing = false;\r
  \r
  /* Cette fonction verifie la validite de la recherche entrre par l utilisateur */\r
  function Verifie(ditaSearch_Form) {\r
@@ -56,13 +54,12 @@ function Effectuer_recherche(expressionInput) {
      //DisplayWaitingMessage();\r
  \r
      /*data initialisation*/\r
-    searchFor = "";       // expression en lowercase et sans les caracteres speciaux\r
+    searchFor = "";       // expression en lowercase et sans les caractères speciaux\r
      //w = new Object();  // hashtable, key=word, value = list of the index of the html files\r
      scriptLetterTab = new scriptfirstchar(); // Array containing the first letter of each word to look for\r
      var scriptsarray = new Array(); // Array with the name of the scripts to load\r
      var wordsList = new Array(); // Array with the words to look for\r
-    var cleanwordsList = new Array(); // Array with the words to look for\r
-    var stemmedWordsList = new Array(); // Array with the words to look for after removing spaces\r
+    var finalWordsList = new Array(); // Array with the words to look for after removing spaces\r
      var listNumerosDesFicStr = "";\r
      var ou_recherche = true;\r
      var linkTab = new Array();\r
@@ -80,32 +77,18 @@ function Effectuer_recherche(expressionInput) {
      wordsList = searchFor.split(" ");\r
      wordsList.sort();\r
  \r
-    for(var j in wordsList){\r
-        var word = wordsList[j];\r
-        if(typeof stemmer != "undefined" ){\r
-            stemQueryMap[stemmer(word)] = word;\r
-        } else {\r
-            stemQueryMap[word] = word;\r
-        }\r
-    }\r
-\r
-     //stemmedWordsList is the stemmed list of words separated by spaces.\r
-    for (t in wordsList) {\r
-        wordsList[t] = wordsList[t].replace(/(%22)|^-/g, "")\r
-        if (wordsList[t] != "%20") {\r
-            scriptLetterTab.add(wordsList[t].charAt(0));\r
-            cleanwordsList.push(wordsList[t]);\r
-        }\r
-    }\r
-\r
-    if(typeof stemmer != "undefined" ){\r
-        //Do the stemming using Porter's stemming algorithm\r
-        for (var i = 0; i < cleanwordsList.length; i++) {\r
-            var stemWord = stemmer(cleanwordsList[i]);\r
-            stemmedWordsList.push(stemWord);\r
-        }\r
+    //set the tokenizing method\r
+    if(typeof indexerLanguage != "undefined" && (indexerLanguage=="cn" || indexerLanguage=="ja" ||indexerLanguage=="ko")){\r
+        useCJKTokenizing=true;\r
      } else {\r
-        stemmedWordsList = cleanwordsList;\r
+        useCJKTokenizing=false;\r
+    }\r
+    //If Lucene CJKTokenizer was used as the indexer, then useCJKTokenizing will be true. Else, do normal tokenizing.\r
+    // 2-gram tokenizing happens in cjkTokenize() (via CJKTokenizer).\r
+    if(useCJKTokenizing){\r
+        finalWordsList = cjkTokenize(wordsList);\r
+    } else { \r
+        finalWordsList = tokenize(wordsList);\r
      }\r
  \r
      //load the scripts with the indices: the following lines do not work on the server. To be corrected\r
@@ -117,32 +100,29 @@ function Effectuer_recherche(expressionInput) {
       * Compare with the indexed words (in the w[] array), and push words that are in it to tempTab.\r
       */\r
      var tempTab = new Array();\r
-    for (t in stemmedWordsList) {\r
-        if (w[stemmedWordsList[t].toString()] == undefined) {\r
-            txt_wordsnotfound += stemmedWordsList[t] + " ";\r
+    for (t in finalWordsList) {\r
+        if (w[finalWordsList[t].toString()] == undefined) {\r
+            txt_wordsnotfound += finalWordsList[t] + " ";\r
          } else {\r
-            tempTab.push(stemmedWordsList[t]);\r
+            tempTab.push(finalWordsList[t]);\r
          }\r
      }\r
-    stemmedWordsList = tempTab;\r
+    finalWordsList = tempTab;\r
  \r
-    if (stemmedWordsList.length) {\r
+    if (finalWordsList.length) {\r
  \r
-        // recherche 'et' et 'ou' en une fois\r
-        fileAndWordList = SortResults(stemmedWordsList);\r
+        //run the 'and' and 'or' searches in a single pass\r
+        fileAndWordList = SortResults(finalWordsList);\r
  \r
          cpt = fileAndWordList.length;\r
-        for (i = cpt - 1; i >= 0; i--) {\r
+        for (var i = cpt - 1; i >= 0; i--) {\r
              if (fileAndWordList[i] != undefined) {\r
-\r
                  linkTab.push("<p>" + txt_results_for + " " + "<span class=\"searchExpression\">" + fileAndWordList[i][0].motslisteDisplay + "</span>" + "</p>");\r
  \r
                  linkTab.push("<ul class='searchresult'>");\r
                  for (t in fileAndWordList[i]) {\r
                      //DEBUG: alert(": "+ fileAndWordList[i][t].filenb+" " +fileAndWordList[i][t].motsliste);\r
                      //linkTab.push("<li><a href=\"../"+fl[fileAndWordList[i][t].filenb]+"\">"+fl[fileAndWordList[i][t].filenb]+"</a></li>");\r
-\r
-\r
                      tempInfo = fil[fileAndWordList[i][t].filenb];\r
                      pos1 = tempInfo.indexOf("@@@");\r
                      pos2 = tempInfo.lastIndexOf("@@@");\r
@@ -151,24 +131,21 @@ function Effectuer_recherche(expressionInput) {
                      tempShortdesc = tempInfo.substring(pos2 + 3, tempInfo.length);\r
  \r
                      //file:///home/kasun/docbook/WEBHELP/webhelp-draft-output-format-idea/src/main/resources/web/webhelp/installation.html\r
-                   var linkString = "<li><a href=" + tempPath + ">" + tempTitle + "</a>";\r
-                   // var linkString = "<li><a href=\"installation.html\">" + tempTitle + "</a>";\r
+                    var linkString = "<li><a href=" + tempPath + ">" + tempTitle + "</a>";\r
+                    // var linkString = "<li><a href=\"installation.html\">" + tempTitle + "</a>";\r
                      if ((tempShortdesc != "null")) {\r
                          linkString += "\n<div class=\"shortdesclink\">" + tempShortdesc + "</div>";\r
                      }\r
                      linkString += "</li>";\r
-\r
                      linkTab.push(linkString);\r
-\r
                  }\r
                  linkTab.push("</ul>");\r
              }\r
          }\r
      }\r
  \r
-    var results="";\r
-    if (linkTab.length > 0) {\r
-\r
+    var results = "";\r
+    if (linkTab.length > 0) { \r
          /*writeln ("<p>" + txt_results_for + " " + "<span class=\"searchExpression\">"  + cleanwordsList + "</span>" + "<br/>"+"</p>");*/\r
          results = "<p>";\r
          //write("<ul class='searchresult'>");\r
@@ -176,47 +153,133 @@ function Effectuer_recherche(expressionInput) {
              results += linkTab[t].toString();\r
          }\r
          results += "</p>";\r
-    } else{\r
-         results = "<p>"+"Your search returned no results for "+ "<span class=\"searchExpression\">" + txt_wordsnotfound + "</span>" +"</p>";\r
+    } else {\r
+        results = "<p>" + "Your search returned no results for " + "<span class=\"searchExpression\">" + txt_wordsnotfound + "</span>" + "</p>";\r
      }\r
      //alert(results);\r
+    document.getElementById('searchResults').innerHTML = results; \r
+}\r
+\r
+function tokenize(wordsList){\r
+    var stemmedWordsList = new Array(); // Array with the words to look for after removing spaces\r
+    var cleanwordsList = new Array(); // Array with the words to look for\r
+    for(var j in wordsList){\r
+        var word = wordsList[j];\r
+        if(typeof stemmer != "undefined" ){\r
+            stemQueryMap[stemmer(word)] = word;\r
+        } else {\r
+            stemQueryMap[word] = word;\r
+        }\r
+    } \r
+     //stemmedWordsList is the stemmed list of words separated by spaces.\r
+    for (t in wordsList) {\r
+        wordsList[t] = wordsList[t].replace(/(%22)|^-/g, "")\r
+        if (wordsList[t] != "%20") {\r
+            scriptLetterTab.add(wordsList[t].charAt(0));\r
+            cleanwordsList.push(wordsList[t]);\r
+        }\r
+    }\r
+\r
+    if(typeof stemmer != "undefined" ){\r
+        //Do the stemming using Porter's stemming algorithm\r
+        for (var i = 0; i < cleanwordsList.length; i++) {\r
+            var stemWord = stemmer(cleanwordsList[i]);\r
+            stemmedWordsList.push(stemWord);\r
+        }\r
+    } else {\r
+        stemmedWordsList = cleanwordsList;\r
+    }\r
+    return stemmedWordsList;\r
+}\r
+\r
+function cjkTokenize(wordsList){\r
+    var allTokens= new Array();\r
+    var notCJKTokens= new Array();\r
+    var j=0;\r
+    for(j=0;j<wordsList.length;j++){\r
+        var word = wordsList[j];\r
+        if(getAvgAsciiValue(word) < 127){\r
+            notCJKTokens.push(word);\r
+        } else { \r
+            var tokenizer = new CJKTokenizer(word);\r
+            var tokensTmp = tokenizer.getAllTokens();\r
+            allTokens = allTokens.concat(tokensTmp);\r
+        }\r
+    }\r
+    allTokens = allTokens.concat(tokenize(notCJKTokens));\r
+    return allTokens;\r
+}\r
+\r
+//Heuristic for non-CJK text: returns the mean char code of the first (up to) 5 characters; cjkTokenize treats a value below 127 as not CJK.\r
+function getAvgAsciiValue(word){\r
+    var tmp = 0;\r
+    var num = word.length < 5 ? word.length:5;\r
+    for(i=0;i<num;i++){\r
+        if(i==5) break;\r
+        tmp += word.charCodeAt(i);\r
+    }\r
+    return tmp/num;\r
+}\r
  \r
-    document.getElementById('searchResults').innerHTML = results;\r
-\r
-    /* Display results * /\r
-     with (parent.frames['searchresults'].document) {\r
-     writeln("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">\n<html><head>");\r
-     writeln("<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">");\r
-     //writeln("<link href=\"css/commonltr.css\" type=\"text/css\" rel=\"stylesheet\">" );\r
-     //writeln("<link rel=\"stylesheet\" type=\"text/css\" href=\"css/search.css\">") ;\r
-     writeln("<style>body{\\r
-     font-family: verdana, sans-serif;\\r
-     font-size: .7em;\\r
-     background: #f3f3f3; }\\r
-     .searchExpression{ font-weight: bold;}</style>") ;\r
-     writeln("<title>"+txt_filesfound+"</title></head>");\r
-     writeln("<body onload = \"self.focus()\">");\r
-     //writeln("<h2>" + txt_search_result + " " + "<i>" + wordsList + "</i>" + "</h2>");\r
-\r
-     // If no results, display a message\r
-     if ( txt_wordsnotfound != "" ) {writeln("<p>"+"Your search returned no results for "+ "<span class=\"searchExpression\">" + txt_wordsnotfound + "</span>" +"</p>")}\r
-\r
-     // If results: display them\r
-     if (linkTab.length > 0  ) {\r
-\r
-     /*writeln ("<p>" + txt_results_for + " " + "<span class=\"searchExpression\">"  + cleanwordsList + "</span>" + "<br/>"+"</p>");* /\r
-     write("<p>");\r
-     //write("<ul class='searchresult'>");\r
-     for (t in linkTab) {\r
-     writeln(linkTab[t].toString())\r
-     }\r
-     writeln("</p>");\r
-     }\r
-\r
-     writeln ("</body></html>");\r
-     close() ;\r
-\r
-     }   */\r
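+//CJKTokenizer: splits the input string into overlapping 2-character tokens and returns them with duplicates removed.\r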
+function CJKTokenizer(input){\r
+    this.input = input;\r
+    this.offset=-1;\r
+    this.tokens = new Array(); \r
+    this.incrementToken = incrementToken;\r
+    this.tokenize = tokenize;\r
+    this.getAllTokens = getAllTokens;\r
+    this.unique = unique;\r
+\r
+    function incrementToken(){\r
+               if(this.input.length - 2 <= this.offset){\r
+               //      console.log("false "+offset);\r
+                       return false;\r
+               }\r
+               else {\r
+                       this.offset+=1;\r
+                       return true;\r
+               }\r
+       }\r
+\r
+       function tokenize(){\r
+               //document.getElementById("content").innerHTML += x.substring(offset,offset+2)+"<br>";\r
+               return this.input.substring(this.offset,this.offset+2);\r
+       }\r
+\r
+       function getAllTokens(){\r
+               while(this.incrementToken()){\r
+                       var tmp = this.tokenize();\r
+                       this.tokens.push(tmp);\r
+               }\r
+               var sortedTokens = this.unique(this.tokens);\r
+\r
+        return sortedTokens;    \r
+//             document.getElementById("content").innerHTML += tokens+" ";\r
+//             document.getElementById("content").innerHTML += "<br>dada"+sortedTokens+" ";\r
+//             console.log(tokens.length+"dsdsds");\r
+               /*for(i=0;i<tokens.length;i++){\r
+                       console.log(tokens[i]);\r
+                       var ss = tokens[i] == sortedTokens[i];\r
+\r
+//                     document.getElementById("content").innerHTML += "<br>dada"+un[i]+"- "+stems[i]+"&nbsp;&nbsp;&nbsp;"+ ss;\r
+                       document.getElementById("content").innerHTML += "<br>"+sortedTokens[i];\r
+               }*/\r
+       }\r
+\r
+       function unique(a)\r
+       {\r
+          var r = new Array();\r
+          o:for(var i = 0, n = a.length; i < n; i++)\r
+          {\r
+             for(var x = 0, y = r.length; x < y; x++)\r
+             {\r
+                if(r[x]==a[i]) continue o;\r
+             }\r
+             r[r.length] = a[i];\r
+          }\r
+          return r;\r
+       } \r
  }\r
  \r
  \r
@@ -316,9 +379,10 @@ function onScriptLoadedFunc(e) {
      }\r
  }\r
  \r
+/*\r
  function onLoadComplete() {\r
      alert("loaded !!");\r
-}\r
+} */\r
  \r
  /* End of scriptloader functions */\r
  \r
@@ -413,7 +477,11 @@ function SortResults(mots) {
  \r
          var tempDisplay = new Array();\r
          for (var x in tab) {\r
-            tempDisplay.push(stemQueryMap[tab[x]]); //get the original word from the stem word.\r
+            if(stemQueryMap[tab[x]] != undefined){\r
+                tempDisplay.push(stemQueryMap[tab[x]]); //get the original word from the stem word.\r
+            } else {\r
+                tempDisplay.push(tab[x]); //no stem is available. (probably a CJK language)\r
+            }\r
          }\r
          var tempDispString = tempDisplay.join(", ");\r
  \r
author	Kasun Gajasinghe <kasunbg@gmail.com>
	Sun, 8 Aug 2010 16:43:13 +0000 (16:43 +0000)
committer	Kasun Gajasinghe <kasunbg@gmail.com>
	Sun, 8 Aug 2010 16:43:13 +0000 (16:43 +0000)
xsl/webhelp/indexer/lib/nw-cms.jar		patch \| blob \| history
xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java		patch \| blob \| history
xsl/webhelp/indexer/src/com/nexwave/nquindexer/WriteJSFiles.java		patch \| blob \| history
xsl/webhelp/template/content/search/nwSearchFnt.js		patch \| blob \| history