From: Kasun Gajasinghe
Date: Sun, 8 Aug 2010 16:43:13 +0000 (+0000)
Subject: Added client-side support for cjk searching. By default, for cjk, 2-gram tokenizing...
X-Git-Tag: release/1.79.1~6^2~831^2~40
X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=41638a573b235380306772e2a522ef491add800d;p=docbook-dsssl
Added client-side support for CJK searching. By default, 2-gram tokenizing is used for CJK text, both in the client at run time and in the indexer at build time; e.g. クに接続 will be tokenized to "クに", "に接", "接続". Better tokenizing options are IKAnalyzer and Paoding Analyzer. These are dictionary-based, so their results are more accurate.
---
diff --git a/xsl/webhelp/indexer/lib/nw-cms.jar b/xsl/webhelp/indexer/lib/nw-cms.jar
index 886f2eb30..95bedc392 100755
Binary files a/xsl/webhelp/indexer/lib/nw-cms.jar and b/xsl/webhelp/indexer/lib/nw-cms.jar differ
diff --git a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java
index 2a347d3f5..cfde68342 100755
--- a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java
+++ b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java
@@ -51,7 +51,7 @@ public class IndexerTask{
// ANT parameters
private String htmldir=null;
- private String indexerLanguage="en";
+ public static String indexerLanguage="en";
//supported languages: add new additions to this. don't include country codes to the end such as en_US or en_UK,
// as stemmers doesn't find a difference between them.
@@ -90,7 +90,7 @@ public class IndexerTask{
int i=0;
for (;i= 0; i--) {
+ for (var i = cpt - 1; i >= 0; i--) {
if (fileAndWordList[i] != undefined) {
-
linkTab.push("" + txt_results_for + " " + "" + fileAndWordList[i][0].motslisteDisplay + " " + "
");
linkTab.push("");
for (t in fileAndWordList[i]) {
//DEBUG: alert(": "+ fileAndWordList[i][t].filenb+" " +fileAndWordList[i][t].motsliste);
//linkTab.push(""+fl[fileAndWordList[i][t].filenb]+" ");
-
-
tempInfo = fil[fileAndWordList[i][t].filenb];
pos1 = tempInfo.indexOf("@@@");
pos2 = tempInfo.lastIndexOf("@@@");
@@ -151,24 +131,21 @@ function Effectuer_recherche(expressionInput) {
tempShortdesc = tempInfo.substring(pos2 + 3, tempInfo.length);
//file:///home/kasun/docbook/WEBHELP/webhelp-draft-output-format-idea/src/main/resources/web/webhelp/installation.html
- var linkString = "" + tempTitle + " ";
- // var linkString = "" + tempTitle + " ";
+ var linkString = "" + tempTitle + " ";
+ // var linkString = "" + tempTitle + " ";
if ((tempShortdesc != "null")) {
linkString += "\n" + tempShortdesc + "
";
}
linkString += " ";
-
linkTab.push(linkString);
-
}
linkTab.push(" ");
}
}
}
- var results="";
- if (linkTab.length > 0) {
-
+ var results = "";
+ if (linkTab.length > 0) {
/*writeln ("" + txt_results_for + " " + "" + cleanwordsList + " " + " "+"
");*/
results = "";
//write("
");
@@ -176,47 +153,133 @@ function Effectuer_recherche(expressionInput) {
results += linkTab[t].toString();
}
results += "
";
- } else{
- results = ""+"Your search returned no results for "+ "" + txt_wordsnotfound + " " +"
";
+ } else {
+ results = "" + "Your search returned no results for " + "" + txt_wordsnotfound + " " + "
";
}
//alert(results);
+ document.getElementById('searchResults').innerHTML = results;
+}
+
+function tokenize(wordsList){
+ var stemmedWordsList = new Array(); // Array with the words to look for after removing spaces
+ var cleanwordsList = new Array(); // Array with the words to look for
+ for(var j in wordsList){
+ var word = wordsList[j];
+ if(typeof stemmer != "undefined" ){
+ stemQueryMap[stemmer(word)] = word;
+ } else {
+ stemQueryMap[word] = word;
+ }
+ }
+ //stemmedWordsList is the stemmed list of words separated by spaces.
+ for (t in wordsList) {
+ wordsList[t] = wordsList[t].replace(/(%22)|^-/g, "")
+ if (wordsList[t] != "%20") {
+ scriptLetterTab.add(wordsList[t].charAt(0));
+ cleanwordsList.push(wordsList[t]);
+ }
+ }
+
+ if(typeof stemmer != "undefined" ){
+ //Do the stemming using Porter's stemming algorithm
+ for (var i = 0; i < cleanwordsList.length; i++) {
+ var stemWord = stemmer(cleanwordsList[i]);
+ stemmedWordsList.push(stemWord);
+ }
+ } else {
+ stemmedWordsList = cleanwordsList;
+ }
+ return stemmedWordsList;
+}
+
+function cjkTokenize(wordsList){
+ var allTokens= new Array();
+ var notCJKTokens= new Array();
+ var j=0;
+ for(j=0;j\n");
- writeln(" ");
- //writeln(" " );
- //writeln(" ") ;
- writeln("") ;
- writeln(""+txt_filesfound+" ");
- writeln("");
- //writeln("" + txt_search_result + " " + "" + wordsList + " " + " ");
-
- // If no results, display a message
- if ( txt_wordsnotfound != "" ) {writeln(""+"Your search returned no results for "+ "" + txt_wordsnotfound + " " +"
")}
-
- // If results: display them
- if (linkTab.length > 0 ) {
-
- /*writeln ("" + txt_results_for + " " + "" + cleanwordsList + " " + " "+"
");* /
- write("");
- //write("
");
- for (t in linkTab) {
- writeln(linkTab[t].toString())
- }
- writeln("");
- }
-
- writeln ("");
- close() ;
-
- } */
+//CJKTokenizer
+function CJKTokenizer(input){
+ this.input = input;
+ this.offset=-1;
+ this.tokens = new Array();
+ this.incrementToken = incrementToken;
+ this.tokenize = tokenize;
+ this.getAllTokens = getAllTokens;
+ this.unique = unique;
+
+ function incrementToken(){
+ if(this.input.length - 2 <= this.offset){
+ // console.log("false "+offset);
+ return false;
+ }
+ else {
+ this.offset+=1;
+ return true;
+ }
+ }
+
+ function tokenize(){
+ //document.getElementById("content").innerHTML += x.substring(offset,offset+2)+" ";
+ return this.input.substring(this.offset,this.offset+2);
+ }
+
+ function getAllTokens(){
+ while(this.incrementToken()){
+ var tmp = this.tokenize();
+ this.tokens.push(tmp);
+ }
+ var sortedTokens = this.unique(this.tokens);
+
+ return sortedTokens;
+// document.getElementById("content").innerHTML += tokens+" ";
+// document.getElementById("content").innerHTML += " dada"+sortedTokens+" ";
+// console.log(tokens.length+"dsdsds");
+ /*for(i=0;idada"+un[i]+"- "+stems[i]+" "+ ss;
+ document.getElementById("content").innerHTML += " "+sortedTokens[i];
+ }*/
+ }
+
+ function unique(a)
+ {
+ var r = new Array();
+ o:for(var i = 0, n = a.length; i < n; i++)
+ {
+ for(var x = 0, y = r.length; x < y; x++)
+ {
+ if(r[x]==a[i]) continue o;
+ }
+ r[r.length] = a[i];
+ }
+ return r;
+ }
}
@@ -316,9 +379,10 @@ function onScriptLoadedFunc(e) {
}
}
+/*
function onLoadComplete() {
alert("loaded !!");
-}
+} */
/* End of scriptloader functions */
@@ -413,7 +477,11 @@ function SortResults(mots) {
var tempDisplay = new Array();
for (var x in tab) {
- tempDisplay.push(stemQueryMap[tab[x]]); //get the original word from the stem word.
+ if(stemQueryMap[tab[x]] != undefined){
+ tempDisplay.push(stemQueryMap[tab[x]]); //get the original word from the stem word.
+ } else {
+ tempDisplay.push(tab[x]); //no stem is available. (probably a CJK language)
+ }
}
var tempDispString = tempDisplay.join(", ");