*/\r
\r
//string initialization\r
-\r
-\r
-htmlfileList = "htmlFileList.js";\r
-htmlfileinfoList = "htmlFileInfoList.js";\r
-\r
+var htmlfileList = "htmlFileList.js";\r
+var htmlfileinfoList = "htmlFileInfoList.js";\r
+var useCJKTokenizing = false;\r
\r
/* This function validates the search query entered by the user */
function Verifie(ditaSearch_Form) {\r
//DisplayWaitingMessage();\r
\r
/*data initialisation*/\r
- searchFor = ""; // expression en lowercase et sans les caracteres speciaux
+ searchFor = ""; // the expression, lowercased and with special characters removed
//w = new Object(); // hashtable, key=word, value = list of the index of the html files\r
scriptLetterTab = new scriptfirstchar(); // Array containing the first letter of each word to look for\r
var scriptsarray = new Array(); // Array with the name of the scripts to load\r
var wordsList = new Array(); // Array with the words to look for\r
- var cleanwordsList = new Array(); // Array with the words to look for\r
- var stemmedWordsList = new Array(); // Array with the words to look for after removing spaces\r
+ var finalWordsList = new Array(); // Array with the words to look for, after tokenizing (and stemming for non-CJK words)
var listNumerosDesFicStr = "";\r
var ou_recherche = true;\r
var linkTab = new Array();\r
wordsList = searchFor.split(" ");\r
wordsList.sort();\r
\r
- for(var j in wordsList){\r
- var word = wordsList[j];\r
- if(typeof stemmer != "undefined" ){\r
- stemQueryMap[stemmer(word)] = word;\r
- } else {\r
- stemQueryMap[word] = word;\r
- }\r
- }\r
-\r
- //stemmedWordsList is the stemmed list of words separated by spaces.\r
- for (t in wordsList) {\r
- wordsList[t] = wordsList[t].replace(/(%22)|^-/g, "")\r
- if (wordsList[t] != "%20") {\r
- scriptLetterTab.add(wordsList[t].charAt(0));\r
- cleanwordsList.push(wordsList[t]);\r
- }\r
- }\r
-\r
- if(typeof stemmer != "undefined" ){\r
- //Do the stemming using Porter's stemming algorithm\r
- for (var i = 0; i < cleanwordsList.length; i++) {\r
- var stemWord = stemmer(cleanwordsList[i]);\r
- stemmedWordsList.push(stemWord);\r
- }\r
+ //set the tokenizing method\r
+ if(typeof indexerLanguage != "undefined" && (indexerLanguage == "cn" || indexerLanguage == "ja" || indexerLanguage == "ko")){
+ useCJKTokenizing=true;\r
} else {\r
- stemmedWordsList = cleanwordsList;\r
+ useCJKTokenizing=false;\r
+ }\r
+ //If the Lucene CJKTokenizer was used as the indexer, useCJKTokenizing will be true; otherwise do normal tokenizing.
+ //2-gram (bigram) tokenizing happens in cjkTokenize().
+ if(useCJKTokenizing){\r
+ finalWordsList = cjkTokenize(wordsList);\r
+ } else { \r
+ finalWordsList = tokenize(wordsList);\r
}\r
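+ // Illustrative sketch of the two branches above (the sample queries are assumptions, not part of the code):
+ //   cjkTokenize(["中文搜索"])          -> ["中文", "文搜", "搜索"]   (overlapping 2-grams)
+ //   tokenize(["searching", "files"])   -> ["search", "file"]        (cleaned, then stemmed when a Porter stemmer is loaded)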
\r
//load the scripts with the indices: the following lines do not work on the server. To be corrected\r
* Compare with the indexed words (in the w[] array), and push words that are in it to tempTab.\r
*/\r
var tempTab = new Array();\r
- for (t in stemmedWordsList) {\r
- if (w[stemmedWordsList[t].toString()] == undefined) {\r
- txt_wordsnotfound += stemmedWordsList[t] + " ";\r
+ for (t in finalWordsList) {\r
+ if (w[finalWordsList[t].toString()] == undefined) {\r
+ txt_wordsnotfound += finalWordsList[t] + " ";\r
} else {\r
- tempTab.push(stemmedWordsList[t]);\r
+ tempTab.push(finalWordsList[t]);\r
}\r
}\r
- stemmedWordsList = tempTab;\r
+ finalWordsList = tempTab;\r
\r
- if (stemmedWordsList.length) {\r
+ if (finalWordsList.length) {\r
\r
- // recherche 'et' et 'ou' en une fois\r
- fileAndWordList = SortResults(stemmedWordsList);\r
+ //do the 'and' and 'or' search in a single pass
+ fileAndWordList = SortResults(finalWordsList);\r
\r
cpt = fileAndWordList.length;\r
- for (i = cpt - 1; i >= 0; i--) {\r
+ for (var i = cpt - 1; i >= 0; i--) {\r
if (fileAndWordList[i] != undefined) {\r
-\r
linkTab.push("<p>" + txt_results_for + " " + "<span class=\"searchExpression\">" + fileAndWordList[i][0].motslisteDisplay + "</span>" + "</p>");\r
\r
linkTab.push("<ul class='searchresult'>");\r
for (t in fileAndWordList[i]) {\r
//DEBUG: alert(": "+ fileAndWordList[i][t].filenb+" " +fileAndWordList[i][t].motsliste);\r
//linkTab.push("<li><a href=\"../"+fl[fileAndWordList[i][t].filenb]+"\">"+fl[fileAndWordList[i][t].filenb]+"</a></li>");\r
-\r
-\r
tempInfo = fil[fileAndWordList[i][t].filenb];\r
pos1 = tempInfo.indexOf("@@@");\r
pos2 = tempInfo.lastIndexOf("@@@");\r
tempShortdesc = tempInfo.substring(pos2 + 3, tempInfo.length);\r
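+ // A fil[] entry is assumed to look like "page.html@@@Page title@@@Short description",
+ // so the text after the last "@@@" marker is the short description extracted here.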
\r
//file:///home/kasun/docbook/WEBHELP/webhelp-draft-output-format-idea/src/main/resources/web/webhelp/installation.html\r
- var linkString = "<li><a href=" + tempPath + ">" + tempTitle + "</a>";\r
- // var linkString = "<li><a href=\"installation.html\">" + tempTitle + "</a>";\r
+ var linkString = "<li><a href=" + tempPath + ">" + tempTitle + "</a>";\r
+ // var linkString = "<li><a href=\"installation.html\">" + tempTitle + "</a>";\r
if ((tempShortdesc != "null")) {\r
linkString += "\n<div class=\"shortdesclink\">" + tempShortdesc + "</div>";\r
}\r
linkString += "</li>";\r
-\r
linkTab.push(linkString);\r
-\r
}\r
linkTab.push("</ul>");\r
}\r
}\r
}\r
\r
- var results="";\r
- if (linkTab.length > 0) {\r
-\r
+ var results = "";\r
+ if (linkTab.length > 0) { \r
/*writeln ("<p>" + txt_results_for + " " + "<span class=\"searchExpression\">" + cleanwordsList + "</span>" + "<br/>"+"</p>");*/\r
results = "<p>";\r
//write("<ul class='searchresult'>");\r
        for (t in linkTab) {
            results += linkTab[t].toString();
}\r
results += "</p>";\r
- } else{\r
- results = "<p>"+"Your search returned no results for "+ "<span class=\"searchExpression\">" + txt_wordsnotfound + "</span>" +"</p>";\r
+ } else {\r
+ results = "<p>" + "Your search returned no results for " + "<span class=\"searchExpression\">" + txt_wordsnotfound + "</span>" + "</p>";\r
}\r
//alert(results);\r
+ document.getElementById('searchResults').innerHTML = results; \r
+}\r
+\r
+function tokenize(wordsList){\r
+ var stemmedWordsList = new Array(); // Array with the stemmed words to look for
+ var cleanwordsList = new Array(); // Array with the cleaned-up words to look for
+ for(var j in wordsList){\r
+ var word = wordsList[j];\r
+ if(typeof stemmer != "undefined" ){\r
+ stemQueryMap[stemmer(word)] = word;\r
+ } else {\r
+ stemQueryMap[word] = word;\r
+ }\r
+ } \r
+ //Clean the words: strip quote markers (%22) and a leading '-', and skip the "%20" placeholder.
+ for (t in wordsList) {
+ wordsList[t] = wordsList[t].replace(/(%22)|^-/g, "");
+ if (wordsList[t] != "%20") {\r
+ scriptLetterTab.add(wordsList[t].charAt(0));\r
+ cleanwordsList.push(wordsList[t]);\r
+ }\r
+ }\r
+\r
+ if(typeof stemmer != "undefined" ){\r
+ //Do the stemming using Porter's stemming algorithm\r
+ for (var i = 0; i < cleanwordsList.length; i++) {\r
+ var stemWord = stemmer(cleanwordsList[i]);\r
+ stemmedWordsList.push(stemWord);\r
+ }\r
+ } else {\r
+ stemmedWordsList = cleanwordsList;\r
+ }\r
+ return stemmedWordsList;\r
+}\r
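+// Illustrative example only (assumes the Porter stemmer is loaded as stemmer()):
+//   tokenize(["running", "files", "%20"]) skips the "%20" placeholder and returns
+//   ["run", "file"]; stemQueryMap["run"] is then "running", so results can later be
+//   displayed with the word the user actually typed.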
+\r
+function cjkTokenize(wordsList){\r
+ var allTokens= new Array();\r
+ var notCJKTokens= new Array();\r
+ var j=0;\r
+ for(j=0;j<wordsList.length;j++){\r
+ var word = wordsList[j];\r
+ if(getAvgAsciiValue(word) < 127){\r
+ notCJKTokens.push(word);\r
+ } else { \r
+ var tokenizer = new CJKTokenizer(word);\r
+ var tokensTmp = tokenizer.getAllTokens();\r
+ allTokens = allTokens.concat(tokensTmp);\r
+ }\r
+ }\r
+ allTokens = allTokens.concat(tokenize(notCJKTokens));\r
+ return allTokens;\r
+}\r
+\r
+//A simple heuristic to determine whether a word is English: average the character codes of its first few characters (ASCII stays below 127).
+function getAvgAsciiValue(word){\r
+ var tmp = 0;\r
+ var num = word.length < 5 ? word.length:5;\r
+ for(var i = 0; i < num; i++){
+ tmp += word.charCodeAt(i);\r
+ }\r
+ return tmp/num;\r
+}\r
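+// For example, getAvgAsciiValue("hello") averages plain ASCII codes (about 106),
+// well below 127, while getAvgAsciiValue("日本語") averages CJK code points in the
+// tens of thousands, so only non-ASCII words take the bigram path in cjkTokenize().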
\r
- document.getElementById('searchResults').innerHTML = results;\r
-\r
- /* Display results * /\r
- with (parent.frames['searchresults'].document) {\r
- writeln("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">\n<html><head>");\r
- writeln("<meta http-equiv=\"content-type\" content=\"text/html; charset=utf-8\">");\r
- //writeln("<link href=\"css/commonltr.css\" type=\"text/css\" rel=\"stylesheet\">" );\r
- //writeln("<link rel=\"stylesheet\" type=\"text/css\" href=\"css/search.css\">") ;\r
- writeln("<style>body{\\r
- font-family: verdana, sans-serif;\\r
- font-size: .7em;\\r
- background: #f3f3f3; }\\r
- .searchExpression{ font-weight: bold;}</style>") ;\r
- writeln("<title>"+txt_filesfound+"</title></head>");\r
- writeln("<body onload = \"self.focus()\">");\r
- //writeln("<h2>" + txt_search_result + " " + "<i>" + wordsList + "</i>" + "</h2>");\r
-\r
- // If no results, display a message\r
- if ( txt_wordsnotfound != "" ) {writeln("<p>"+"Your search returned no results for "+ "<span class=\"searchExpression\">" + txt_wordsnotfound + "</span>" +"</p>")}\r
-\r
- // If results: display them\r
- if (linkTab.length > 0 ) {\r
-\r
- /*writeln ("<p>" + txt_results_for + " " + "<span class=\"searchExpression\">" + cleanwordsList + "</span>" + "<br/>"+"</p>");* /\r
- write("<p>");\r
- //write("<ul class='searchresult'>");\r
- for (t in linkTab) {\r
- writeln(linkTab[t].toString())\r
- }\r
- writeln("</p>");\r
- }\r
-\r
- writeln ("</body></html>");\r
- close() ;\r
-\r
- } */\r
+//CJKTokenizer: emits overlapping 2-character (bigram) tokens from a CJK string.
+function CJKTokenizer(input){\r
+ this.input = input;\r
+ this.offset=-1;\r
+ this.tokens = new Array(); \r
+ this.incrementToken = incrementToken;\r
+ this.tokenize = tokenize;\r
+ this.getAllTokens = getAllTokens;\r
+ this.unique = unique;\r
+\r
+ function incrementToken(){\r
+ if(this.input.length - 2 <= this.offset){\r
+ // console.log("false "+offset);\r
+ return false;\r
+ }\r
+ else {\r
+ this.offset+=1;\r
+ return true;\r
+ }\r
+ }\r
+\r
+ function tokenize(){\r
+ //document.getElementById("content").innerHTML += x.substring(offset,offset+2)+"<br>";\r
+ return this.input.substring(this.offset,this.offset+2);\r
+ }\r
+\r
+ function getAllTokens(){\r
+ while(this.incrementToken()){\r
+ var tmp = this.tokenize();\r
+ this.tokens.push(tmp);\r
+ }\r
+ var sortedTokens = this.unique(this.tokens);\r
+\r
+ return sortedTokens; \r
+ }\r
+\r
+ // Return a copy of the array with duplicate tokens removed (O(n^2), fine for short queries).
+ function unique(a) {
+  var r = new Array();
+  o: for (var i = 0, n = a.length; i < n; i++) {
+   for (var x = 0, y = r.length; x < y; x++) {
+    if (r[x] == a[i]) continue o;
+   }
+   r[r.length] = a[i];
+  }
+  return r;
+ }
}\r
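+// Illustrative usage only: new CJKTokenizer("東京都").getAllTokens() slides a
+// two-character window over the input and de-duplicates the result via unique(),
+// yielding ["東京", "京都"].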
\r
\r
}\r
}\r
\r
+/*\r
function onLoadComplete() {\r
alert("loaded !!");\r
-}\r
+} */\r
\r
/* End of scriptloader functions */\r
\r
\r
var tempDisplay = new Array();\r
for (var x in tab) {\r
- tempDisplay.push(stemQueryMap[tab[x]]); //get the original word from the stem word.\r
+ if(stemQueryMap[tab[x]] != undefined){\r
+ tempDisplay.push(stemQueryMap[tab[x]]); //get the original word from the stem word.\r
+ } else {\r
+ tempDisplay.push(tab[x]); //no stem is available. (probably a CJK language)\r
+ }\r
}\r
var tempDispString = tempDisplay.join(", ");\r
\r