package com.nexwave.nquindexer;\r
\r
import java.io.File;\r
+import java.io.IOException;\r
+import java.io.Reader;\r
import java.util.*;\r
+import java.io.StringReader;\r
\r
// specific to DITA-OT
import com.nexwave.nsidita.DocFileInfo;\r
import com.nexwave.stemmer.snowball.ext.EnglishStemmer;\r
import com.nexwave.stemmer.snowball.ext.GermanStemmer;\r
\r
+// CJK tokenizing
+import org.apache.lucene.analysis.Token;\r
+import org.apache.lucene.analysis.TokenStream;\r
+import org.apache.lucene.analysis.cjk.CJKAnalyzer;\r
+import org.apache.lucene.analysis.cjk.CJKTokenizer;\r
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;\r
+import org.apache.lucene.analysis.tokenattributes.TermAttribute;\r
+\r
\r
/**\r
* Parser for the html files generated by DITA-OT.\r
 * Extracts the title, the shortdesc and the text within the "content" div tag, i.e. &lt;div id="content"&gt;.
- * \r
+ *\r
* @version 1.1 2010\r
- * \r
+ *\r
* @author N. Quaine\r
* @author Kasun Gajasinghe <http://kasunbg.blogspot.com>\r
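+ *
+ * <p>Typical usage, as a sketch only: the class name SaxHTMLIndex and the
+ * method name runExtractData are assumed from the surrounding code and may
+ * differ in your sources.</p>
+ * <pre>
+ *   Map&lt;String,String&gt; dico = new HashMap&lt;String,String&gt;();
+ *   SaxHTMLIndex spider = new SaxHTMLIndex(cleanUpStrings, cleanUpChars);
+ *   spider.init(dico);
+ *   DocFileInfo info = spider.runExtractData(new File("topic1.html"), "en");
+ * </pre>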
*/\r
cleanUpList = cleanUpStrings;\r
cleanUpPunctuation = cleanUpChars;\r
}\r
- \r
+\r
/**
 * Initializer: stores the shared temporary dictionary that accumulates
 * the word-to-files mappings across parsed documents.
 */
public int init(Map<String,String> tempMap){\r
tempDico = tempMap;\r
- return 0; \r
+ return 0;\r
}\r
\r
/**\r
- * Parses the file to extract all the words for indexing and \r
- * some data characterizing the file. \r
+ * Parses the file to extract all the words for indexing and\r
+ * some data characterizing the file.\r
 * @param file contains the full path of the document to parse
 * @param indexerLanguage tells the program which stemmer (or CJK tokenizer) to use
 * @return a DocFileInfo object filled with data describing the file
//initialization\r
fileDesc = new DocFileInfo(file);\r
strbf = new StringBuffer("");\r
- \r
+\r
// Fill strbf by parsing the file\r
parseDocument(file);\r
- \r
+\r
String str = cleanBuffer(strbf);\r
        str = str.replaceAll("\\s+"," "); //collapse the redundant spaces still left in the middle
// System.out.println(file.toString()+" "+ str +"\n");\r
//get items one-by-one, tunnel through the stemmer, and get the stem.\r
//Then, add them to tempSet\r
//Do Stemming for words in items\r
- //TODO currently, stemming support is for english only. Add support for other languages as well.\r
- \r
- SnowballStemmer stemmer;\r
- if(indexerLanguage.equals("en")){\r
- stemmer = new EnglishStemmer();\r
- } else if (indexerLanguage.equals("de")){\r
- stemmer= new GermanStemmer();\r
+    //TODO currently, stemming support is for English and German only. Add support for other languages as well.
+\r
+ String[] tokenizedItems;\r
+    // "jp"/"cn" are tolerated alongside the ISO-639 language codes "ja"/"zh"
+    if(indexerLanguage.equalsIgnoreCase("ja") || indexerLanguage.equalsIgnoreCase("jp")
+            || indexerLanguage.equalsIgnoreCase("zh") || indexerLanguage.equalsIgnoreCase("cn")
+            || indexerLanguage.equalsIgnoreCase("ko")){
+ LinkedList<String> tokens = new LinkedList<String>();\r
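+        // CJKAnalyzer splits CJK text into overlapping character bigrams
+        // (e.g. "東京都" yields "東京" and "京都"), so indexing needs no
+        // dictionary-based word segmentation.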
+ try{\r
+ CJKAnalyzer analyzer = new CJKAnalyzer(org.apache.lucene.util.Version.LUCENE_30);\r
+ Reader reader = new StringReader(str);\r
+ TokenStream stream = analyzer.tokenStream("", reader);\r
+ TermAttribute termAtt = (TermAttribute) stream.addAttribute(TermAttribute.class);\r
+ OffsetAttribute offAtt = (OffsetAttribute) stream.addAttribute(OffsetAttribute.class);\r
+\r
+ while (stream.incrementToken()) {\r
+ String term = termAtt.term();\r
+ tokens.add(term);\r
+// System.out.println(term + " " + offAtt.startOffset() + " " + offAtt.endOffset());\r
+ }\r
+\r
+ tokenizedItems = tokens.toArray(new String[tokens.size()]);\r
+\r
+ }catch (IOException ex){\r
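+            // fall back to the raw whitespace-split tokens so this file
+            // can still be indexed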
+ tokenizedItems = items;\r
+            System.err.println("Error tokenizing content using the CJK Analyzer:");
+ ex.printStackTrace();\r
+ }\r
+\r
} else {\r
- stemmer = null;//Languages which stemming is not yet supproted.So, No stemmers will be used.\r
+        SnowballStemmer stemmer;
+        if(indexerLanguage.equalsIgnoreCase("en")){
+            stemmer = new EnglishStemmer();
+        } else if (indexerLanguage.equalsIgnoreCase("de")){
+            stemmer = new GermanStemmer();
+        } else {
+            stemmer = null; //language for which stemming is not yet supported: no stemmer is used
+        }
+        if(stemmer != null) //a stemmer is available for this language
+            tokenizedItems = stemmer.doStem(items);
+        else //no stemmer available: fall back to the raw words
+            tokenizedItems = items;
+\r
}\r
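+    // Whichever path ran, tokenizedItems now holds this file's index terms:
+    // CJK bigrams, stemmed words, or the raw words as a fallback.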
\r
- String[] stemmedItems;\r
- if(stemmer != null) //If a stemmer available\r
- stemmedItems = stemmer.doStem(items);\r
- else //if no stemmer available for the particular language\r
- stemmedItems = items;\r
- \r
- /* for(String stemmedItem: stemmedItems){\r
+ /* for(String stemmedItem: tokenizedItems){\r
System.out.print(stemmedItem+"| ");\r
}*/\r
- \r
+\r
//items: remove the duplicated strings first\r
HashSet <String> tempSet = new HashSet<String>();\r
- tempSet.addAll(Arrays.asList(stemmedItems));\r
+ tempSet.addAll(Arrays.asList(tokenizedItems));\r
Iterator it = tempSet.iterator();\r
String s;\r
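+    // tempDico is the shared inverted index: each token maps to a
+    // comma-separated list of file numbers, e.g. "search" -> "0,3,7"
+    // (example values for illustration only).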
while (it.hasNext()) {\r
- s = (String)it.next(); \r
+ s = (String)it.next();\r
        if (tempDico.containsKey(s)) {
            String temp = tempDico.get(s);
            temp = temp.concat(",").concat(Integer.toString(i));
            tempDico.put(s, temp);
        } else {
            tempDico.put(s, Integer.toString(i));
        }
}\r
- \r
+\r
i++;\r
return fileDesc;\r
}\r
\r
/**\r
- * Cleans the string buffer containing all the text retrieved from \r
- * the html file: remove punctuation, clean white spaces, remove the words \r
+ * Cleans the string buffer containing all the text retrieved from\r
+ * the html file: remove punctuation, clean white spaces, remove the words\r
* which you do not want to index.\r
* NOTE: You may customize this function:\r
- * This version takes into account english and japanese. Depending on your \r
- * needs, \r
- * you may have to add/remove some characters/words through props files \r
+ * This version takes into account English and Japanese. Depending on your
+ * needs,\r
+ * you may have to add/remove some characters/words through props files\r
 * or by modifying the default code,
- * you may want to separate the language processing (doc only in japanese, \r
- * doc only in english, check the language metadata ...). \r
+ * you may want to separate the language processing (doc only in Japanese,
+ * doc only in English, check the language metadata ...).
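+ *
+ * <p>For example, with the default clean-up the buffer "The files are
+ * indexed with a stemmer" becomes " files  indexed  stemmer ": the stop
+ * words turn into spaces, and the redundant whitespace is collapsed by the
+ * caller afterwards (illustration only).</p>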
*/\r
private String cleanBuffer (StringBuffer strbf) {\r
String str = strbf.toString().toLowerCase();\r
        StringBuffer tempStrBuf = new StringBuffer("");
        StringBuffer tempCharBuf = new StringBuffer("");
if ((cleanUpList == null) || (cleanUpList.isEmpty())){\r
// Default clean-up\r
- \r
+\r
// Should perhaps eliminate the words at the end of the table?\r
tempStrBuf.append("(?i)\\bthe\\b|\\ba\\b|\\ban\\b|\\bto\\b|\\band\\b|\\bor\\b");//(?i) ignores the case\r
tempStrBuf.append("|\\bis\\b|\\bare\\b|\\bin\\b|\\bwith\\b|\\bbe\\b|\\bcan\\b");\r
tempStrBuf.append("|\\bI\\b|\\bme\\b|\\bmy\\b");\r
\r
            str = str.replaceFirst("Copyright © 1998-2007 NexWave Solutions.", " ");
- \r
+\r
\r
//nqu 25.01.2008 str = str.replaceAll("\\b.\\b|\\\\", " ");\r
// remove contiguous white charaters\r
- //nqu 25.01.2008 str = str.replaceAll("\\s+", " "); \r
+ //nqu 25.01.2008 str = str.replaceAll("\\s+", " ");\r
        } else {
            // Clean-up using the props files: build the stop-word alternation
            tempStrBuf.append("\\ba\\b");
            Iterator itCleanUp = cleanUpList.iterator();
            while (itCleanUp.hasNext()) {
                tempStrBuf.append("|\\b").append(itCleanUp.next()).append("\\b");
            }
        }

        str = minimalClean(str, tempStrBuf, tempCharBuf);
        return str;
    }
- \r
+\r
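+    /**
+     * Always-on clean-up: collapses whitespace, strips "->" and the
+     * European/Japanese punctuation classes from IndexerConstants, removes
+     * the stop words accumulated in tempStrBuf, then applies any extra
+     * punctuation pattern collected in tempCharBuf.
+     */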
private String minimalClean(String str, StringBuffer tempStrBuf, StringBuffer tempCharBuf) {\r
String tempPunctuation = new String(tempCharBuf);\r
- \r
+\r
str = str.replaceAll("\\s+", " ");\r
str = str.replaceAll("->", " ");\r
str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " ");\r
\r
//remove useless words\r
str = str.replaceAll(tempStrBuf.toString(), " ");\r
- \r
+\r
// Redo punctuation after removing some words: (TODO: useful?)\r
str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " ");\r
str = str.replaceAll(IndexerConstants.EUPUNCTUATION2, " ");\r
str = str.replaceAll(IndexerConstants.JPPUNCTUATION1, " ");\r
str = str.replaceAll(IndexerConstants.JPPUNCTUATION2, " ");\r
- str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " "); \r
+ str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " ");\r
if (tempPunctuation.length() > 0)\r
{\r
str = str.replaceAll(tempPunctuation, " ");\r