/**
* The content language defaults to English "en"
- * @param htmlDir The directory where html files resides.
+ * @param htmlDir The directory where html files reside.
*/
public IndexerMain(String htmlDir) {
super();
System.out.println("Delay = " + diff / 1000 + " seconds");
} else {
System.out.println(txt_wrong_dita_basedir);
- return;
}
}
+++ /dev/null
-/*\r
-package com.nexwave.nquindexer;\r
-\r
-import java.io.File;\r
-import java.io.FileInputStream;\r
-import java.io.IOException;\r
-import java.util.ArrayList;\r
-import java.util.Collection;\r
-import java.util.Date;\r
-import java.util.HashMap;\r
-import java.util.Iterator;\r
-import java.util.Map;\r
-import java.util.Properties;\r
-\r
-import org.apache.tools.ant.BuildException;\r
-import org.apache.tools.ant.Task;\r
-\r
-\r
-import com.nexwave.nsidita.DirList;\r
-import com.nexwave.nsidita.DocFileInfo;\r
-\r
-*/\r
-/**\r
- * Indexer ant task.\r
- * \r
- * @version 1.0 2008-02-26\r
- * \r
- * @author N. Quaine\r
- * @author Kasun Gajasinghe <http://kasunbg.blogspot.com>\r
- *//*\r
-\r
-public class IndexerTask extends Task {\r
-\r
- // messages\r
- private String txt_no_inputdir = "Input directory not found:";\r
- private String txt_cannot_create_outputdir = "Cannot create output search directory.";\r
- private String txt_no_files_found = "No html files found.";\r
- private String txt_wrong_dita_basedir = "ERROR: Parser initialization failed. Wrong dita base dir";\r
- private String txt_no_relative_files_found= "No relative html files calculated.";\r
- private String txt_no_words_gathered= "No words have been indexed in";\r
- private String txt_no_html_files="No HTML Files found in";\r
- private String txt_no_args="No argument given: you must provide an htmlDir to the IndexerTask";\r
- \r
- //working directories\r
- private String searchdir = "search";\r
- private File inputDir = null;\r
- private String outputDir = null;\r
- private String projectDir = null;\r
-\r
- // ANT parameters\r
- private String htmlDir=null;\r
- public static String indexerLanguage="en";\r
-\r
- //supported languages: add new additions to this. don't include country codes to the end such as en_US or en_UK,\r
- // as stemmers doesn't find a difference between them.\r
- private String[] supportedLanguages= {"en", "de", "fr", "zh", "ja", "ko"}; //currently extended support available for\r
- // English, German, French and CJK (Chinese [zh], Japanese [ja], Korean [ko]) languages only.\r
-\r
- // Indexing features: words to remove\r
- private ArrayList<String> cleanUpStrings = null; \r
- private ArrayList<String> cleanUpChars = null;\r
-\r
- //Html extension\r
- private String htmlExtension = "html";\r
- \r
- // Constructor\r
- public IndexerTask() {\r
- super();\r
- }\r
- */\r
-/** The setter for the "htmlDir" attribute (parameter of the task)\r
- * @param htmldir\r
- * @throws InterruptedException \r
- *//*\r
-\r
- public void setHtmlDir(String htmlDir) {\r
- this.htmlDir = htmlDir;\r
- }\r
-\r
- */\r
-/**\r
- * Set the extension in which html files are generated\r
- * @param htmlExtension The extension in wich html files are generated\r
- *//*\r
-\r
- public void setHtmlextension(String htmlExtension) {\r
- this.htmlExtension = htmlExtension;\r
- //Trim the starting "."\r
- if(this.htmlExtension.startsWith(".")) {\r
- this.htmlExtension = this.htmlExtension.substring(1);\r
- }\r
- }\r
-\r
- */\r
-/**\r
- * setter for "indexerLanguage" attribute from ANT\r
- * @param indexerLanguage language for the search indexer. Used to differerentiate which stemmer to be used.\r
- * @throws InterruptedException for ant\r
- *//*\r
-\r
- public void setIndexerLanguage(String indexerLanguage){\r
- if(indexerLanguage !=null && !"".equals(indexerLanguage)) {\r
- int temp = indexerLanguage.indexOf('_');\r
- if( temp != -1){\r
- indexerLanguage = indexerLanguage.substring(0,temp);\r
- }\r
- int i=0;\r
- for (;i<supportedLanguages.length;i++) {\r
- if(indexerLanguage.equals(supportedLanguages[i])){\r
- IndexerTask.indexerLanguage = supportedLanguages[i];\r
- break;\r
- }\r
- }\r
- \r
- //if not in supported language list,\r
- if(i>=supportedLanguages.length){\r
-// System.out.println("The given language, \""+indexerLanguage+"\", does not have extensive support for " +\r
-// "searching. Check documentation for details. ");\r
- IndexerTask.indexerLanguage = indexerLanguage;\r
- } \r
- } else {\r
- IndexerTask.indexerLanguage = "@@"; //fail-safe mechanism, This vm should not reach this point.\r
- } \r
- }\r
- \r
- */\r
-/**\r
- * Implementation of the execute function (Task interface)\r
- *//*\r
-\r
- public void execute() throws BuildException {\r
- try{\r
- //Use Xerces as the parser. Does not support Saxon6.5.5 parser \r
- System.setProperty("org.xml.sax.driver", "org.apache.xerces.parsers.SAXParser");\r
- System.setProperty("javax.xml.parsers.SAXParserFactory", "org.apache.xerces.jaxp.SAXParserFactoryImpl");\r
-// System.setProperty("org.xml.sax.driver", "com.icl.saxon.aelfred.SAXDriver");\r
-// System.setProperty("javax.xml.parsers.SAXParserFactory", "com.icl.saxon.aelfred.SAXParserFactoryImpl");\r
- } catch (SecurityException se){\r
- System.out.println("[WARNING] Default parser is not set to Xerces. Make sure Saxon6.5.5 " +\r
- "is not in your CLASSPATH.");\r
- } catch (Exception e){\r
- System.out.println("[WARNING] Default parser is not set to Xerces. Make sure Saxon6.5.5 " +\r
- "is not in your CLASSPATH");\r
- }\r
-\r
- ArrayList<DocFileInfo> filesDescription = null; // list of information about the topic files\r
- ArrayList<File> htmlFiles = null; // topic files listed in the given directory\r
- ArrayList<String> htmlFilesPathRel = null;\r
- Map<String, String> tempDico = new HashMap<String, String>(); \r
- Iterator it;\r
- \r
- //File name initialization\r
- String htmlList = "htmlFileList.js";\r
- String htmlInfoList = "htmlFileInfoList.js";\r
- String indexName = ".js";\r
- \r
- //timing\r
- Date dateStart = new Date();\r
- \r
- if (htmlDir == null) {\r
- System.out.println(txt_no_args + ".");\r
- return;\r
- }\r
- // Init input directory\r
- inputDir = new File(htmlDir);\r
-\r
- // Begin of init\r
- // check if inputdir initialized\r
- if (inputDir == null) {\r
- DisplayHelp();\r
- return;\r
- }\r
- \r
- // check if inputdir exists \r
- if (!inputDir.exists()) {\r
- System.out.println(txt_no_inputdir + " "+ inputDir + ".");\r
- return;\r
- }\r
- \r
- // check if outputdir defined\r
- if (outputDir == null) {\r
- //set the output directory: path= {inputDir}/search \r
- outputDir = inputDir.getPath().concat(File.separator).concat(searchdir);\r
- }\r
-\r
- // check if outputdir exists\r
- File tempfile = new File(outputDir); \r
- if (!tempfile.exists()) {\r
- boolean b = (new File(outputDir)).mkdir();\r
- if (!b) {\r
- System.out.println(txt_cannot_create_outputdir + " "+ outputDir + ".");\r
- return;\r
- }\r
- }\r
- \r
- // check if projdir is defined\r
- if (projectDir == null) {\r
- projectDir = inputDir.getPath();\r
- }\r
- //end of init\r
- \r
-\r
- // Get the list of all html files but the tocs, covers and indexes\r
- DirList nsiDoc = new DirList(inputDir, "^.*\\." + htmlExtension + "?$", 1);\r
- htmlFiles = nsiDoc.getListFiles();\r
- // Check if found html files\r
- if (htmlFiles.isEmpty()) {\r
- System.out.println(txt_no_html_files + " "+ inputDir + ".");\r
- return;\r
- }\r
- // Get the list of all html files with relative paths \r
- htmlFilesPathRel = nsiDoc.getListFilesRelTo(projectDir);\r
- \r
- if (htmlFiles == null) {\r
- System.out.println(txt_no_files_found);\r
- return;\r
- } else if (htmlFilesPathRel == null) {\r
- System.out.println(txt_no_relative_files_found);\r
- return; \r
- }\r
- \r
- // Create the list of the existing html files (index starts at 0)\r
- WriteJSFiles.WriteHTMLList(outputDir.concat(File.separator).concat(htmlList), htmlFilesPathRel);\r
- \r
- // Parse each html file to retrieve the words:\r
- // ------------------------------------------\r
- \r
- // Retrieve the clean-up properties for indexing\r
- RetrieveCleanUpProps();\r
- // System.out.print("clean"+" " +cleanUpStrings);\r
- \r
- //create a default handler\r
- //SaxHTMLIndex spe = new SaxHTMLIndex (); // do not use clean-up props files\r
- //SaxHTMLIndex spe = new SaxHTMLIndex (cleanUpStrings); // use clean-up props files\r
- SaxHTMLIndex spe = new SaxHTMLIndex (cleanUpStrings, cleanUpChars); // use clean-up props files\r
-\r
- if ( spe.init(tempDico) == 0 ) {\r
-\r
- //create a html file description list\r
- filesDescription = new ArrayList <DocFileInfo> ();\r
- \r
- it = htmlFiles.iterator ( ) ;\r
- \r
- // parse each html files\r
- while ( it.hasNext ( ) ) {\r
- File ftemp = (File) it.next();\r
- //tempMap.put(key, value);\r
- //The HTML file information are added in the list of FileInfoObject\r
- DocFileInfo docFileInfoTemp = new DocFileInfo(spe.runExtractData(ftemp,indexerLanguage));\r
- \r
- ftemp = docFileInfoTemp.getFullpath();\r
- String stemp = ftemp.toString(); \r
- int i = stemp.indexOf(projectDir);\r
- if ( i != 0 ) {\r
- System.out.println("the documentation root does not match with the documentation input!");\r
- return;\r
- }\r
- int ad = 1;\r
- if (stemp.equals(projectDir)) ad = 0; \r
- stemp = stemp.substring(i+projectDir.length()+ad); //i is redundant (i==0 always)\r
- ftemp = new File (stemp);\r
- docFileInfoTemp.setFullpath(ftemp);\r
- \r
- filesDescription.add(docFileInfoTemp);\r
- }\r
- */\r
-/*remove empty strings from the map*//*\r
-\r
- if (tempDico.containsKey("")) {\r
- tempDico.remove("");\r
- }\r
- // write the index files\r
- if (tempDico.isEmpty()) {\r
- System.out.println(txt_no_words_gathered + " "+ inputDir + ".");\r
- return;\r
- }\r
- \r
- WriteJSFiles.WriteIndex(outputDir.concat(File.separator).concat(indexName), tempDico);\r
- \r
- // write the html list file with title and shortdesc\r
- //create the list of the existing html files (index starts at 0)\r
- WriteJSFiles.WriteHTMLInfoList(outputDir.concat(File.separator).concat(htmlInfoList), filesDescription);\r
- \r
- //perf measurement\r
- Date dateEnd = new Date();\r
- long diff = dateEnd.getTime() - dateStart.getTime();\r
- if(diff<1000)\r
- System.out.println("Delay = " + diff + " milliseconds");\r
- else\r
- System.out.println("Delay = " + diff/1000 + " seconds");\r
- }else {\r
- System.out.println(txt_wrong_dita_basedir);\r
- return;\r
- }\r
- }\r
- \r
- */\r
-/**\r
- * Prints the usage information for this class to <code>System.out</code>.\r
- *//*\r
-\r
- private static void DisplayHelp() {\r
- String lSep = System.getProperty("line.separator");\r
- StringBuffer msg = new StringBuffer();\r
- msg.append("USAGE:" + lSep); \r
- msg.append(" java -classpath TesterIndexer inputDir outputDir projectDir" + lSep);\r
- msg.append("with:" + lSep);\r
- msg.append(" inputDir (mandatory) : specify the html files ' directory to index" + lSep);\r
- msg.append(" outputDir (optional) : specify where to output the index files" + lSep);\r
- msg.append(" projectDir (optional) : specify the root of the documentation directory" + lSep);\r
- msg.append("Example:" + lSep);\r
- msg.append(" java -classpath TesterIndexer /home/$USER/DITA/doc" + lSep);\r
- msg.append("Example 2:" + lSep);\r
- msg.append(" java -classpath TesterIndexer /home/$USER/DITA/doc/customer/concepts /home/$USER/temp/search /home/$USER/DITA/doc/" + lSep);\r
- System.out.println(msg.toString());\r
- }\r
- private int RetrieveCleanUpProps (){\r
-\r
- // Files for punctuation (only one for now)\r
- String[] punctuationFiles = new String[] {"punctuation.props"};\r
- FileInputStream input;\r
- String tempStr;\r
- File ftemp;\r
- Collection c = new ArrayList<String>();\r
-\r
- // Get the list of the props file containing the words to remove (not the punctuation)\r
- DirList props = new DirList(inputDir, "^(?!(punctuation)).*\\.props$", 1);\r
- ArrayList<File> wordsList = props.getListFiles();\r
-// System.out.println("props files:"+wordsList);\r
- //TODO all properties are taken to a single arraylist. does it ok?.\r
- Properties enProps =new Properties ();\r
- String propsDir = inputDir.getPath().concat(File.separator).concat(searchdir);\r
- \r
- // Init the lists which will contain the words and chars to remove \r
- cleanUpStrings = new ArrayList<String>();\r
- cleanUpChars = new ArrayList<String>();\r
- \r
- try {\r
- // Retrieve words to remove\r
- for (File aWordsList : wordsList) {\r
- ftemp = aWordsList;\r
- if (ftemp.exists()) {\r
- enProps.load(input = new FileInputStream(ftemp.getAbsolutePath()));\r
- input.close();\r
- c = enProps.values();\r
- cleanUpStrings.addAll(c);\r
- enProps.clear();\r
- }\r
- }\r
-\r
- // Retrieve char to remove (punctuation for ex.)\r
- for (String punctuationFile : punctuationFiles) {\r
- tempStr = propsDir.concat(File.separator).concat(punctuationFile);\r
- ftemp = new File(tempStr);\r
- if (ftemp.exists()) {\r
- enProps.load(input = new FileInputStream(tempStr));\r
- input.close();\r
- c = enProps.values();\r
- cleanUpChars.addAll(c);\r
- enProps.clear();\r
- }\r
- }\r
- }\r
- catch (IOException e) {\r
- e.printStackTrace();\r
- return 1;\r
- }\r
- return 0;\r
- }\r
-\r
-}\r
-*/\r
\r
/**\r
* Generic parser for populating a DocFileInfo object.\r
- * \r
+ *\r
* @version 2.0 2010-08-14\r
- * \r
+ *\r
* @author N. Quaine\r
* @author Kasun Gajasinghe\r
*/\r
public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {\r
- \r
+\r
//members\r
protected DocFileInfo fileDesc = null;\r
protected String projectDir = null;\r
public SaxDocFileParser () {\r
\r
}\r
- \r
+\r
/**\r
* Initializer\r
*/\r
}\r
\r
/**\r
- * Parses the file to extract all the words for indexing and \r
- * some data characterizing the file. \r
- * @param file contains the fullpath of the document to parse \r
+ * Parses the file to extract all the words for indexing and\r
+ * some data characterizing the file.\r
+ * @param file contains the fullpath of the document to parse\r
* @return a DitaFileInfo object filled with data describing the file\r
*/\r
public DocFileInfo runExtractData(File file) {\r
//initialization\r
fileDesc = new DocFileInfo(file);\r
strbf = new StringBuffer("");\r
- \r
+\r
// Fill strbf by parsing the file\r
parseDocument(file);\r
\r
public void parseDocument (File file) {\r
// System.out.println(System.getProperty("org.xml.sax.driver"));\r
// System.out.println(System.getProperty("javax.xml.parsers.SAXParserFactory"));\r
- \r
+\r
//get a factory\r
javax.xml.parsers.SAXParserFactory spf = javax.xml.parsers.SAXParserFactory.newInstance();\r
\r
\r
//parse the file and also register this class for call backs\r
//System.out.println("Parsing: " + file);\r
- \r
+\r
long start = System.currentTimeMillis();\r
//System.out.println("about to parse " + file.getName() + " >>> " + start);\r
\r
is.setSystemId(file.toURI().toURL().toString());\r
sp.parse(is, this);\r
}\r
- \r
+\r
long finish = System.currentTimeMillis();\r
//System.out.println("done parsing " + file.getName() + " >>> " + finish);\r
//System.out.println("time = " + (finish - start) + " milliseconds");\r
- \r
+\r
}catch(SAXParseException spe){\r
System.out.println("SaxParseException: The indexing file contains incorrect xml syntax.");\r
spe.printStackTrace();\r
}catch(org.xml.sax.SAXException se) {\r
System.out.println("SaxException. You may need to include Xerces in your classpath. " +\r
"See documentation for details");\r
- se.printStackTrace(); \r
+ se.printStackTrace();\r
}catch(javax.xml.parsers.ParserConfigurationException pce) {\r
pce.printStackTrace();\r
}catch (IOException ie) {\r
ie.printStackTrace();\r
}\r
}\r
- \r
+\r
private boolean addContent = false;\r
private boolean addHeaderInfo = false;\r
private boolean doNotIndex=false;\r
if((qName.equalsIgnoreCase("meta")) ) {\r
addHeaderInfo = true;\r
String attrName = attributes.getValue("name");\r
- // OXYGEN PATCH START EXM-20576 - add scoring for keywords
- if(attrName != null && (attrName.equalsIgnoreCase("keywords")
- || attrName.equalsIgnoreCase("description")
- || attrName.equalsIgnoreCase("indexterms")
- )){
- if (attrName.equalsIgnoreCase("keywords")) {
- String[] keywords = attributes.getValue("content").split(", ");
- for (int i = 0; i < keywords.length; i++) {
- strbf.append(" " + keywords[i] + "@@@elem_meta_keywords@@@ ");
- }
- } else if (attrName.equalsIgnoreCase("indexterms")) {
- String[] indexterms = attributes.getValue("content").split(", ");
- for (int i = 0; i < indexterms.length; i++) {
- strbf.append(" " + indexterms[i] + "@@@elem_meta_indexterms@@@ ");
- }
- } else {
- strbf.append(" " + attributes.getValue("content") + " ");
- }
- }
- // OXYGEN PATCH END EXM-20576 - add scoring for indexterms
+ // OXYGEN PATCH START EXM-20576 - add scoring for keywords\r
+ if(attrName != null && (attrName.equalsIgnoreCase("keywords")\r
+ || attrName.equalsIgnoreCase("description")\r
+ || attrName.equalsIgnoreCase("indexterms")\r
+ )){\r
+ if (attrName.equalsIgnoreCase("keywords")) {\r
+ String[] keywords = attributes.getValue("content").split(", ");\r
+ for (String keyword : keywords) {\r
+ strbf.append(" ").append(keyword).append("@@@elem_meta_keywords@@@ ");\r
+ }\r
+ } else if (attrName.equalsIgnoreCase("indexterms")) {\r
+ String[] indexterms = attributes.getValue("content").split(", ");\r
+ for (String indexterm : indexterms) {\r
+ strbf.append(" ").append(indexterm).append("@@@elem_meta_indexterms@@@ ");\r
+ }\r
+ } else {\r
+                strbf.append(" ").append(attributes.getValue("content")).append(" ");\r
+ }\r
+ }\r
+ // OXYGEN PATCH END EXM-20576 - add scoring for indexterms\r
// dwc: adding this to make the docbook <abstract> element\r
// (which becomes <meta name="description".../> in html)\r
// into the brief description that shows up in search\r
tempVal = new StringBuffer();\r
}\r
\r
- if(qName.equalsIgnoreCase("meta") || qName.equalsIgnoreCase("title") || qName.equalsIgnoreCase("shortdesc")){\r
- addHeaderInfo = true;\r
- } else {\r
- addHeaderInfo = false;\r
- }\r
+ addHeaderInfo = qName.equalsIgnoreCase("meta") || qName.equalsIgnoreCase("title") || qName.equalsIgnoreCase("shortdesc");\r
\r
- String elementId = attributes.getValue("id"); \r
+ String elementId = attributes.getValue("id");\r
if("content".equals(elementId)) addContent = true;\r
\r
if(addContent) {\r
}\r
\r
String accessKey = attributes.getValue("accesskey");\r
- if(accessKey!=null && ("n".equals(accessKey) || "p".equals(accessKey) || "h".equals(accessKey))){\r
- doNotIndex = true;\r
- } else {\r
- doNotIndex = false;\r
- }\r
+ doNotIndex = accessKey != null && ("n".equals(accessKey) || "p".equals(accessKey) || "h".equals(accessKey));\r
}\r
strbf.append(" ");\r
}\r
\r
// index certain elements. E.g. Use this to implement a\r
// "titles only" index,\r
- \r
+\r
//OXYGEN PATCH, gather more keywords.\r
if(\r
// (addContent || addHeaderInfo) && \r
// Do a minimal clean\r
text = minimalClean(text, null, null);\r
text = text.replaceAll("\\s+"," ");\r
- String marker = "@@@elem_" + stack.peek() + "@@@ ";
- Matcher m = Pattern.compile("(\\w|-|:)+").matcher(text);
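+            // tag each word with an "@@@elem_<name>@@@" marker naming its enclosing element; the scoring code later uses it to weight the word\r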
+ String marker = "@@@elem_" + stack.peek() + "@@@ ";\r
+ Matcher m = Pattern.compile("(\\w|-|:)+").matcher(text);\r
if (text.trim().length() > 0 && m.find()) {\r
- String copyText = new String(originalText);
- text = duplicateWords(copyText, text, "-");
- copyText = new String(originalText);
- text = duplicateWords(copyText, text, ":");
- copyText = new String(originalText);
- text = duplicateWords(copyText, text, ".");
+ String copyText = new String(originalText);\r
+ text = duplicateWords(copyText, text, "-");\r
+ copyText = new String(originalText);\r
+ text = duplicateWords(copyText, text, ":");\r
+ copyText = new String(originalText);\r
+ text = duplicateWords(copyText, text, ".");\r
// Replace whitespace with the marker\r
text = text.replace(" ", marker);\r
text = text + marker;\r
}\r
// END OXYGEN PATCH\r
strbf.append(text);\r
-// System.out.println("=== marked text: " + text);
+// System.out.println("=== marked text: " + text);\r
// START OXYGEN PATCH, append the original text\r
if (tempVal != null) { tempVal.append(originalText);}\r
// END OXYGEN PATCH\r
}\r
}\r
- \r
- // START OXYGEN PATCH EXM-20414
- private String duplicateWords(String sourceText, String acumulator, String separator) {
-// System.out.println("sourceText: " + sourceText + " separator: " + separator);
- int index = sourceText.indexOf(separator);
- while (index >= 0) {
- int indexSpaceAfter = sourceText.indexOf(" ", index);
- String substring = null;
- if (indexSpaceAfter >= 0) {
- substring = sourceText.substring(0, indexSpaceAfter);
- sourceText = sourceText.substring(indexSpaceAfter);
- } else {
- substring = sourceText;
- sourceText = "";
- }
-
- int indexSpaceBefore = substring.lastIndexOf(" ");
- if (indexSpaceBefore >= 0) {
- substring = substring.substring(indexSpaceBefore + 1);
- }
- if (separator.indexOf(".") >= 0) {
- separator = separator.replaceAll("\\.", "\\\\.");
-// System.out.println("++++++++++ separator: " + separator);
- }
- String[] tokens = substring.split(separator);
-
- for (int i = 0; i < tokens.length; i++) {
- acumulator = acumulator + " " + tokens[i];
-// System.out.println("added token: " + tokens[i] + " new text: " + acumulator);
- }
-
- index = sourceText.indexOf(separator);
- }
-
- return acumulator;
- }
- // END OXYGEN PATCH EXM-20414
+\r
+ // START OXYGEN PATCH EXM-20414\r
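+    /**\r
+     * For each word in sourceText that contains the given separator (e.g. "-",\r
+     * ":" or "."), split the word and append its parts to the accumulator, so\r
+     * that both the compound word and its components become searchable.\r
+     */\r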
+    private String duplicateWords(String sourceText, String accumulator, String separator) {\r
+// System.out.println("sourceText: " + sourceText + " separator: " + separator);\r
+ int index = sourceText.indexOf(separator);\r
+ while (index >= 0) {\r
+ int indexSpaceAfter = sourceText.indexOf(" ", index);\r
+ String substring = null;\r
+ if (indexSpaceAfter >= 0) {\r
+ substring = sourceText.substring(0, indexSpaceAfter);\r
+ sourceText = sourceText.substring(indexSpaceAfter);\r
+ } else {\r
+ substring = sourceText;\r
+ sourceText = "";\r
+ }\r
+\r
+ int indexSpaceBefore = substring.lastIndexOf(" ");\r
+ if (indexSpaceBefore >= 0) {\r
+ substring = substring.substring(indexSpaceBefore + 1);\r
+ }\r
+ if (separator.indexOf(".") >= 0) {\r
+ separator = separator.replaceAll("\\.", "\\\\.");\r
+// System.out.println("++++++++++ separator: " + separator);\r
+ }\r
+ String[] tokens = substring.split(separator);\r
+\r
+ for (String token : tokens) {\r
+                accumulator = accumulator + " " + token;\r
+//                System.out.println("added token: " + token + " new text: " + accumulator);\r
+ }\r
+\r
+ index = sourceText.indexOf(separator);\r
+ }\r
+\r
+        return accumulator;\r
+ }\r
+ // END OXYGEN PATCH EXM-20414\r
public void endElement(String uri, String localName, String qName) throws org.xml.sax.SAXException {\r
// START OXYGEN PATCH, remove element from stack\r
stack.pop();\r
else if (shortdescBool) {\r
shortTagCpt --;\r
if (shortTagCpt == 0) {\r
- String shortdesc = tempVal.toString().replace('\n', ' ');
- if(shortdesc.trim().length() > 0) {
- fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(shortdesc));
- }
+ String shortdesc = tempVal.toString().replace('\n', ' ');\r
+ if(shortdesc.trim().length() > 0) {\r
+ fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(shortdesc));\r
+ }\r
tempVal = null;\r
shortdescBool = false;\r
}\r
}\r
- \r
+\r
if(qName.equalsIgnoreCase("div") && addContent){\r
divCount--;\r
if (divCount == 0) {\r
addContent = false;\r
}\r
- } \r
+ }\r
}\r
- \r
+\r
public void processingInstruction(String target, String data) throws org.xml.sax.SAXException {\r
//do nothing\r
- \r
+\r
}\r
- \r
- /*public InputSource resolveEntity(String publicId, String systemId) \r
+\r
+ /*public InputSource resolveEntity(String publicId, String systemId)\r
throws IOException, SAXException {\r
- \r
+\r
// use the catalog to solve the doctype\r
System.out.println("entities " + publicId + systemId);\r
return null;\r
throws org.xml.sax.SAXException, IOException {\r
//System.out.println("Entities " + publicId + "and" + systemId);\r
-// use dita ot (dost.jar) for resolving dtd paths using the calatog\r
+// use dita ot (dost.jar) for resolving dtd paths using the catalog\r
- \r
+\r
return null;\r
}\r
\r
/**\r
- * Removes the validation in html files, such as xml version and DTDs \r
- * @param file\r
+ * Removes the validation in html files, such as xml version and DTDs\r
+ * @param file the html file\r
- * @return int: returns 0 if no IOException occurs, else 1.\r
+ * @return the file content as a String with the validation PIs removed, or null if an error occurs.\r
*/\r
public String RemoveValidationPI (File file) {\r
int i1, i2;\r
boolean ok = true;\r
try {\r
- \r
+\r
String line = br.readLine();\r
- \r
- \r
+\r
if (line == null) {\r
break;\r
}\r
//ok = line.matches("(.)*\\x26nbsp\\x3B(.)*");\r
- \r
+\r
line = line.replaceAll("\\x26nbsp\\x3B", " ");\r
- \r
+\r
if (!line.contains("<!DOCTYPE html PUBLIC")) {\r
//dwc: This doesn't really apply to me. I already omit the xml pi for other reasons.\r
if (line.contains("<?xml version")) {\r
line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");\r
}\r
\r
- sb.append(line + "\n");\r
- } else \r
+ sb.append(line).append("\n");\r
+ } else\r
{\r
//dwc: What is this trying to do? Nuke the DOCTYPE? Why?\r
i1 = line.indexOf("<!DOCTYPE");\r
i2 = line.indexOf(">", i1);\r
while (i2 < 0) {\r
- \r
+\r
line = line.concat(br.readLine());\r
i2 = line.indexOf(">", i1);\r
}\r
String temp = line.substring(i1, i2);\r
- \r
+\r
//ok = line.matches("(.)*\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E(.)*");\r
if (line.contains("<?xml version")) {\r
line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");\r
{\r
return null;\r
}\r
- \r
+\r
return sb.toString(); // return status\r
\r
}\r
private int SCORING_FOR_BOLD = 5;\r
private int SCORING_FOR_ITALIC = 3;\r
private int SCORING_FOR_NORMAL_TEXT = 1;\r
- private int SCORING_FOR_KEYWORD = 100;
- private int SCORING_FOR_INDEXTERM = 75;
+ private int SCORING_FOR_KEYWORD = 100;\r
+ private int SCORING_FOR_INDEXTERM = 75;\r
\r
/**\r
* The list with the word and scoring object\r
}\r
/**\r
* Constructor\r
- */\r
+ * @param cleanUpStrings the list of words to remove from the index\r
+ */\r
public SaxHTMLIndex (ArrayList <String> cleanUpStrings) {\r
super();\r
cleanUpList = cleanUpStrings;\r
}\r
/**\r
* Constructor\r
- */\r
+ * @param cleanUpStrings the list of words to remove from the index\r
+ * @param cleanUpChars the list of punctuation characters to remove\r
+ */\r
public SaxHTMLIndex (ArrayList <String> cleanUpStrings, ArrayList <String> cleanUpChars) {\r
super();\r
cleanUpList = cleanUpStrings;\r
\r
/**\r
* Initializer\r
- */\r
+ * @param tempMap the map that collects the indexed words and their scoring entries\r
+ */\r
public int init(Map<String,String> tempMap){\r
tempDico = tempMap;\r
return 0;\r
//Do Stemming for words in items\r
//TODO currently, stemming support is for english and german only. Add support for other languages as well.\r
\r
- // START OXYGEN PATCH
- wsList = new ArrayList<WordAndScoring>();
+ // START OXYGEN PATCH\r
+ wsList = new ArrayList<WordAndScoring>();\r
// START OXYGEN PATCH, create the words and scoring list\r
// String[] tokenizedItems;\r
// END OXYGEN PATCH\r
|| indexerLanguage.equalsIgnoreCase("ko")){\r
LinkedList<String> tokens = new LinkedList<String>();\r
try{\r
- //EXM-21501 Oxygen patch, replace the extra "@@@"s.
- str = str.replaceAll("@@@([^\\s]*)@@@", "");
+ //EXM-21501 Oxygen patch, replace the extra "@@@"s.\r
+ str = str.replaceAll("@@@([^\\s]*)@@@", "");\r
CJKAnalyzer analyzer = new CJKAnalyzer(org.apache.lucene.util.Version.LUCENE_30);\r
Reader reader = new StringReader(str);\r
TokenStream stream = analyzer.tokenStream("", reader);\r
while (stream.incrementToken()) {\r
String term = termAtt.term();\r
tokens.add(term);\r
- WordAndScoring ws = new WordAndScoring(term, term, 1);
- boolean found = false;
- for (int i = 0; i < wsList.size(); i++) {
- // If the stem of the current word is already in list,
- // do not add the word in the list, just recompute scoring
- if (wsList.get(i).getStem().equals(ws.getStem())) {
- found = true;
- int scoring = wsList.get(i).getScoring();
- wsList.get(i).setScoring(scoring + ws.getScoring());
- break;
+ WordAndScoring ws = new WordAndScoring(term, term, 1);\r
+ boolean found = false;\r
+ for (WordAndScoring aWsList : wsList) {\r
+ // If the stem of the current word is already in list,\r
+ // do not add the word in the list, just recompute scoring\r
+ if (aWsList.getStem().equals(ws.getStem())) {\r
+ found = true;\r
+ int scoring = aWsList.getScoring();\r
+ aWsList.setScoring(scoring + ws.getScoring());\r
+ break;\r
+ }\r
+\r
+ }\r
+ if (!found) {\r
+ wsList.add(ws);\r
+ }\r
}\r
-\r
- }
- if (!found) {
- wsList.add(ws);
- }
- }
// START OXYGEN PATCH\r
//tokenizedItems = tokens.toArray(new String[tokens.size()]);\r
// END OXYGEN PATCH\r
} else if (indexerLanguage.equalsIgnoreCase("fr")){\r
stemmer= new FrenchStemmer();\r
} else {\r
- stemmer = null;//Languages which stemming is not yet supproted.So, No stemmers will be used.\r
+            stemmer = null;//Language for which stemming is not yet supported, so no stemmer will be used.\r
}\r
// START OXYGEN PATCH\r
wsList = new ArrayList<WordAndScoring>();\r
WordAndScoring ws = getWordAndScoring(token, stemmer, stem);\r
if (ws != null) {\r
boolean found = false;\r
- for (int i = 0; i < wsList.size(); i++) { \r
- // If the stem of the current word is already in list, \r
- // do not add the word in the list, just recompute scoring\r
- if (wsList.get(i).getStem().equals(ws.getStem())) {\r
- found = true;\r
- int scoring = wsList.get(i).getScoring();\r
- wsList.get(i).setScoring(scoring + ws.getScoring());\r
- break;\r
- }\r
- }\r
+ for (WordAndScoring aWsList : wsList) {\r
+ // If the stem of the current word is already in list,\r
+ // do not add the word in the list, just recompute scoring\r
+ if (aWsList.getStem().equals(ws.getStem())) {\r
+ found = true;\r
+ int scoring = aWsList.getScoring();\r
+ aWsList.setScoring(scoring + ws.getScoring());\r
+ break;\r
+ }\r
+ }\r
if (!found) {\r
wsList.add(ws);\r
}\r
;\r
//System.out.println("temp="+s+"="+temp);\r
tempDico.put(s.getStem(), temp);\r
- }else {\r
- String temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring()));\r
- tempDico.put(s.getStem(), temp);\r
- }\r
+ }else if (s != null) {\r
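+            // the s != null guard skips entries for which no word/scoring object exists, avoiding a NullPointerException\r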
+            String temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring()));\r
+ tempDico.put(s.getStem(), temp);\r
+ }\r
// END OXYGEN PATCH\r
}\r
\r
scoring = SCORING_FOR_ITALIC;\r
} else if ("strong".equalsIgnoreCase(elementName)) {\r
scoring = SCORING_FOR_BOLD;\r
- } else if ("meta_keywords".equalsIgnoreCase(elementName)) {
- scoring = SCORING_FOR_KEYWORD;
- } else if ("meta_indexterms".equalsIgnoreCase(elementName)) {
- scoring = SCORING_FOR_INDEXTERM;
+ } else if ("meta_keywords".equalsIgnoreCase(elementName)) {\r
+ scoring = SCORING_FOR_KEYWORD;\r
+ } else if ("meta_indexterms".equalsIgnoreCase(elementName)) {\r
+ scoring = SCORING_FOR_INDEXTERM;\r
}\r
// Get the stemmed word\r
String stemWord = word;\r
}else {\r
// Clean-up using the props files\r
tempStrBuf.append("\\ba\\b");\r
- Iterator it = cleanUpList.iterator();\r
- while (it.hasNext()){\r
- tempStrBuf.append("|\\b").append(it.next()).append("\\b");\r
- }\r
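+            // build a single alternation of word-boundary patterns ("\ba\b|\bword\b|...") from the clean-up word list\r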
+ for (String aCleanUp : cleanUpList) {\r
+                tempStrBuf.append("|\\b").append(aCleanUp).append("\\b");\r
+ }\r
}\r
if ((cleanUpPunctuation != null) && (!cleanUpPunctuation.isEmpty())){\r
tempCharBuf.append("\\u3002");\r
- Iterator it = cleanUpPunctuation.iterator();\r
- while (it.hasNext()){\r
- tempCharBuf.append("|"+it.next());
- }\r
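+            // likewise, build an alternation of the punctuation characters to strip, seeded with the ideographic full stop (U+3002)\r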
+ for (String aCleanUpPunctuation : cleanUpPunctuation) {\r
+ tempCharBuf.append("|").append(aCleanUpPunctuation);\r
+ }\r
}\r
\r
str = minimalClean(str, tempStrBuf, tempCharBuf);\r
return (source==null)? null : source.replaceAll("[\\s\u00A0]+$", "");\r
}\r
\r
- /* replace multiple whitespaces between words with single blank */\r
+    /* replace runs of whitespace between words with a single blank */\r
public static String itrim(String source) {\r
return (source==null)? null : source.replaceAll("\\b[\\s\u00A0]{2,}\\b", " ");\r
}\r
\r
- /* remove all superfluous whitespaces in source string */\r
+ /* remove all superfluous whitespace in source string */\r
public static String rmWhiteSpace(String source) {\r
//System.out.println("Trimmed: '" + itrim(ltrim(rtrim(source))) + "'");\r
return (source==null)? null : itrim(ltrim(rtrim(source)));\r
String [] topicFiles = null;
public static final int MAX_DEPTH = 10;
- public DirList(File inputdir, String regex, int depth) {
+ public DirList(File inputDir, String regexp, int depth) {
try {
listFiles = new ArrayList<File> ();
// not yet implemented
- if(regex == null) {
- for (File f: inputdir.listFiles()) {
+ if(regexp == null) {
+ for (File f: inputDir.listFiles()) {
if (!f.isDirectory()){
listFiles.add(f);
}else {
if (depth < MAX_DEPTH ) {
- DirList nsiDoc = new DirList(f,regex,depth+1);
+ DirList nsiDoc = new DirList(f,regexp,depth+1);
listFiles.addAll(new ArrayList<File>(nsiDoc.getListFiles()));
}
}
}
}
else {
- for (File f: inputdir.listFiles(new DirFilter(regex))) {
+ for (File f: inputDir.listFiles(new DirFilter(regexp))) {
listFiles.add(f);
}
// Patch from Oxygen to address problem where directories
// containing . were not traversed.
- for (File f: inputdir.listFiles(new DirFilter(".*"))) {
+ for (File f: inputDir.listFiles(new DirFilter(".*"))) {
if (f.isDirectory()){
if (depth < MAX_DEPTH ) {
- DirList nsiDoc = new DirList(f,regex, depth+1);
+ DirList nsiDoc = new DirList(f,regexp, depth+1);
listFiles.addAll(new ArrayList<File>(nsiDoc.getListFiles()));
}
}