import java.util.Map;\r
import java.util.Properties;\r
\r
-/*\r
-\r
import org.apache.tools.ant.BuildException;\r
import org.apache.tools.ant.Task;\r
\r
-*/\r
-\r
\r
import com.nexwave.nsidita.DirList;\r
import com.nexwave.nsidita.DocFileInfo;\r
* @author N. Quaine\r
* @author Kasun Gajasinghe <http://kasunbg.blogspot.com>\r
*/\r
-public class IndexerTask{\r
-//public class IndexerTask extends Task {\r
+public class IndexerTask extends Task {\r
\r
// messages\r
private String txt_no_inputdir = "Input directory not found:";\r
// Indexing features: words to remove\r
private ArrayList<String> cleanUpStrings = null; \r
private ArrayList<String> cleanUpChars = null;\r
+\r
+ //Html extension\r
+ private String htmlExtension = "html";\r
\r
// Constructor\r
public IndexerTask() {\r
super();\r
-\r
}\r
/** The setter for the "htmldir" attribute (parameter of the task)\r
* @param htmldir\r
this.htmldir = htmldir;\r
}\r
\r
+ /**\r
+ * Set the extension in which html files are generated\r
+ * @param htmlExtension The extension in which html files are generated, with or without a leading "." (must not be null)\r
+ */\r
+ public void setHtmlextension(String htmlExtension) {\r
+ this.htmlExtension = htmlExtension;\r
+ //Trim the starting "." so that only the bare extension is stored\r
+ if(this.htmlExtension.startsWith(".")) {\r
+ this.htmlExtension = this.htmlExtension.substring(1);\r
+ }\r
+ }\r
+\r
/**\r
* setter for "indexerLanguage" attribute from ANT\r
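- * @param indexerLanguage language for the search indexer. Used to differerentiate which stemmer to be used.\r
+ * @param indexerLanguage language for the search indexer. Used to differentiate which stemmer to be used.\r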
IndexerTask.indexerLanguage = "@@"; //fail-safe mechanism, This vm should not reach this point.\r
} \r
}\r
-\r
-\r
\r
/**\r
* Implementation of the execute function (Task interface)\r
*/\r
-// public void execute() throws BuildException {\r
- public void execute(){\r
+ public void execute() throws BuildException {\r
try{\r
//Use Xerces as the parser. Does not support Saxon6.5.5 parser \r
System.setProperty("org.xml.sax.driver", "org.apache.xerces.parsers.SAXParser");\r
\r
\r
// Get the list of all html files but the tocs, covers and indexes\r
- //DirList nsiDoc = new DirList(inputDir, "^(?!(toc|index|search|frameset|ix01)).*\\.html$", 1);\r
- DirList nsiDoc = new DirList(inputDir, "^.*\\.html?$", 1);\r
+ DirList nsiDoc = new DirList(inputDir, "^.*\\." + htmlExtension + "?$", 1);\r
htmlFiles = nsiDoc.getListFiles();\r
// Check if found html files\r
if (htmlFiles.isEmpty()) {\r
\r
// Retrieve the clean-up properties for indexing\r
RetrieveCleanUpProps();\r
- // System.out.print("clean"+" " +cleanUpStrings);\r
+ // System.out.print("clean"+" " +cleanUpStrings);\r
\r
//create a default handler\r
//SaxHTMLIndex spe = new SaxHTMLIndex (); // do not use clean-up props files\r
package com.nexwave.nquindexer;\r
\r
\r
-import java.io.BufferedReader;\r
-import java.io.File;\r
-import java.io.FileInputStream;\r
-import java.io.FileOutputStream;\r
-import java.io.IOException;\r
-import java.io.InputStreamReader;\r
-import java.io.OutputStreamWriter;\r
-import java.io.PrintWriter;\r
+import java.io.*;\r
\r
import com.nexwave.nsidita.BlankRemover;\r
import com.nexwave.nsidita.DocFileInfo;\r
+import org.xml.sax.InputSource;\r
import org.xml.sax.SAXParseException;\r
\r
/**\r
long start = System.currentTimeMillis();\r
//System.out.println("about to parse " + file.getName() + " >>> " + start);\r
\r
- if ( RemoveValidationPI (file) == 0){\r
- sp.parse("xx.html", this);\r
+ String content = RemoveValidationPI (file);\r
+ if (content != null){\r
+ InputSource is = new InputSource(new StringReader(content));\r
+ is.setSystemId(file.toURI().toURL().toString());\r
+ sp.parse(is, this);\r
}\r
\r
long finish = System.currentTimeMillis();\r
\r
//triggers when there's character data inside an element.\r
public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException {\r
- \r
- // dwc: Bug fix. Don't index contents of script tag.\r
- // dwc: TODO: Add code here to conditionally index or not\r
+\r
// index certain elements. E.g. Use this to implement a\r
- // "titles only" index, say if you wanted to use <span/>s to\r
- // create space breaks in ja_JP lines to indicate word breaks.\r
+ // "titles only" index,\r
\r
if((addContent || addHeaderInfo) && !doNotIndex && !currentElName.equalsIgnoreCase("script")){\r
String text = new String(ch,start,length);\r
* @param file\r
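- * @return int: returns 0 if no IOException occurs, else 1.\r
+ * @return the file content with the XML declaration and DOCTYPE stripped, or null if an IOException occurs.\r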
*/\r
- public int RemoveValidationPI (File file) {\r
- \r
+ public String RemoveValidationPI (File file) {\r
+ StringBuilder sb = new StringBuilder();\r
+ //The content that needs to be indexed after removing validation will be written to sb. This StringBuilder will\r
+ // be the source to index the content of the particular html page.\r
try {\r
BufferedReader br = new BufferedReader(\r
new InputStreamReader(\r
new FileInputStream(file),"UTF-8"));\r
- \r
- //PrintWriter pw = new PrintWriter(new FileOutputStream(new File("xx.html")));\r
- PrintWriter pw = new PrintWriter(new OutputStreamWriter (new FileOutputStream(new File("xx.html")),"UTF-8"));\r
- //writes the content to xx.html after removing validation. This temp file will be source to index the\r
- // content of the particular html page.\r
\r
while(true)\r
{\r
if (line.contains("<?xml version")) {\r
line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");\r
}\r
- pw.write(line + "\n");\r
+\r
+ sb.append(line + "\n");\r
} else \r
{\r
//dwc: What is this trying to do? Nuke the DOCTYPE? Why?\r
line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");\r
}\r
line = line.replaceAll("\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E","\n");\r
- pw.write(line);\r
+\r
+ sb.append(line);\r
}\r
}\r
catch (IOException e)\r
break;\r
}\r
}\r
- \r
- \r
- pw.flush();\r
- pw.close();\r
+\r
br.close();\r
}\r
catch (IOException e)\r
{\r
- return 1;\r
+ return null;\r
}\r
\r
- return 0; // return status\r
+ return sb.toString(); // the cleaned content, used as the parser input\r
\r
}\r
\r