From 370fab9f6dd24e3759c4be993e0d3a7565f1a3e6 Mon Sep 17 00:00:00 2001 From: David Cramer Date: Mon, 4 Oct 2010 15:14:20 +0000 Subject: [PATCH] Merged in changes from webhelp branch to address issue #3058244 regarding the xx.html temp file that was being created --- xsl/webhelp/build.xml | 2 - .../com/nexwave/nquindexer/IndexerTask.java | 33 ++++++------ .../nexwave/nquindexer/SaxDocFileParser.java | 50 ++++++++----------- 3 files changed, 39 insertions(+), 46 deletions(-) diff --git a/xsl/webhelp/build.xml b/xsl/webhelp/build.xml index 9148146ea..21a51e244 100755 --- a/xsl/webhelp/build.xml +++ b/xsl/webhelp/build.xml @@ -93,8 +93,6 @@ - - diff --git a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java index 23009b88d..d07eece72 100755 --- a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java +++ b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java @@ -11,13 +11,9 @@ import java.util.Iterator; import java.util.Map; import java.util.Properties; -/* - import org.apache.tools.ant.BuildException; import org.apache.tools.ant.Task; -*/ - import com.nexwave.nsidita.DirList; import com.nexwave.nsidita.DocFileInfo; @@ -30,8 +26,7 @@ import com.nexwave.nsidita.DocFileInfo; * @author N. 
Quaine * @author Kasun Gajasinghe */ -public class IndexerTask{ -//public class IndexerTask extends Task { +public class IndexerTask extends Task { // messages private String txt_no_inputdir = "Input directory not found:"; @@ -61,11 +56,13 @@ public class IndexerTask{ // Indexing features: words to remove private ArrayList cleanUpStrings = null; private ArrayList cleanUpChars = null; + + //Html extension + private String htmlExtension = "html"; // Constructor public IndexerTask() { super(); - } /** The setter for the "htmldir" attribute (parameter of the task) * @param htmldir @@ -75,6 +72,18 @@ public class IndexerTask{ this.htmldir = htmldir; } + /** + * Set the extension in which html files are generated + * @param htmlExtension The extension in which html files are generated + */ + public void setHtmlextension(String htmlExtension) { + this.htmlExtension = htmlExtension; + //Trim the starting "." + if(this.htmlExtension.startsWith(".")) { + this.htmlExtension = this.htmlExtension.substring(1); + } + } + /** * setter for "indexerLanguage" attribute from ANT * @param indexerLanguage language for the search indexer. Used to differerentiate which stemmer to be used. @@ -104,14 +113,11 @@ public class IndexerTask{ IndexerTask.indexerLanguage = "@@"; //fail-safe mechanism, This vm should not reach this point. } } - - /** * Implementation of the execute function (Task interface) */ -// public void execute() throws BuildException { - public void execute(){ + public void execute() throws BuildException { try{ //Use Xerces as the parser. Does not support Saxon6.5.5 parser System.setProperty("org.xml.sax.driver", "org.apache.xerces.parsers.SAXParser"); @@ -184,8 +190,7 @@ public class IndexerTask{ // Get the list of all html files but the tocs, covers and indexes - //DirList nsiDoc = new DirList(inputDir, "^(?!(toc|index|search|frameset|ix01)).*\\.html$", 1); - DirList nsiDoc = new DirList(inputDir, "^.*\\.html?$", 1); + DirList nsiDoc = new DirList(inputDir, "^.*\\." 
+ htmlExtension + "?$", 1); htmlFiles = nsiDoc.getListFiles(); // Check if found html files if (htmlFiles.isEmpty()) { @@ -211,7 +216,7 @@ public class IndexerTask{ // Retrieve the clean-up properties for indexing RetrieveCleanUpProps(); - // System.out.print("clean"+" " +cleanUpStrings); + // System.out.print("clean"+" " +cleanUpStrings); //create a default handler //SaxHTMLIndex spe = new SaxHTMLIndex (); // do not use clean-up props files diff --git a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java index b58053e81..30c3b63e8 100755 --- a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java +++ b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java @@ -1,17 +1,11 @@ package com.nexwave.nquindexer; -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.io.PrintWriter; +import java.io.*; import com.nexwave.nsidita.BlankRemover; import com.nexwave.nsidita.DocFileInfo; +import org.xml.sax.InputSource; import org.xml.sax.SAXParseException; /** @@ -88,8 +82,11 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { long start = System.currentTimeMillis(); //System.out.println("about to parse " + file.getName() + " >>> " + start); - if ( RemoveValidationPI (file) == 0){ - sp.parse("xx.html", this); + String content = RemoveValidationPI (file); + if (content != null){ + InputSource is = new InputSource(new StringReader(content)); + is.setSystemId(file.toURI().toURL().toString()); + sp.parse(is, this); } long finish = System.currentTimeMillis(); @@ -182,12 +179,9 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { //triggers when there's character data inside an element. 
public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException { - - // dwc: Bug fix. Don't index contents of script tag. - // dwc: TODO: Add code here to conditionally index or not + // index certain elements. E.g. Use this to implement a - // "titles only" index, say if you wanted to use s to - // create space breaks in ja_JP lines to indicate word breaks. + // "titles only" index, if((addContent || addHeaderInfo) && !doNotIndex && !currentElName.equalsIgnoreCase("script")){ String text = new String(ch,start,length); @@ -245,17 +239,14 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { * @param file * @return int: returns 0 if no IOException occurs, else 1. */ - public int RemoveValidationPI (File file) { - + public String RemoveValidationPI (File file) { + StringBuilder sb = new StringBuilder(); + //The content that needs to be indexed after removing validation will be written to sb. This StringBuilder will + // be the source to index the content of the particular html page. try { BufferedReader br = new BufferedReader( new InputStreamReader( new FileInputStream(file),"UTF-8")); - - //PrintWriter pw = new PrintWriter(new FileOutputStream(new File("xx.html"))); - PrintWriter pw = new PrintWriter(new OutputStreamWriter (new FileOutputStream(new File("xx.html")),"UTF-8")); - //writes the content to xx.html after removing validation. This temp file will be source to index the - // content of the particular html page. while(true) { @@ -278,7 +269,8 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { if (line.contains("