From 370fab9f6dd24e3759c4be993e0d3a7565f1a3e6 Mon Sep 17 00:00:00 2001
From: David Cramer <david@thingbag.net>
Date: Mon, 4 Oct 2010 15:14:20 +0000
Subject: [PATCH] Merged in changes from webhelp branch to address issue
 #3058244 regarding the xx.html temp file that was being created

---
 xsl/webhelp/build.xml                         |  2 -
 .../com/nexwave/nquindexer/IndexerTask.java   | 33 ++++++------
 .../nexwave/nquindexer/SaxDocFileParser.java  | 50 ++++++++-----------
 3 files changed, 39 insertions(+), 46 deletions(-)
diff --git a/xsl/webhelp/build.xml b/xsl/webhelp/build.xml
index 9148146ea..21a51e244 100755
--- a/xsl/webhelp/build.xml
+++ b/xsl/webhelp/build.xml
@@ -93,8 +93,6 @@
 	  <fileset dir="${output-dir}/content/search" includes="*.props"/>
 	</delete>
 
-	<delete file="xx.html"/>
-
   </target>
 
   <target name="webhelp" depends="validate,chunk,index"/>
diff --git a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java
index 23009b88d..d07eece72 100755
--- a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java
+++ b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java
@@ -11,13 +11,9 @@ import java.util.Iterator;
 import java.util.Map;
 import java.util.Properties;
 
-/*
-
 import org.apache.tools.ant.BuildException;
 import org.apache.tools.ant.Task;
 
-*/
-
 
 import com.nexwave.nsidita.DirList;
 import com.nexwave.nsidita.DocFileInfo;
@@ -30,8 +26,7 @@ import com.nexwave.nsidita.DocFileInfo;
  * @author N. Quaine
  * @author Kasun Gajasinghe <http://kasunbg.blogspot.com>
  */
-public class IndexerTask{
-//public class IndexerTask extends Task {
+public class IndexerTask extends Task {
 
 	// messages
 	private String txt_no_inputdir = "Input directory not found:";
@@ -61,11 +56,13 @@ public class IndexerTask{
 	// Indexing features: words to remove
 	private ArrayList<String> cleanUpStrings = null;	
 	private ArrayList<String> cleanUpChars = null;
+
+	//Html extension
+	private String htmlExtension = "html";
 	
 	// Constructor
 	public IndexerTask() {
 		super();
-
 	}
 	/** The setter for the "htmldir" attribute (parameter of the task)
 	 * @param htmldir
@@ -75,6 +72,18 @@ public class IndexerTask{
         this.htmldir = htmldir;
     }
 
+     /**
+     * Sets the file extension of the generated html files; one leading "." is stripped before storing.
+     * @param htmlExtension The extension in which html files are generated, e.g. "html" or ".html"
+     */
+    public void setHtmlextension(String htmlExtension) {
+		this.htmlExtension = htmlExtension;
+		// Drop a single leading "." so that ".html" and "html" are stored identically
+		if(this.htmlExtension.startsWith(".")) {
+			this.htmlExtension = this.htmlExtension.substring(1);
+		}
+	}
+
     /**
      * setter for "indexerLanguage" attribute from ANT
      * @param indexerLanguage language for the search indexer. Used to differerentiate which stemmer to be used.
@@ -104,14 +113,11 @@ public class IndexerTask{
             IndexerTask.indexerLanguage = "@@"; //fail-safe mechanism, This vm should not reach this point.
         } 
     }
-
-
 	
 	/**
 	 * Implementation of the execute function (Task interface)
 	 */
-//	public void execute() throws BuildException {
-	public void execute(){
+	public void execute() throws BuildException {
         try{
             //Use Xerces as the parser. Does not support Saxon6.5.5 parser 
            System.setProperty("org.xml.sax.driver", "org.apache.xerces.parsers.SAXParser");
@@ -184,8 +190,7 @@ public class IndexerTask{
 		
 
 		// Get the list of all html files but the tocs, covers and indexes
-		//DirList nsiDoc = new DirList(inputDir, "^(?!(toc|index|search|frameset|ix01)).*\\.html$", 1);
-		DirList nsiDoc = new DirList(inputDir, "^.*\\.html?$", 1);
+        DirList nsiDoc = new DirList(inputDir, "^.*\\." + htmlExtension + "?$", 1);
 		htmlFiles = nsiDoc.getListFiles();
 		// Check if found html files
 		if (htmlFiles.isEmpty()) {
@@ -211,7 +216,7 @@ public class IndexerTask{
 		
 		// Retrieve the clean-up properties for indexing
 		RetrieveCleanUpProps();
-	   // System.out.print("clean"+" " +cleanUpStrings);
+	   	// System.out.print("clean"+" " +cleanUpStrings);
 	    
 		//create a default handler
 		//SaxHTMLIndex spe = new SaxHTMLIndex (); // do not use clean-up props files
diff --git a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java
index b58053e81..30c3b63e8 100755
--- a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java
+++ b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java
@@ -1,17 +1,11 @@
 package com.nexwave.nquindexer;
 
 
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
+import java.io.*;
 
 import com.nexwave.nsidita.BlankRemover;
 import com.nexwave.nsidita.DocFileInfo;
+import org.xml.sax.InputSource;
 import org.xml.sax.SAXParseException;
 
 /**
@@ -88,8 +82,11 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 			long start = System.currentTimeMillis();
 			//System.out.println("about to parse " + file.getName() + " >>> " + start);
 
-			if ( RemoveValidationPI (file) == 0){
-			    sp.parse("xx.html", this);
+			String content = RemoveValidationPI (file);
+			if (content != null){
+				InputSource is = new InputSource(new StringReader(content));
+				is.setSystemId(file.toURI().toURL().toString());
+			    sp.parse(is, this);
 			}
 			
 			long finish = System.currentTimeMillis();
@@ -182,12 +179,9 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 
 	//triggers when there's character data inside an element.
 	public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException {
-		
-		// dwc: Bug fix. Don't index contents of script tag.
-		// dwc: TODO: Add code here to conditionally index or not
+
 		// index certain elements. E.g. Use this to implement a
-		// "titles only" index, say if you wanted to use <span/>s to
-		// create space breaks in ja_JP lines to indicate word breaks.
+		// "titles only" index.
         
 		if((addContent || addHeaderInfo) && !doNotIndex && !currentElName.equalsIgnoreCase("script")){
 			String text = new String(ch,start,length);
@@ -245,17 +239,14 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
      * @param file
      * @return int: returns 0 if no IOException occurs, else 1.
      */
-	public int RemoveValidationPI (File file) {
-		
+	public String RemoveValidationPI (File file) {
+        StringBuilder sb = new StringBuilder();
+         //The content that needs to be indexed after removing validation will be written to sb. This StringBuilder will
+         //  be the source to index the content of the particular html page.
 		try {
 			BufferedReader br = new BufferedReader(
 	                new InputStreamReader(
 	                 new FileInputStream(file),"UTF-8"));
-			
-			//PrintWriter pw = new PrintWriter(new FileOutputStream(new File("xx.html")));
-			PrintWriter pw = new PrintWriter(new  OutputStreamWriter (new FileOutputStream(new File("xx.html")),"UTF-8"));
-			 //writes the content to xx.html after removing validation. This temp file will be source to index the
-            // content of the particular html page.
 
 			while(true)
 			{
@@ -278,7 +269,8 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 						if (line.contains("<?xml version")) {
 							line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");
 						}
-						pw.write(line + "\n");
+
+                        sb.append(line + "\n");
 					} else  
 					{
 						//dwc: What is this trying to do? Nuke the DOCTYPE? Why?
@@ -296,7 +288,8 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 							line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");
 						}
 						line = line.replaceAll("\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E","\n");
-						pw.write(line);
+
+                        sb.append(line);
 					}
 				}
 				catch (IOException e)
@@ -304,18 +297,15 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 					break;
 				}
 			}
-	
-			
-			pw.flush();
-			pw.close();
+
 			br.close();
 		}
 		catch (IOException e)
 		{
-			return 1;
+			return null;
 		}
 		
-		return 0; // return status
+		return sb.toString(); // the file content collected in sb
 
 	}
 
-- 
2.40.0