Merged in changes from webhelp branch to address issue #3058244 regarding the xx.html temporary file

author David Cramer <david@thingbag.net>

Mon, 4 Oct 2010 15:14:20 +0000 (15:14 +0000)

committer David Cramer <david@thingbag.net>

Mon, 4 Oct 2010 15:14:20 +0000 (15:14 +0000)
author David Cramer <david@thingbag.net>
Mon, 4 Oct 2010 15:14:20 +0000 (15:14 +0000)
committer David Cramer <david@thingbag.net>
Mon, 4 Oct 2010 15:14:20 +0000 (15:14 +0000)
diff --git a/xsl/webhelp/build.xml b/xsl/webhelp/build.xml

index 9148146eaa990203cc2f72e10ec150559df2041e..21a51e2443ee5021ef3e31c7886101598f7ae00d 100755 (executable)
--- a/xsl/webhelp/build.xml
+++ b/xsl/webhelp/build.xml
@@ -93,8 +93,6 @@
           <fileset dir="${output-dir}/content/search" includes="*.props"/>
         </delete>
  
-       <delete file="xx.html"/>
-
    </target>
  
    <target name="webhelp" depends="validate,chunk,index"/>
diff --git a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java

index 23009b88d3e82d3298a1052bf03c961f9e5b5baf..d07eece72d5790e5af54de5c341b9df3bfd6bf44 100755 (executable)
--- a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java
+++ b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java
@@ -11,13 +11,9 @@ import java.util.Iterator;
  import java.util.Map;\r
  import java.util.Properties;\r
  \r
-/*\r
-\r
  import org.apache.tools.ant.BuildException;\r
  import org.apache.tools.ant.Task;\r
  \r
-*/\r
-\r
  \r
  import com.nexwave.nsidita.DirList;\r
  import com.nexwave.nsidita.DocFileInfo;\r
@@ -30,8 +26,7 @@ import com.nexwave.nsidita.DocFileInfo;
   * @author N. Quaine\r
   * @author Kasun Gajasinghe <http://kasunbg.blogspot.com>\r
   */\r
-public class IndexerTask{\r
-//public class IndexerTask extends Task {\r
+public class IndexerTask extends Task {\r
  \r
         // messages\r
         private String txt_no_inputdir = "Input directory not found:";\r
@@ -61,11 +56,13 @@ public class IndexerTask{
         // Indexing features: words to remove\r
         private ArrayList<String> cleanUpStrings = null;        \r
         private ArrayList<String> cleanUpChars = null;\r
+\r
+       //Html extension\r
+       private String htmlExtension = "html";\r
         \r
         // Constructor\r
         public IndexerTask() {\r
                 super();\r
-\r
         }\r
         /** The setter for the "htmldir" attribute (parameter of the task)\r
          * @param htmldir\r
@@ -75,6 +72,18 @@ public class IndexerTask{
          this.htmldir = htmldir;\r
      }\r
  \r
+     /**\r
+     * Set the extension in which html files are generated\r
+     * @param htmlExtension The extension in which html files are generated\r
+     */\r
+    public void setHtmlextension(String htmlExtension) {\r
+               this.htmlExtension = htmlExtension;\r
+               //Trim the starting "."\r
+               if(this.htmlExtension.startsWith(".")) {\r
+                       this.htmlExtension = this.htmlExtension.substring(1);\r
+               }\r
+       }\r
+\r
      /**\r
       * setter for "indexerLanguage" attribute from ANT\r
      * @param indexerLanguage language for the search indexer. Used to differentiate which stemmer to be used.\r
@@ -104,14 +113,11 @@ public class IndexerTask{
              IndexerTask.indexerLanguage = "@@"; //fail-safe mechanism, This vm should not reach this point.\r
          } \r
      }\r
-\r
-\r
         \r
         /**\r
          * Implementation of the execute function (Task interface)\r
          */\r
-//     public void execute() throws BuildException {\r
-       public void execute(){\r
+       public void execute() throws BuildException {\r
          try{\r
              //Use Xerces as the parser. Does not support Saxon6.5.5 parser \r
             System.setProperty("org.xml.sax.driver", "org.apache.xerces.parsers.SAXParser");\r
@@ -184,8 +190,7 @@ public class IndexerTask{
                 \r
  \r
                 // Get the list of all html files but the tocs, covers and indexes\r
-               //DirList nsiDoc = new DirList(inputDir, "^(?!(toc|index|search|frameset|ix01)).*\\.html$", 1);\r
-               DirList nsiDoc = new DirList(inputDir, "^.*\\.html?$", 1);\r
+        DirList nsiDoc = new DirList(inputDir, "^.*\\." + htmlExtension + "?$", 1);\r
                 htmlFiles = nsiDoc.getListFiles();\r
                 // Check if found html files\r
                 if (htmlFiles.isEmpty()) {\r
@@ -211,7 +216,7 @@ public class IndexerTask{
                 \r
                 // Retrieve the clean-up properties for indexing\r
                 RetrieveCleanUpProps();\r
-          // System.out.print("clean"+" " +cleanUpStrings);\r
+               // System.out.print("clean"+" " +cleanUpStrings);\r
             \r
                 //create a default handler\r
                 //SaxHTMLIndex spe = new SaxHTMLIndex (); // do not use clean-up props files\r
diff --git a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java

index b58053e814bf73fd968d6fe9d86fd2e8e7e4fcfb..30c3b63e88f09c77309bf9ebbf32004ff8d28c3e 100755 (executable)
--- a/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java
+++ b/xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java
@@ -1,17 +1,11 @@
  package com.nexwave.nquindexer;\r
  \r
  \r
-import java.io.BufferedReader;\r
-import java.io.File;\r
-import java.io.FileInputStream;\r
-import java.io.FileOutputStream;\r
-import java.io.IOException;\r
-import java.io.InputStreamReader;\r
-import java.io.OutputStreamWriter;\r
-import java.io.PrintWriter;\r
+import java.io.*;\r
  \r
  import com.nexwave.nsidita.BlankRemover;\r
  import com.nexwave.nsidita.DocFileInfo;\r
+import org.xml.sax.InputSource;\r
  import org.xml.sax.SAXParseException;\r
  \r
  /**\r
@@ -88,8 +82,11 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
                         long start = System.currentTimeMillis();\r
                         //System.out.println("about to parse " + file.getName() + " >>> " + start);\r
  \r
-                       if ( RemoveValidationPI (file) == 0){\r
-                           sp.parse("xx.html", this);\r
+                       String content = RemoveValidationPI (file);\r
+                       if (content != null){\r
+                               InputSource is = new InputSource(new StringReader(content));\r
+                               is.setSystemId(file.toURI().toURL().toString());\r
+                           sp.parse(is, this);\r
                         }\r
                         \r
                         long finish = System.currentTimeMillis();\r
@@ -182,12 +179,9 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
  \r
         //triggers when there's character data inside an element.\r
         public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException {\r
-               \r
-               // dwc: Bug fix. Don't index contents of script tag.\r
-               // dwc: TODO: Add code here to conditionally index or not\r
+\r
                 // index certain elements. E.g. Use this to implement a\r
-               // "titles only" index, say if you wanted to use <span/>s to\r
-               // create space breaks in ja_JP lines to indicate word breaks.\r
+               // "titles only" index,\r
          \r
                 if((addContent || addHeaderInfo) && !doNotIndex && !currentElName.equalsIgnoreCase("script")){\r
                         String text = new String(ch,start,length);\r
@@ -245,17 +239,14 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
       * @param file\r
       * @return int: returns 0 if no IOException occurs, else 1.\r
       */\r
-       public int RemoveValidationPI (File file) {\r
-               \r
+       public String RemoveValidationPI (File file) {\r
+        StringBuilder sb = new StringBuilder();\r
+         //The content that needs to be indexed after removing validation will be written to sb. This StringBuilder will\r
+         //  be the source to index the content of the particular html page.\r
                 try {\r
                         BufferedReader br = new BufferedReader(\r
                         new InputStreamReader(\r
                          new FileInputStream(file),"UTF-8"));\r
-                       \r
-                       //PrintWriter pw = new PrintWriter(new FileOutputStream(new File("xx.html")));\r
-                       PrintWriter pw = new PrintWriter(new  OutputStreamWriter (new FileOutputStream(new File("xx.html")),"UTF-8"));\r
-                        //writes the content to xx.html after removing validation. This temp file will be source to index the\r
-            // content of the particular html page.\r
  \r
                         while(true)\r
                         {\r
@@ -278,7 +269,8 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
                                                 if (line.contains("<?xml version")) {\r
                                                         line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");\r
                                                 }\r
-                                               pw.write(line + "\n");\r
+\r
+                        sb.append(line + "\n");\r
                                         } else  \r
                                         {\r
                                                 //dwc: What is this trying to do? Nuke the DOCTYPE? Why?\r
@@ -296,7 +288,8 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
                                                         line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");\r
                                                 }\r
                                                 line = line.replaceAll("\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E","\n");\r
-                                               pw.write(line);\r
+\r
+                        sb.append(line);\r
                                         }\r
                                 }\r
                                 catch (IOException e)\r
@@ -304,18 +297,15 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
                                         break;\r
                                 }\r
                         }\r
-       \r
-                       \r
-                       pw.flush();\r
-                       pw.close();\r
+\r
                         br.close();\r
                 }\r
                 catch (IOException e)\r
                 {\r
-                       return 1;\r
+                       return null;\r
                 }\r
                 \r
-               return 0; // return status\r
+               return sb.toString(); // return status\r
  \r
         }\r
  \r
author	David Cramer <david@thingbag.net>
	Mon, 4 Oct 2010 15:14:20 +0000 (15:14 +0000)
committer	David Cramer <david@thingbag.net>
	Mon, 4 Oct 2010 15:14:20 +0000 (15:14 +0000)
xsl/webhelp/build.xml		patch \| blob \| history
xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java		patch \| blob \| history
xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java		patch \| blob \| history