]> granicus.if.org Git - docbook-dsssl/commitdiff
Merged in changes from webhelp branch to address issue #3058244 regarding the xx...
author David Cramer <david@thingbag.net>
Mon, 4 Oct 2010 15:14:20 +0000 (15:14 +0000)
committer David Cramer <david@thingbag.net>
Mon, 4 Oct 2010 15:14:20 +0000 (15:14 +0000)
xsl/webhelp/build.xml
xsl/webhelp/indexer/src/com/nexwave/nquindexer/IndexerTask.java
xsl/webhelp/indexer/src/com/nexwave/nquindexer/SaxDocFileParser.java

index 9148146eaa990203cc2f72e10ec150559df2041e..21a51e2443ee5021ef3e31c7886101598f7ae00d 100755 (executable)
@@ -93,8 +93,6 @@
          <fileset dir="${output-dir}/content/search" includes="*.props"/>
        </delete>
 
-       <delete file="xx.html"/>
-
   </target>
 
   <target name="webhelp" depends="validate,chunk,index"/>
index 23009b88d3e82d3298a1052bf03c961f9e5b5baf..d07eece72d5790e5af54de5c341b9df3bfd6bf44 100755 (executable)
@@ -11,13 +11,9 @@ import java.util.Iterator;
 import java.util.Map;\r
 import java.util.Properties;\r
 \r
-/*\r
-\r
 import org.apache.tools.ant.BuildException;\r
 import org.apache.tools.ant.Task;\r
 \r
-*/\r
-\r
 \r
 import com.nexwave.nsidita.DirList;\r
 import com.nexwave.nsidita.DocFileInfo;\r
@@ -30,8 +26,7 @@ import com.nexwave.nsidita.DocFileInfo;
  * @author N. Quaine\r
  * @author Kasun Gajasinghe <http://kasunbg.blogspot.com>\r
  */\r
-public class IndexerTask{\r
-//public class IndexerTask extends Task {\r
+public class IndexerTask extends Task {\r
 \r
        // messages\r
        private String txt_no_inputdir = "Input directory not found:";\r
@@ -61,11 +56,13 @@ public class IndexerTask{
        // Indexing features: words to remove\r
        private ArrayList<String> cleanUpStrings = null;        \r
        private ArrayList<String> cleanUpChars = null;\r
+\r
+       //Html extension\r
+       private String htmlExtension = "html";\r
        \r
        // Constructor\r
        public IndexerTask() {\r
                super();\r
-\r
        }\r
        /** The setter for the "htmldir" attribute (parameter of the task)\r
         * @param htmldir\r
@@ -75,6 +72,18 @@ public class IndexerTask{
         this.htmldir = htmldir;\r
     }\r
 \r
+     /**\r
+     * Set the extension in which html files are generated\r
+     * @param htmlExtension The extension in which html files are generated\r
+     */\r
+    public void setHtmlextension(String htmlExtension) {\r
+               this.htmlExtension = htmlExtension;\r
+               //Trim the starting "."\r
+               if(this.htmlExtension.startsWith(".")) {\r
+                       this.htmlExtension = this.htmlExtension.substring(1);\r
+               }\r
+       }\r
+\r
     /**\r
      * setter for "indexerLanguage" attribute from ANT\r
      * @param indexerLanguage language for the search indexer. Used to differentiate which stemmer is used.\r
@@ -104,14 +113,11 @@ public class IndexerTask{
             IndexerTask.indexerLanguage = "@@"; //fail-safe mechanism, This vm should not reach this point.\r
         } \r
     }\r
-\r
-\r
        \r
        /**\r
         * Implementation of the execute function (Task interface)\r
         */\r
-//     public void execute() throws BuildException {\r
-       public void execute(){\r
+       public void execute() throws BuildException {\r
         try{\r
             //Use Xerces as the parser. Does not support Saxon6.5.5 parser \r
            System.setProperty("org.xml.sax.driver", "org.apache.xerces.parsers.SAXParser");\r
@@ -184,8 +190,7 @@ public class IndexerTask{
                \r
 \r
                // Get the list of all html files but the tocs, covers and indexes\r
-               //DirList nsiDoc = new DirList(inputDir, "^(?!(toc|index|search|frameset|ix01)).*\\.html$", 1);\r
-               DirList nsiDoc = new DirList(inputDir, "^.*\\.html?$", 1);\r
+        DirList nsiDoc = new DirList(inputDir, "^.*\\." + htmlExtension + "?$", 1);\r
                htmlFiles = nsiDoc.getListFiles();\r
                // Check if found html files\r
                if (htmlFiles.isEmpty()) {\r
@@ -211,7 +216,7 @@ public class IndexerTask{
                \r
                // Retrieve the clean-up properties for indexing\r
                RetrieveCleanUpProps();\r
-          // System.out.print("clean"+" " +cleanUpStrings);\r
+               // System.out.print("clean"+" " +cleanUpStrings);\r
            \r
                //create a default handler\r
                //SaxHTMLIndex spe = new SaxHTMLIndex (); // do not use clean-up props files\r
index b58053e814bf73fd968d6fe9d86fd2e8e7e4fcfb..30c3b63e88f09c77309bf9ebbf32004ff8d28c3e 100755 (executable)
@@ -1,17 +1,11 @@
 package com.nexwave.nquindexer;\r
 \r
 \r
-import java.io.BufferedReader;\r
-import java.io.File;\r
-import java.io.FileInputStream;\r
-import java.io.FileOutputStream;\r
-import java.io.IOException;\r
-import java.io.InputStreamReader;\r
-import java.io.OutputStreamWriter;\r
-import java.io.PrintWriter;\r
+import java.io.*;\r
 \r
 import com.nexwave.nsidita.BlankRemover;\r
 import com.nexwave.nsidita.DocFileInfo;\r
+import org.xml.sax.InputSource;\r
 import org.xml.sax.SAXParseException;\r
 \r
 /**\r
@@ -88,8 +82,11 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
                        long start = System.currentTimeMillis();\r
                        //System.out.println("about to parse " + file.getName() + " >>> " + start);\r
 \r
-                       if ( RemoveValidationPI (file) == 0){\r
-                           sp.parse("xx.html", this);\r
+                       String content = RemoveValidationPI (file);\r
+                       if (content != null){\r
+                               InputSource is = new InputSource(new StringReader(content));\r
+                               is.setSystemId(file.toURI().toURL().toString());\r
+                           sp.parse(is, this);\r
                        }\r
                        \r
                        long finish = System.currentTimeMillis();\r
@@ -182,12 +179,9 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 \r
        //triggers when there's character data inside an element.\r
        public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException {\r
-               \r
-               // dwc: Bug fix. Don't index contents of script tag.\r
-               // dwc: TODO: Add code here to conditionally index or not\r
+\r
                // index certain elements. E.g. Use this to implement a\r
-               // "titles only" index, say if you wanted to use <span/>s to\r
-               // create space breaks in ja_JP lines to indicate word breaks.\r
+               // "titles only" index,\r
         \r
                if((addContent || addHeaderInfo) && !doNotIndex && !currentElName.equalsIgnoreCase("script")){\r
                        String text = new String(ch,start,length);\r
@@ -245,17 +239,14 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
      * @param file\r
      * @return String: the file content with the XML declaration and DOCTYPE stripped, or null if an IOException is caught by the outer handler.\r
      */\r
-       public int RemoveValidationPI (File file) {\r
-               \r
+       public String RemoveValidationPI (File file) {\r
+        StringBuilder sb = new StringBuilder();\r
+         //The content that needs to be indexed after removing validation will be written to sb. This StringBuilder will\r
+         //  be the source to index the content of the particular html page.\r
                try {\r
                        BufferedReader br = new BufferedReader(\r
                        new InputStreamReader(\r
                         new FileInputStream(file),"UTF-8"));\r
-                       \r
-                       //PrintWriter pw = new PrintWriter(new FileOutputStream(new File("xx.html")));\r
-                       PrintWriter pw = new PrintWriter(new  OutputStreamWriter (new FileOutputStream(new File("xx.html")),"UTF-8"));\r
-                        //writes the content to xx.html after removing validation. This temp file will be source to index the\r
-            // content of the particular html page.\r
 \r
                        while(true)\r
                        {\r
@@ -278,7 +269,8 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
                                                if (line.contains("<?xml version")) {\r
                                                        line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");\r
                                                }\r
-                                               pw.write(line + "\n");\r
+\r
+                        sb.append(line + "\n");\r
                                        } else  \r
                                        {\r
                                                //dwc: What is this trying to do? Nuke the DOCTYPE? Why?\r
@@ -296,7 +288,8 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
                                                        line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");\r
                                                }\r
                                                line = line.replaceAll("\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E","\n");\r
-                                               pw.write(line);\r
+\r
+                        sb.append(line);\r
                                        }\r
                                }\r
                                catch (IOException e)\r
@@ -304,18 +297,15 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
                                        break;\r
                                }\r
                        }\r
-       \r
-                       \r
-                       pw.flush();\r
-                       pw.close();\r
+\r
                        br.close();\r
                }\r
                catch (IOException e)\r
                {\r
-                       return 1;\r
+                       return null;\r
                }\r
                \r
-               return 0; // return status\r
+               return sb.toString(); // return status\r
 \r
        }\r
 \r