From: Kasun Gajasinghe Date: Mon, 3 Oct 2011 19:07:29 +0000 (+0000) Subject: Webhelpindexer changes - HTML transformation support for WebHelp - Uses Tagsoup for... X-Git-Tag: release/1.79.1~6^2~659 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=615c1f899f9f5e083d55d9da2a385e39e7b19980;p=docbook-dsssl Webhelpindexer changes - HTML transformation support for WebHelp - Uses Tagsoup for parsing the bad html. Tracker - http://sourceforge.net/tracker/?func=detail&aid=3401185&group_id=21935&atid=373750 --- diff --git a/xsl-webhelpindexer/lib/tagsoup-1.2.1.jar b/xsl-webhelpindexer/lib/tagsoup-1.2.1.jar new file mode 100755 index 000000000..275160191 Binary files /dev/null and b/xsl-webhelpindexer/lib/tagsoup-1.2.1.jar differ diff --git a/xsl-webhelpindexer/nbproject/project.properties b/xsl-webhelpindexer/nbproject/project.properties index be92a30d9..3b8886e75 100755 --- a/xsl-webhelpindexer/nbproject/project.properties +++ b/xsl-webhelpindexer/nbproject/project.properties @@ -25,9 +25,11 @@ endorsed.classpath= excludes= file.reference.lucene-analyzers-3.0.0.jar=lib/lucene-analyzers-3.0.0.jar file.reference.lucene-core-3.0.0.jar=lib/lucene-core-3.0.0.jar +file.reference.tagsoup-1.2.1.jar=lib/tagsoup-1.2.1.jar includes=** jar.compress=false javac.classpath=\ + ${file.reference.tagsoup-1.2.1.jar}:\ ${file.reference.lucene-analyzers-3.0.0.jar}:\ ${file.reference.lucene-core-3.0.0.jar}:\ ${ant.home}/lib/ant.jar diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java index 10acbbd04..d22e72138 100644 --- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java +++ b/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java @@ -11,10 +11,10 @@ import java.util.*; /** * Main class of Stand-alone version of WebHelpIndexer * - * User: Kasun Gajasinghe, University of Moratuwa, http://kasunbg.blogspot.com + * User: Kasun Gajasinghe, University of Moratuwa, http://kasunbg.org * Date: Feb 10, 2011 * - * @author Kasun Gajasinghe + * @author Kasun Gajasinghe, University of Moratuwa, http://kasunbg.org */ public class IndexerMain { @@ -37,7 +37,7 @@ public class IndexerMain { private String outputDir = null; private String projectDir = null; - // ANT parameters + // two of the input parameters public String htmlDir = null; public String indexerLanguage = "en"; @@ -93,7 +93,7 @@ public class IndexerMain { setHtmlDir(htmlDir); setIndexerLanguage(indexerLanguage); } - + /** * The content language defaults to English "en" * @@ -175,7 +175,7 @@ public class IndexerMain { System.getProperty("tocFile") ); } else { - throw new RuntimeException("Specify at least the the directory containing html files (htmlDir)\n " + + throw new RuntimeException("Specify at least the directory containing html files (htmlDir)\n " + "ex: java -jar webhelpindexer.jar -DhtmlDir=docs/content -DindexerLanguage=en \n" + "The program will exit now." ); @@ -186,13 +186,24 @@ public class IndexerMain { } /** - * Implementation of the execute function (Task interface) + * The main execution happens here. */ public void execute() { + +/* + //These system properties are set via command-line/ant-script now. See xsl/webhelp/build.xml#index target for + details. try { + //TagSoup SAX HTML Parser which supports parsing even the bad non-xml-conformed HTML + System.setProperty("org.xml.sax.driver", "org.ccil.cowan.tagsoup.Parser"); + //org.ccil.cowan.tagsoup.jaxp.SAXParserImpl + System.setProperty("javax.xml.parsers.SAXParserFactory", "org.ccil.cowan.tagsoup.jaxp.SAXFactoryImpl"); + //Use Xerces as the parser. Does not support Saxon6.5.5 parser - System.setProperty("org.xml.sax.driver", "org.apache.xerces.parsers.SAXParser"); - System.setProperty("javax.xml.parsers.SAXParserFactory", "org.apache.xerces.jaxp.SAXParserFactoryImpl"); +// System.setProperty("org.xml.sax.driver", "org.apache.xerces.parsers.SAXParser"); +// System.setProperty("javax.xml.parsers.SAXParserFactory", "org.apache.xerces.jaxp.SAXParserFactoryImpl"); + + //saxon // System.setProperty("org.xml.sax.driver", "com.icl.saxon.aelfred.SAXDriver"); // System.setProperty("javax.xml.parsers.SAXParserFactory", "com.icl.saxon.aelfred.SAXParserFactoryImpl"); } catch (SecurityException se) { @@ -202,6 +213,7 @@ public class IndexerMain { System.out.println("[WARNING] Default parser is not set to Xerces. Make sure Saxon6.5.5 " + "is not in your CLASSPATH"); } + */ ArrayList filesDescription = null; // list of information about the topic files ArrayList htmlFiles = null; // topic files listed in the given directory diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java index a415d268e..9ebec1016 100755 --- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java +++ b/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java @@ -80,7 +80,8 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { javax.xml.parsers.SAXParser sp = spf.newSAXParser(); // deactivate the validation sp.getXMLReader().setFeature("http://xml.org/sax/features/external-general-entities", false); - sp.getXMLReader().setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); +// sp.getXMLReader().setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); +// this feature isn't supported in TagSoup //parse the file and also register this class for call backs //System.out.println("Parsing: " + file); diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/WriteJSFiles.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/WriteJSFiles.java index c34d4b819..859fe6b76 100755 --- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/WriteJSFiles.java +++ b/xsl-webhelpindexer/src/com/nexwave/nquindexer/WriteJSFiles.java @@ -1,12 +1,6 @@ package com.nexwave.nquindexer; -import java.io.BufferedOutputStream; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.OutputStream; -import java.io.OutputStreamWriter; -import java.io.UnsupportedEncodingException; +import java.io.*; import java.util.ArrayList; import java.util.Iterator; import java.util.Map; @@ -31,7 +25,7 @@ public class WriteJSFiles { private static String txt_indices_location = "The created index files are located in "; /** - * Create a javascript array listing the html files with their paths relative to the project root + * Create a JavaScript array listing the html files with their paths relative to the project root * * @param fileO path and name of the file in which to output the list of html files * @param list of the html files, relative to the doc root directory @@ -161,7 +155,7 @@ public class WriteJSFiles { } /** - * Create javascript index files alphabetically. + * Create JavaScript index files alphabetically. * * @param fileOutStr contains the path and the suffix of the index files to create. * The first letter of the key is added to the given suffix. For example: e.g. a.js, b.js etc... @@ -207,9 +201,11 @@ public class WriteJSFiles { The value is the numbers of the files in which the word exists. Example: w["key"]="file1,file2,file3";*/ int count = 0; - if (i == 1) + if (i == 1) { out.write("var indexerLanguage=\"" + indexerLanguage + "\";\n"); - out.write("//Auto generated index for searching.\n"); + } + out.write("//Auto generated index for searching by xsl-webhelpindexer for DocBook Webhelp." + + "# Kasun Gajasinghe, University of Moratuwa\n"); while (keyIt.hasNext()) { //&& (tempLetter == tstr.charAt(0)) out.write("w[\"" + tstr + "\"]" + "=\"" + indexMap.get(tstr) + "\";\n"); tstr = (String) keyIt.next();