]> granicus.if.org Git - docbook-dsssl/commitdiff
optimizing the code
authorKasun Gajasinghe <kasunbg@gmail.com>
Fri, 9 Sep 2011 17:52:26 +0000 (17:52 +0000)
committerKasun Gajasinghe <kasunbg@gmail.com>
Fri, 9 Sep 2011 17:52:26 +0000 (17:52 +0000)
xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java
xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerTask.java [deleted file]
xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java
xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java
xsl-webhelpindexer/src/com/nexwave/nsidita/BlankRemover.java
xsl-webhelpindexer/src/com/nexwave/nsidita/DirList.java

index 8b94207bae7088f44b5c61006443613b6ea3955e..63cbe9c935d4b85f78ec1bcabfd445b92353456d 100644 (file)
@@ -66,7 +66,7 @@ public class IndexerMain {
 
     /**
      * The content language defaults to English "en" 
-     * @param htmlDir The directory where html files resides.
+     * @param htmlDir The directory where html files reside.
      */
     public IndexerMain(String htmlDir) {
         super();
@@ -338,7 +338,6 @@ public class IndexerMain {
                 System.out.println("Delay = " + diff / 1000 + " seconds");
         } else {
             System.out.println(txt_wrong_dita_basedir);
-            return;
         }
     }
 
diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerTask.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerTask.java
deleted file mode 100755 (executable)
index 373e89d..0000000
+++ /dev/null
@@ -1,372 +0,0 @@
-/*\r
-package com.nexwave.nquindexer;\r
-\r
-import java.io.File;\r
-import java.io.FileInputStream;\r
-import java.io.IOException;\r
-import java.util.ArrayList;\r
-import java.util.Collection;\r
-import java.util.Date;\r
-import java.util.HashMap;\r
-import java.util.Iterator;\r
-import java.util.Map;\r
-import java.util.Properties;\r
-\r
-import org.apache.tools.ant.BuildException;\r
-import org.apache.tools.ant.Task;\r
-\r
-\r
-import com.nexwave.nsidita.DirList;\r
-import com.nexwave.nsidita.DocFileInfo;\r
-\r
-*/\r
-/**\r
- * Indexer ant task.\r
- * \r
- * @version 1.0 2008-02-26\r
- * \r
- * @author N. Quaine\r
- * @author Kasun Gajasinghe <http://kasunbg.blogspot.com>\r
- *//*\r
-\r
-public class IndexerTask extends Task {\r
-\r
-       // messages\r
-       private String txt_no_inputdir = "Input directory not found:";\r
-       private String txt_cannot_create_outputdir = "Cannot create output search directory.";\r
-       private String txt_no_files_found = "No html files found.";\r
-       private String txt_wrong_dita_basedir = "ERROR: Parser initialization failed. Wrong dita base dir";\r
-       private String txt_no_relative_files_found= "No relative html files calculated.";\r
-       private String txt_no_words_gathered= "No words have been indexed in";\r
-       private String txt_no_html_files="No HTML Files found in";\r
-       private String txt_no_args="No argument given: you must provide an htmlDir to the IndexerTask";\r
-       \r
-       //working directories\r
-       private String searchdir = "search";\r
-       private File inputDir = null;\r
-       private String outputDir = null;\r
-       private String projectDir = null;\r
-\r
-       // ANT parameters\r
-       private String htmlDir=null;\r
-    public static String indexerLanguage="en";\r
-\r
-    //supported languages: add new additions to this. don't include country codes to the end such as en_US or en_UK,\r
-    // as stemmers don't find a difference between them.\r
-    private String[] supportedLanguages= {"en", "de", "fr", "zh", "ja", "ko"}; //currently extended support available for\r
-                // English, German, French and CJK (Chinese [zh], Japanese [ja], Korean [ko]) languages only.\r
-\r
-       // Indexing features: words to remove\r
-       private ArrayList<String> cleanUpStrings = null;        \r
-       private ArrayList<String> cleanUpChars = null;\r
-\r
-       //Html extension\r
-       private String htmlExtension = "html";\r
-       \r
-       // Constructor\r
-       public IndexerTask() {\r
-               super();\r
-       }\r
-       */\r
-/** The setter for the "htmlDir" attribute (parameter of the task)\r
-        * @param htmldir\r
-        * @throws InterruptedException \r
-        *//*\r
-\r
-    public void setHtmlDir(String htmlDir) {\r
-        this.htmlDir = htmlDir;\r
-    }\r
-\r
-     */\r
-/**\r
-     * Set the extension in which html files are generated\r
-     * @param htmlExtension The extension in which html files are generated\r
-     *//*\r
-\r
-    public void setHtmlextension(String htmlExtension) {\r
-               this.htmlExtension = htmlExtension;\r
-               //Trim the starting "."\r
-               if(this.htmlExtension.startsWith(".")) {\r
-                       this.htmlExtension = this.htmlExtension.substring(1);\r
-               }\r
-       }\r
-\r
-    */\r
-/**\r
-     * setter for "indexerLanguage" attribute from ANT\r
-     * @param indexerLanguage language for the search indexer. Used to differentiate which stemmer is to be used.\r
-     * @throws InterruptedException for ant\r
-     *//*\r
-\r
-    public void setIndexerLanguage(String indexerLanguage){\r
-        if(indexerLanguage !=null && !"".equals(indexerLanguage)) {\r
-            int temp = indexerLanguage.indexOf('_');\r
-            if( temp != -1){\r
-                indexerLanguage = indexerLanguage.substring(0,temp);\r
-            }\r
-            int i=0;\r
-            for (;i<supportedLanguages.length;i++) {\r
-                if(indexerLanguage.equals(supportedLanguages[i])){\r
-                    IndexerTask.indexerLanguage = supportedLanguages[i];\r
-                    break;\r
-                }\r
-            }\r
-            \r
-            //if not in supported language list,\r
-            if(i>=supportedLanguages.length){\r
-//                System.out.println("The given language, \""+indexerLanguage+"\", does not have extensive support for " +\r
-//                        "searching. Check documentation for details. ");\r
-                IndexerTask.indexerLanguage = indexerLanguage;\r
-            } \r
-        } else {\r
-            IndexerTask.indexerLanguage = "@@"; //fail-safe mechanism, This vm should not reach this point.\r
-        } \r
-    }\r
-       \r
-       */\r
-/**\r
-        * Implementation of the execute function (Task interface)\r
-        *//*\r
-\r
-       public void execute() throws BuildException {\r
-        try{\r
-            //Use Xerces as the parser. Does not support Saxon6.5.5 parser \r
-           System.setProperty("org.xml.sax.driver", "org.apache.xerces.parsers.SAXParser");\r
-           System.setProperty("javax.xml.parsers.SAXParserFactory", "org.apache.xerces.jaxp.SAXParserFactoryImpl");\r
-//           System.setProperty("org.xml.sax.driver", "com.icl.saxon.aelfred.SAXDriver");\r
-//           System.setProperty("javax.xml.parsers.SAXParserFactory", "com.icl.saxon.aelfred.SAXParserFactoryImpl");\r
-        } catch (SecurityException se){\r
-            System.out.println("[WARNING] Default parser is not set to Xerces. Make sure Saxon6.5.5 " +\r
-                    "is not in your CLASSPATH.");\r
-        } catch (Exception e){\r
-            System.out.println("[WARNING] Default parser is not set to Xerces. Make sure Saxon6.5.5 " +\r
-                    "is not in your CLASSPATH");\r
-        }\r
-\r
-               ArrayList<DocFileInfo> filesDescription = null; // list of information about the topic files\r
-               ArrayList<File> htmlFiles = null; // topic files listed in the given directory\r
-               ArrayList<String> htmlFilesPathRel = null;\r
-               Map<String, String> tempDico = new HashMap<String, String>(); \r
-               Iterator it;\r
-               \r
-               //File name initialization\r
-               String htmlList = "htmlFileList.js";\r
-               String htmlInfoList = "htmlFileInfoList.js";\r
-               String indexName = ".js";\r
-               \r
-               //timing\r
-               Date dateStart = new Date();\r
-               \r
-               if (htmlDir == null) {\r
-                       System.out.println(txt_no_args + ".");\r
-                       return;\r
-               }\r
-               // Init input directory\r
-               inputDir = new File(htmlDir);\r
-\r
-               // Begin of init\r
-               // check if inputdir initialized\r
-               if (inputDir == null) {\r
-                       DisplayHelp();\r
-                       return;\r
-               }\r
-               \r
-               // check if inputdir exists             \r
-               if (!inputDir.exists()) {\r
-                       System.out.println(txt_no_inputdir + " "+ inputDir + ".");\r
-                       return;\r
-               }\r
-               \r
-               // check if outputdir defined\r
-               if (outputDir == null) {\r
-            //set the output directory: path= {inputDir}/search \r
-                       outputDir = inputDir.getPath().concat(File.separator).concat(searchdir);\r
-               }\r
-\r
-               // check if outputdir exists\r
-               File tempfile = new File(outputDir); \r
-               if (!tempfile.exists()) {\r
-                       boolean b = (new File(outputDir)).mkdir();\r
-                       if (!b) {\r
-                               System.out.println(txt_cannot_create_outputdir + " "+ outputDir + ".");\r
-                               return;\r
-                       }\r
-               }\r
-               \r
-               // check if projdir is defined\r
-               if (projectDir == null) {\r
-                       projectDir = inputDir.getPath();\r
-               }\r
-               //end of init\r
-               \r
-\r
-               // Get the list of all html files but the tocs, covers and indexes\r
-        DirList nsiDoc = new DirList(inputDir, "^.*\\." + htmlExtension + "?$", 1);\r
-               htmlFiles = nsiDoc.getListFiles();\r
-               // Check if found html files\r
-               if (htmlFiles.isEmpty()) {\r
-                       System.out.println(txt_no_html_files + " "+ inputDir + ".");\r
-                       return;\r
-               }\r
-               // Get the list of all html files with relative paths \r
-               htmlFilesPathRel = nsiDoc.getListFilesRelTo(projectDir);\r
-               \r
-               if (htmlFiles == null) {\r
-                       System.out.println(txt_no_files_found);\r
-                       return;\r
-               } else if (htmlFilesPathRel == null) {\r
-                       System.out.println(txt_no_relative_files_found);\r
-                       return;                 \r
-               }\r
-               \r
-               // Create the list of the existing html files (index starts at 0)\r
-               WriteJSFiles.WriteHTMLList(outputDir.concat(File.separator).concat(htmlList), htmlFilesPathRel);\r
-               \r
-               // Parse each html file to retrieve the words:\r
-               // ------------------------------------------\r
-               \r
-               // Retrieve the clean-up properties for indexing\r
-               RetrieveCleanUpProps();\r
-               // System.out.print("clean"+" " +cleanUpStrings);\r
-           \r
-               //create a default handler\r
-               //SaxHTMLIndex spe = new SaxHTMLIndex (); // do not use clean-up props files\r
-               //SaxHTMLIndex spe = new SaxHTMLIndex (cleanUpStrings); // use clean-up props files\r
-               SaxHTMLIndex spe = new SaxHTMLIndex (cleanUpStrings, cleanUpChars); // use clean-up props files\r
-\r
-               if ( spe.init(tempDico) == 0 ) {\r
-\r
-                       //create a html file description list\r
-                       filesDescription = new ArrayList <DocFileInfo> ();\r
-                       \r
-                       it = htmlFiles.iterator ( ) ;\r
-                       \r
-                       // parse each html files\r
-                       while ( it.hasNext ( ) ) {\r
-                               File ftemp = (File) it.next();\r
-                               //tempMap.put(key, value);\r
-                               //The HTML file information are added in the list of FileInfoObject\r
-                               DocFileInfo docFileInfoTemp = new DocFileInfo(spe.runExtractData(ftemp,indexerLanguage));\r
-                               \r
-                               ftemp = docFileInfoTemp.getFullpath();\r
-                               String stemp = ftemp.toString();              \r
-                               int i = stemp.indexOf(projectDir);\r
-                               if ( i != 0 ) {\r
-                                       System.out.println("the documentation root does not match with the documentation input!");\r
-                                       return;\r
-                               }\r
-                               int ad = 1;\r
-                               if (stemp.equals(projectDir)) ad = 0; \r
-                               stemp = stemp.substring(i+projectDir.length()+ad);  //i is redundant (i==0 always)\r
-                               ftemp = new File (stemp);\r
-                               docFileInfoTemp.setFullpath(ftemp);\r
-                               \r
-                               filesDescription.add(docFileInfoTemp);\r
-                       }\r
-                       */\r
-/*remove empty strings from the map*//*\r
-\r
-                       if (tempDico.containsKey("")) {\r
-                               tempDico.remove("");\r
-                       }\r
-                       // write the index files\r
-                       if (tempDico.isEmpty()) {\r
-                               System.out.println(txt_no_words_gathered + " "+ inputDir + ".");\r
-                               return;\r
-                       }\r
-                       \r
-                       WriteJSFiles.WriteIndex(outputDir.concat(File.separator).concat(indexName), tempDico);\r
-                       \r
-                       // write the html list file with title and shortdesc\r
-                       //create the list of the existing html files (index starts at 0)\r
-                       WriteJSFiles.WriteHTMLInfoList(outputDir.concat(File.separator).concat(htmlInfoList), filesDescription);\r
-                       \r
-                       //perf measurement\r
-                       Date dateEnd = new Date();\r
-                       long diff = dateEnd.getTime() - dateStart.getTime();\r
-            if(diff<1000)\r
-                           System.out.println("Delay = " + diff + " milliseconds");\r
-            else\r
-                System.out.println("Delay = " + diff/1000 + " seconds");\r
-               }else {\r
-                       System.out.println(txt_wrong_dita_basedir);\r
-                       return;\r
-               }\r
-       }\r
-       \r
-       */\r
-/**\r
-     * Prints the usage information for this class to <code>System.out</code>.\r
-     *//*\r
-\r
-    private static void DisplayHelp() {\r
-       String lSep = System.getProperty("line.separator");\r
-        StringBuffer msg = new StringBuffer();\r
-        msg.append("USAGE:" + lSep);        \r
-        msg.append("   java -classpath TesterIndexer inputDir outputDir projectDir" + lSep);\r
-        msg.append("with:" + lSep);\r
-        msg.append("   inputDir (mandatory) :  specify the html files ' directory to index" + lSep);\r
-        msg.append("   outputDir (optional) : specify where to output the index files" + lSep);\r
-        msg.append("   projectDir (optional) : specify the root of the documentation directory" + lSep);\r
-        msg.append("Example:" + lSep);\r
-        msg.append("   java -classpath TesterIndexer /home/$USER/DITA/doc" + lSep);\r
-        msg.append("Example 2:" + lSep);\r
-        msg.append("   java -classpath TesterIndexer /home/$USER/DITA/doc/customer/concepts /home/$USER/temp/search /home/$USER/DITA/doc/" + lSep);\r
-        System.out.println(msg.toString());\r
-    }\r
-    private int RetrieveCleanUpProps (){\r
-\r
-       // Files for punctuation (only one for now)\r
-        String[] punctuationFiles = new String[] {"punctuation.props"};\r
-        FileInputStream input;\r
-        String tempStr;\r
-        File ftemp;\r
-        Collection c = new ArrayList<String>();\r
-\r
-        // Get the list of the props file containing the words to remove (not the punctuation)\r
-        DirList props = new DirList(inputDir, "^(?!(punctuation)).*\\.props$", 1);\r
-               ArrayList<File> wordsList = props.getListFiles();\r
-//             System.out.println("props files:"+wordsList);\r
-        //TODO all properties are taken to a single arraylist. Is that OK?\r
-               Properties enProps =new Properties ();\r
-               String propsDir = inputDir.getPath().concat(File.separator).concat(searchdir);\r
-               \r
-               // Init the lists which will contain the words and chars to remove \r
-               cleanUpStrings = new ArrayList<String>();\r
-               cleanUpChars = new ArrayList<String>();\r
-               \r
-           try {\r
-               // Retrieve words to remove\r
-            for (File aWordsList : wordsList) {\r
-                ftemp = aWordsList;\r
-                if (ftemp.exists()) {\r
-                    enProps.load(input = new FileInputStream(ftemp.getAbsolutePath()));\r
-                    input.close();\r
-                    c = enProps.values();\r
-                    cleanUpStrings.addAll(c);\r
-                    enProps.clear();\r
-                }\r
-            }\r
-\r
-               // Retrieve char to remove (punctuation for ex.)\r
-            for (String punctuationFile : punctuationFiles) {\r
-                tempStr = propsDir.concat(File.separator).concat(punctuationFile);\r
-                ftemp = new File(tempStr);\r
-                if (ftemp.exists()) {\r
-                    enProps.load(input = new FileInputStream(tempStr));\r
-                    input.close();\r
-                    c = enProps.values();\r
-                    cleanUpChars.addAll(c);\r
-                    enProps.clear();\r
-                }\r
-            }\r
-           }\r
-           catch (IOException e) {\r
-               e.printStackTrace();\r
-               return 1;\r
-           }\r
-       return 0;\r
-    }\r
-\r
-}\r
-*/\r
index ca808d52967d621b29d0c1edad1f9f351cc88747..a24cc18555829a64ef97fa3f9e186721e8bae6f8 100755 (executable)
@@ -13,14 +13,14 @@ import org.xml.sax.SAXParseException;
 \r
 /**\r
  * Generic parser for populating a DocFileInfo object.\r
- * \r
+ *\r
  * @version 2.0 2010-08-14\r
- * \r
+ *\r
  * @author N. Quaine\r
  * @author Kasun Gajasinghe\r
  */\r
 public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {\r
-       \r
+\r
        //members\r
        protected DocFileInfo fileDesc = null;\r
        protected String projectDir = null;\r
@@ -39,7 +39,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
        public SaxDocFileParser () {\r
 \r
        }\r
-       \r
+\r
        /**\r
         * Initializer\r
         */\r
@@ -48,16 +48,16 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
        }\r
 \r
        /**\r
-        * Parses the file to extract all the words for indexing and \r
-        * some data characterizing the file. \r
-        * @param file contains the fullpath of the document to parse  \r
+        * Parses the file to extract all the words for indexing and\r
+        * some data characterizing the file.\r
+        * @param file contains the fullpath of the document to parse\r
         * @return a DitaFileInfo object filled with data describing the file\r
         */\r
        public DocFileInfo runExtractData(File file) {\r
                //initialization\r
                fileDesc = new DocFileInfo(file);\r
                strbf = new StringBuffer("");\r
-               \r
+\r
                // Fill strbf by parsing the file\r
                parseDocument(file);\r
 \r
@@ -67,7 +67,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
        public void parseDocument (File file) {\r
 //        System.out.println(System.getProperty("org.xml.sax.driver"));\r
 //        System.out.println(System.getProperty("javax.xml.parsers.SAXParserFactory"));\r
-        \r
+\r
                //get a factory\r
                javax.xml.parsers.SAXParserFactory spf = javax.xml.parsers.SAXParserFactory.newInstance();\r
 \r
@@ -83,7 +83,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 \r
             //parse the file and also register this class for call backs\r
                        //System.out.println("Parsing: " + file);\r
-                       \r
+\r
                        long start = System.currentTimeMillis();\r
                        //System.out.println("about to parse " + file.getName() + " >>> " + start);\r
 \r
@@ -93,25 +93,25 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
                                is.setSystemId(file.toURI().toURL().toString());\r
                            sp.parse(is, this);\r
                        }\r
-                       \r
+\r
                        long finish = System.currentTimeMillis();\r
                        //System.out.println("done parsing " + file.getName() + " >>> " + finish);\r
                        //System.out.println("time = " + (finish - start) + " milliseconds");\r
-                       \r
+\r
                }catch(SAXParseException spe){\r
             System.out.println("SaxParseException: The indexing file contains incorrect xml syntax.");\r
             spe.printStackTrace();\r
         }catch(org.xml.sax.SAXException se) {\r
                        System.out.println("SaxException. You may need to include Xerces in your classpath. " +\r
                     "See documentation for details");\r
-                       se.printStackTrace(); \r
+                       se.printStackTrace();\r
                }catch(javax.xml.parsers.ParserConfigurationException pce) {\r
                        pce.printStackTrace();\r
                }catch (IOException ie) {\r
                        ie.printStackTrace();\r
                }\r
        }\r
-    \r
+\r
     private boolean addContent = false;\r
     private boolean addHeaderInfo = false;\r
     private boolean doNotIndex=false;\r
@@ -129,26 +129,26 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
                if((qName.equalsIgnoreCase("meta")) ) {\r
             addHeaderInfo = true;\r
                        String attrName = attributes.getValue("name");\r
-                       // OXYGEN PATCH START EXM-20576 - add scoring for keywords
-                       if(attrName != null && (attrName.equalsIgnoreCase("keywords") 
-                               || attrName.equalsIgnoreCase("description")
-                               || attrName.equalsIgnoreCase("indexterms")
-                               )){
-                           if (attrName.equalsIgnoreCase("keywords")) {
-                               String[] keywords = attributes.getValue("content").split(", ");
-                               for (int i = 0; i < keywords.length; i++) {
-                                   strbf.append(" " + keywords[i] + "@@@elem_meta_keywords@@@ ");
-                               }
-                           } else if (attrName.equalsIgnoreCase("indexterms")) {
-                               String[] indexterms = attributes.getValue("content").split(", ");
-                               for (int i = 0; i < indexterms.length; i++) {
-                                   strbf.append(" " + indexterms[i] + "@@@elem_meta_indexterms@@@ ");
-                               }
-                           } else {
-                               strbf.append(" " + attributes.getValue("content") + " ");
-                           }
-                       } 
-                       // OXYGEN PATCH END EXM-20576 - add scoring for indexterms
+                       // OXYGEN PATCH START EXM-20576 - add scoring for keywords\r
+                       if(attrName != null && (attrName.equalsIgnoreCase("keywords")\r
+                               || attrName.equalsIgnoreCase("description")\r
+                               || attrName.equalsIgnoreCase("indexterms")\r
+                               )){\r
+                           if (attrName.equalsIgnoreCase("keywords")) {\r
+                               String[] keywords = attributes.getValue("content").split(", ");\r
+                    for (String keyword : keywords) {\r
+                        strbf.append(" ").append(keyword).append("@@@elem_meta_keywords@@@ ");\r
+                    }\r
+                           } else if (attrName.equalsIgnoreCase("indexterms")) {\r
+                               String[] indexterms = attributes.getValue("content").split(", ");\r
+                    for (String indexterm : indexterms) {\r
+                        strbf.append(" ").append(indexterm).append("@@@elem_meta_indexterms@@@ ");\r
+                    }\r
+                           } else {\r
+                               strbf.append(" ").append(attributes.getValue("content") ).append(" ");\r
+                           }\r
+                       }\r
+                       // OXYGEN PATCH END EXM-20576 - add scoring for indexterms\r
                        // dwc: adding this to make the docbook <abstract> element\r
                        // (which becomes <meta name="description".../> in html)\r
                        // into the brief description that shows up in search\r
@@ -163,13 +163,9 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
                        tempVal = new StringBuffer();\r
                }\r
 \r
-        if(qName.equalsIgnoreCase("meta") || qName.equalsIgnoreCase("title") || qName.equalsIgnoreCase("shortdesc")){\r
-            addHeaderInfo = true;\r
-        } else {\r
-            addHeaderInfo = false;\r
-        }\r
+        addHeaderInfo = qName.equalsIgnoreCase("meta") || qName.equalsIgnoreCase("title") || qName.equalsIgnoreCase("shortdesc");\r
 \r
-        String elementId = attributes.getValue("id"); \r
+        String elementId = attributes.getValue("id");\r
         if("content".equals(elementId)) addContent = true;\r
 \r
         if(addContent) {\r
@@ -193,11 +189,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
             }\r
 \r
             String accessKey = attributes.getValue("accesskey");\r
-            if(accessKey!=null && ("n".equals(accessKey) || "p".equals(accessKey) || "h".equals(accessKey))){\r
-                doNotIndex = true;\r
-            } else {\r
-                doNotIndex = false;\r
-            }\r
+            doNotIndex = accessKey != null && ("n".equals(accessKey) || "p".equals(accessKey) || "h".equals(accessKey));\r
         }\r
                strbf.append(" ");\r
        }\r
@@ -207,7 +199,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 \r
                // index certain elements. E.g. Use this to implement a\r
                // "titles only" index,\r
-        \r
+\r
         //OXYGEN PATCH, gather more keywords.\r
                if(\r
 //                             (addContent || addHeaderInfo) && \r
@@ -221,64 +213,64 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
                        // Do a minimal clean\r
                        text = minimalClean(text, null, null);\r
                        text = text.replaceAll("\\s+"," ");\r
-                       String marker = "@@@elem_" + stack.peek() + "@@@ ";
-                       Matcher m = Pattern.compile("(\\w|-|:)+").matcher(text);
+                       String marker = "@@@elem_" + stack.peek() + "@@@ ";\r
+                       Matcher m = Pattern.compile("(\\w|-|:)+").matcher(text);\r
                        if (text.trim().length() > 0 && m.find()) {\r
-                           String copyText = new String(originalText);
-                           text = duplicateWords(copyText, text, "-");
-                           copyText = new String(originalText);
-                           text = duplicateWords(copyText, text, ":");
-                           copyText = new String(originalText);
-                           text = duplicateWords(copyText, text, ".");
+                           String copyText = new String(originalText);\r
+                           text = duplicateWords(copyText, text, "-");\r
+                           copyText = new String(originalText);\r
+                           text = duplicateWords(copyText, text, ":");\r
+                           copyText = new String(originalText);\r
+                           text = duplicateWords(copyText, text, ".");\r
                                // Replace whitespace with the marker\r
                                text = text.replace(" ", marker);\r
                                text = text + marker;\r
                        }\r
                        // END OXYGEN PATCH\r
                        strbf.append(text);\r
-//                     System.out.println("=== marked text: " + text);
+//                     System.out.println("=== marked text: " + text);\r
                        // START OXYGEN PATCH, append the original text\r
                        if (tempVal != null) { tempVal.append(originalText);}\r
                        // END OXYGEN PATCH\r
                }\r
        }\r
-       \r
-       // START OXYGEN PATCH EXM-20414
-       private String duplicateWords(String sourceText, String acumulator, String separator) {
-//         System.out.println("sourceText: " + sourceText + "   separator: " + separator);
-           int index = sourceText.indexOf(separator);
-           while (index >= 0) {
-               int indexSpaceAfter = sourceText.indexOf(" ", index);
-               String substring = null;
-               if (indexSpaceAfter >= 0) {
-                   substring = sourceText.substring(0, indexSpaceAfter);
-                   sourceText = sourceText.substring(indexSpaceAfter);
-               } else {
-                   substring = sourceText;
-                   sourceText = "";
-               }
-               
-               int indexSpaceBefore = substring.lastIndexOf(" ");
-               if (indexSpaceBefore >= 0) {
-                   substring = substring.substring(indexSpaceBefore + 1);
-               }
-               if (separator.indexOf(".") >= 0) {
-                   separator = separator.replaceAll("\\.", "\\\\.");
-//                 System.out.println("++++++++++ separator: " + separator);
-               }
-               String[] tokens = substring.split(separator);
-
-               for (int i = 0; i < tokens.length; i++) {
-                   acumulator = acumulator + " " + tokens[i];
-//                 System.out.println("added token: " + tokens[i] + "  new text: " + acumulator);
-               }
-               
-               index = sourceText.indexOf(separator);
-           }
-           
-           return acumulator;
-       }
-       // END OXYGEN PATCH EXM-20414
+\r
+       // START OXYGEN PATCH EXM-20414\r
+       private String duplicateWords(String sourceText, String acumulator, String separator) {\r
+//         System.out.println("sourceText: " + sourceText + "   separator: " + separator);\r
+           int index = sourceText.indexOf(separator);\r
+           while (index >= 0) {\r
+               int indexSpaceAfter = sourceText.indexOf(" ", index);\r
+               String substring = null;\r
+               if (indexSpaceAfter >= 0) {\r
+                   substring = sourceText.substring(0, indexSpaceAfter);\r
+                   sourceText = sourceText.substring(indexSpaceAfter);\r
+               } else {\r
+                   substring = sourceText;\r
+                   sourceText = "";\r
+               }\r
+\r
+               int indexSpaceBefore = substring.lastIndexOf(" ");\r
+               if (indexSpaceBefore >= 0) {\r
+                   substring = substring.substring(indexSpaceBefore + 1);\r
+               }\r
+               if (separator.indexOf(".") >= 0) {\r
+                   separator = separator.replaceAll("\\.", "\\\\.");\r
+//                 System.out.println("++++++++++ separator: " + separator);\r
+               }\r
+               String[] tokens = substring.split(separator);\r
+\r
+            for (String token : tokens) {\r
+                acumulator = acumulator + " " + token;\r
+//                 System.out.println("added token: " + tokens[i] + "  new text: " + acumulator);\r
+            }\r
+\r
+            index = sourceText.indexOf(separator);\r
+           }\r
+\r
+           return acumulator;\r
+       }\r
+       // END OXYGEN PATCH EXM-20414\r
        public void endElement(String uri, String localName, String qName) throws org.xml.sax.SAXException {\r
                // START OXYGEN PATCH, remove element from stack\r
                stack.pop();\r
@@ -292,31 +284,31 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
                else if (shortdescBool) {\r
                        shortTagCpt --;\r
                        if (shortTagCpt == 0) {\r
-                               String shortdesc = tempVal.toString().replace('\n', ' ');
-                               if(shortdesc.trim().length() > 0) {
-                                       fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(shortdesc));
-                               }
+                               String shortdesc = tempVal.toString().replace('\n', ' ');\r
+                               if(shortdesc.trim().length() > 0) {\r
+                                       fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(shortdesc));\r
+                               }\r
                        tempVal = null;\r
                        shortdescBool = false;\r
                        }\r
                }\r
-        \r
+\r
         if(qName.equalsIgnoreCase("div") && addContent){\r
             divCount--;\r
             if (divCount == 0) {\r
                 addContent = false;\r
             }\r
-        } \r
+        }\r
        }\r
-       \r
+\r
        public void processingInstruction(String target, String data) throws org.xml.sax.SAXException {\r
                //do nothing\r
-               \r
+\r
        }\r
-       \r
-       /*public InputSource resolveEntity(String publicId, String systemId) \r
+\r
+       /*public InputSource resolveEntity(String publicId, String systemId)\r
                throws IOException, SAXException {\r
-               \r
+\r
                // use the catalog to solve the doctype\r
                System.out.println("entities " + publicId + systemId);\r
                return null;\r
@@ -325,13 +317,13 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
        throws org.xml.sax.SAXException, IOException {\r
                //System.out.println("Entities " + publicId + "and" + systemId);\r
                // use dita ot (dost.jar) for resolving dtd paths using the calatog\r
-               \r
+\r
        return null;\r
        }\r
 \r
     /**\r
-     * Removes the validation in html files, such as xml version and DTDs  \r
-     * @param file\r
+     * Removes the validation in html files, such as xml version and DTDs\r
+     * @param file the html file\r
      * @return int: returns 0 if no IOException occurs, else 1.\r
      */\r
        public String RemoveValidationPI (File file) {\r
@@ -348,36 +340,35 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
                                int i1, i2;\r
                                boolean ok = true;\r
                                try {\r
-       \r
+\r
                                        String line = br.readLine();\r
-       \r
-                               \r
+\r
                                        if (line == null) {\r
                                                break;\r
                                        }\r
                                        //ok = line.matches("(.)*\\x26nbsp\\x3B(.)*");\r
-                                       \r
+\r
                                        line = line.replaceAll("\\x26nbsp\\x3B", "&#160;");\r
-       \r
+\r
                                        if (!line.contains("<!DOCTYPE html PUBLIC")) {\r
                                                //dwc: This doesn't really apply to me. I already omit the xml pi for other reasons.\r
                                                if (line.contains("<?xml version")) {\r
                                                        line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");\r
                                                }\r
 \r
-                        sb.append(line + "\n");\r
-                                       } else  \r
+                        sb.append(line).append("\n");\r
+                                       } else\r
                                        {\r
                                                //dwc: What is this trying to do? Nuke the DOCTYPE? Why?\r
                                                i1 = line.indexOf("<!DOCTYPE");\r
                                                i2 = line.indexOf(">", i1);\r
                                                while (i2 < 0) {\r
-                                                       \r
+\r
                                                        line = line.concat(br.readLine());\r
                                                        i2 = line.indexOf(">", i1);\r
                                                }\r
                                                String temp = line.substring(i1, i2);\r
-                                               \r
+\r
                                                //ok = line.matches("(.)*\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E(.)*");\r
                                                if (line.contains("<?xml version")) {\r
                                                        line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");\r
@@ -399,7 +390,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
                {\r
                        return null;\r
                }\r
-               \r
+\r
                return sb.toString(); // return status\r
 \r
        }\r
index abac4aeff594f953256c2afd4ba869cf2051983f..c57bbf7cd2b258b388a8c2c2e54924425d5cc81a 100755 (executable)
@@ -67,8 +67,8 @@ public class SaxHTMLIndex extends SaxDocFileParser{
        private int SCORING_FOR_BOLD = 5;\r
        private int SCORING_FOR_ITALIC = 3;\r
        private int SCORING_FOR_NORMAL_TEXT = 1;\r
-       private int SCORING_FOR_KEYWORD = 100;
-       private int SCORING_FOR_INDEXTERM = 75;
+       private int SCORING_FOR_KEYWORD = 100;\r
+       private int SCORING_FOR_INDEXTERM = 75;\r
        \r
        /**\r
         * The list with the word and scoring object\r
@@ -92,14 +92,17 @@ public class SaxHTMLIndex extends SaxDocFileParser{
        }\r
        /**\r
         * Constructor\r
-        */\r
+     * @param cleanUpStrings\r
+     */\r
        public SaxHTMLIndex (ArrayList <String> cleanUpStrings) {\r
                super();\r
                cleanUpList = cleanUpStrings;\r
        }\r
        /**\r
         * Constructor\r
-        */\r
+     * @param cleanUpStrings\r
+     * @param cleanUpChars\r
+     */\r
        public SaxHTMLIndex (ArrayList <String> cleanUpStrings, ArrayList <String> cleanUpChars) {\r
                super();\r
                cleanUpList = cleanUpStrings;\r
@@ -108,7 +111,8 @@ public class SaxHTMLIndex extends SaxDocFileParser{
 \r
        /**\r
         * Initializer\r
-        */\r
+     * @param tempMap\r
+     */\r
        public int init(Map<String,String> tempMap){\r
                tempDico = tempMap;\r
                return 0;\r
@@ -142,8 +146,8 @@ public class SaxHTMLIndex extends SaxDocFileParser{
         //Do Stemming for words in items\r
         //TODO currently, stemming support is for english and german only. Add support for other languages as well.\r
 \r
-        // START OXYGEN PATCH
-        wsList = new ArrayList<WordAndScoring>();
+        // START OXYGEN PATCH\r
+        wsList = new ArrayList<WordAndScoring>();\r
         // START OXYGEN PATCH, create the words and scoring list\r
 //        String[] tokenizedItems;\r
         // END OXYGEN PATCH\r
@@ -151,8 +155,8 @@ public class SaxHTMLIndex extends SaxDocFileParser{
                 || indexerLanguage.equalsIgnoreCase("ko")){\r
                 LinkedList<String> tokens = new LinkedList<String>();\r
             try{\r
-               //EXM-21501 Oxygen patch, replace the extra "@@@"s.
-               str = str.replaceAll("@@@([^\\s]*)@@@", "");
+               //EXM-21501 Oxygen patch, replace the extra "@@@"s.\r
+               str = str.replaceAll("@@@([^\\s]*)@@@", "");\r
                 CJKAnalyzer analyzer = new CJKAnalyzer(org.apache.lucene.util.Version.LUCENE_30);\r
                 Reader reader = new StringReader(str);\r
                 TokenStream stream = analyzer.tokenStream("", reader);\r
@@ -162,23 +166,23 @@ public class SaxHTMLIndex extends SaxDocFileParser{
                 while (stream.incrementToken()) {\r
                     String term = termAtt.term();\r
                     tokens.add(term);\r
-                    WordAndScoring ws = new WordAndScoring(term, term, 1);
-                    boolean found = false;
-                               for (int i = 0; i < wsList.size(); i++) { 
-                                       // If the stem of the current word is already in list, 
-                                       // do not add the word in the list, just recompute scoring
-                                       if (wsList.get(i).getStem().equals(ws.getStem())) {
-                                               found = true;
-                                               int scoring = wsList.get(i).getScoring();
-                                               wsList.get(i).setScoring(scoring + ws.getScoring());
-                                               break;
+                    WordAndScoring ws = new WordAndScoring(term, term, 1);\r
+                    boolean found = false;\r
+                    for (WordAndScoring aWsList : wsList) {\r
+                        // If the stem of the current word is already in list,\r
+                        // do not add the word in the list, just recompute scoring\r
+                        if (aWsList.getStem().equals(ws.getStem())) {\r
+                            found = true;\r
+                            int scoring = aWsList.getScoring();\r
+                            aWsList.setScoring(scoring + ws.getScoring());\r
+                            break;\r
+                        }\r
+\r
+                    }\r
+                               if (!found) {\r
+                                       wsList.add(ws);\r
+                               }\r
                 }\r
-\r
-                                       }
-                               if (!found) {
-                                       wsList.add(ws);
-                               }
-                }
                 // START OXYGEN PATCH\r
                 //tokenizedItems = tokens.toArray(new String[tokens.size()]);\r
                 // END OXYGEN PATCH\r
@@ -199,7 +203,7 @@ public class SaxHTMLIndex extends SaxDocFileParser{
             } else if (indexerLanguage.equalsIgnoreCase("fr")){\r
                 stemmer= new FrenchStemmer();\r
             } else {\r
-                stemmer = null;//Languages which stemming is not yet supproted.So, No stemmers will be used.\r
+                stemmer = null;//Languages which stemming is not yet supported.So, No stemmers will be used.\r
             }\r
             // START OXYGEN PATCH\r
             wsList = new ArrayList<WordAndScoring>();\r
@@ -210,16 +214,16 @@ public class SaxHTMLIndex extends SaxDocFileParser{
                        WordAndScoring ws = getWordAndScoring(token, stemmer, stem);\r
                        if (ws != null) {\r
                                boolean found = false;\r
-                               for (int i = 0; i < wsList.size(); i++) { \r
-                                       // If the stem of the current word is already in list, \r
-                                       // do not add the word in the list, just recompute scoring\r
-                                       if (wsList.get(i).getStem().equals(ws.getStem())) {\r
-                                               found = true;\r
-                                               int scoring = wsList.get(i).getScoring();\r
-                                               wsList.get(i).setScoring(scoring + ws.getScoring());\r
-                                               break;\r
-                                       }\r
-                                       }\r
+                    for (WordAndScoring aWsList : wsList) {\r
+                        // If the stem of the current word is already in list,\r
+                        // do not add the word in the list, just recompute scoring\r
+                        if (aWsList.getStem().equals(ws.getStem())) {\r
+                            found = true;\r
+                            int scoring = aWsList.getScoring();\r
+                            aWsList.setScoring(scoring + ws.getScoring());\r
+                            break;\r
+                        }\r
+                    }\r
                                if (!found) {\r
                                        wsList.add(ws);\r
                                }\r
@@ -256,10 +260,11 @@ public class SaxHTMLIndex extends SaxDocFileParser{
                        ;\r
                        //System.out.println("temp="+s+"="+temp);\r
                        tempDico.put(s.getStem(), temp);\r
-               }else {\r
-                       String temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring()));\r
-                       tempDico.put(s.getStem(), temp);\r
-               }\r
+               }else if (s != null) {\r
+                    String temp = null;\r
+                    temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring()));\r
+                    tempDico.put(s.getStem(), temp);\r
+            }\r
                // END OXYGEN PATCH\r
         }\r
 \r
@@ -301,10 +306,10 @@ public class SaxHTMLIndex extends SaxDocFileParser{
                                        scoring = SCORING_FOR_ITALIC;\r
                                } else if ("strong".equalsIgnoreCase(elementName)) {\r
                                        scoring = SCORING_FOR_BOLD;\r
-                               } else if ("meta_keywords".equalsIgnoreCase(elementName)) {
-                                       scoring = SCORING_FOR_KEYWORD;
-                               } else if ("meta_indexterms".equalsIgnoreCase(elementName)) {
-                                       scoring = SCORING_FOR_INDEXTERM;
+                               } else if ("meta_keywords".equalsIgnoreCase(elementName)) {\r
+                                       scoring = SCORING_FOR_KEYWORD;\r
+                               } else if ("meta_indexterms".equalsIgnoreCase(elementName)) {\r
+                                       scoring = SCORING_FOR_INDEXTERM;\r
                                }\r
                                // Get the stemmed word\r
                                String stemWord = word;\r
@@ -363,17 +368,15 @@ public class SaxHTMLIndex extends SaxDocFileParser{
                }else {\r
                        // Clean-up using the props files\r
                        tempStrBuf.append("\\ba\\b");\r
-                       Iterator it = cleanUpList.iterator();\r
-                       while (it.hasNext()){\r
-                               tempStrBuf.append("|\\b").append(it.next()).append("\\b");\r
-                       }\r
+            for (String aCleanUp : cleanUpList) {\r
+                tempStrBuf.append("|\\b").append(aCleanUp ).append("\\b");\r
+            }\r
                }\r
                if ((cleanUpPunctuation != null) && (!cleanUpPunctuation.isEmpty())){\r
                        tempCharBuf.append("\\u3002");\r
-                       Iterator it = cleanUpPunctuation.iterator();\r
-                       while (it.hasNext()){\r
-                               tempCharBuf.append("|"+it.next());
-                       }\r
+            for (String aCleanUpPunctuation : cleanUpPunctuation) {\r
+                tempCharBuf.append("|").append(aCleanUpPunctuation);\r
+            }\r
                }\r
 \r
                str = minimalClean(str, tempStrBuf, tempCharBuf);\r
index 44f67041e4be9a15759dfe2ec14fdfdbbeb6bf41..5c487e9f3c454d95b6170df5de2aae1aeeeed8e5 100755 (executable)
@@ -16,12 +16,12 @@ public class BlankRemover
         return (source==null)? null : source.replaceAll("[\\s\u00A0]+$", "");\r
     }\r
 \r
-    /* replace multiple whitespaces between words with single blank */\r
+    /* replace multiple whitespace between words with single blank */\r
     public static String itrim(String source) {\r
         return (source==null)? null : source.replaceAll("\\b[\\s\u00A0]{2,}\\b", " ");\r
     }\r
 \r
-    /* remove all superfluous whitespaces in source string */\r
+    /* remove all superfluous whitespace in source string */\r
     public static String rmWhiteSpace(String source) {\r
                //System.out.println("Trimmed: '" + itrim(ltrim(rtrim(source))) + "'");\r
         return (source==null)? null : itrim(ltrim(rtrim(source)));\r
index 21538404c9dd9c87cee1aa43a67a97de87ef91c4..e24cda8e020183b4ec39558560d790441d3acda3 100755 (executable)
@@ -13,34 +13,34 @@ public class DirList {
        String [] topicFiles = null;
        public static final int MAX_DEPTH = 10;
     
-  public DirList(File inputdir, String regex, int depth) {
+  public DirList(File inputDir, String regexp, int depth) {
     try {
       
       listFiles = new ArrayList<File> ();
        
     // not yet implemented     
-      if(regex == null) {
-          for (File f: inputdir.listFiles()) {
+      if(regexp == null) {
+          for (File f: inputDir.listFiles()) {
                  if (!f.isDirectory()){
                          listFiles.add(f);
                  }else {
                          if (depth < MAX_DEPTH ) {
-                               DirList nsiDoc = new DirList(f,regex,depth+1);
+                               DirList nsiDoc = new DirList(f,regexp,depth+1);
                                listFiles.addAll(new ArrayList<File>(nsiDoc.getListFiles()));
                          }
                  }
           }
       }
       else {
-          for (File f: inputdir.listFiles(new DirFilter(regex))) {
+          for (File f: inputDir.listFiles(new DirFilter(regexp))) {
                  listFiles.add(f);
           }
 // Patch from Oxygen to address problem where directories
 // containing . were not traversed.
-          for (File f: inputdir.listFiles(new DirFilter(".*"))) {
+          for (File f: inputDir.listFiles(new DirFilter(".*"))) {
                  if (f.isDirectory()){
                          if (depth < MAX_DEPTH ) {
-                               DirList nsiDoc = new DirList(f,regex, depth+1);
+                               DirList nsiDoc = new DirList(f,regexp, depth+1);
                                listFiles.addAll(new ArrayList<File>(nsiDoc.getListFiles()));
                          }
                  }