granicus.if.org Git - docbook-dsssl/commitdiff
Proper formatting for Java files before starting to implement the new features
author Kasun Gajasinghe <kasunbg@gmail.com>
Fri, 9 Sep 2011 18:48:56 +0000 (18:48 +0000)
committer Kasun Gajasinghe <kasunbg@gmail.com>
Fri, 9 Sep 2011 18:48:56 +0000 (18:48 +0000)
xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerConstants.java
xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java
xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java
xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java
xsl-webhelpindexer/src/com/nexwave/nquindexer/TesterIndexer.java
xsl-webhelpindexer/src/com/nexwave/nquindexer/WriteJSFiles.java
xsl-webhelpindexer/src/com/nexwave/nsidita/BlankRemover.java
xsl-webhelpindexer/src/com/nexwave/nsidita/DirList.java
xsl-webhelpindexer/src/com/nexwave/nsidita/DocFileInfo.java

xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerConstants.java
index 407c5b5859c2877103b78029d03ef7895f6c9b91..aeb736bd0f7abb6c4fe4fbf44c75f1389dbc0f45 100755 (executable)
@@ -2,15 +2,14 @@ package com.nexwave.nquindexer;
 \r
 /**\r
  * Constants used for the indexer.\r
- * \r
- * @version 2.0 2008-02-26\r
- * \r
+ *\r
  * @author N. Quaine\r
+ * @version 2.0 2008-02-26\r
  */\r
 public abstract class IndexerConstants {\r
     // European punctuation\r
-       public static final String EUPUNCTUATION1 = "[$|%,;'()\\/*\"{}=!&+<>#\\?]|\\[|\\]|[-][-]+";
-       public static final String EUPUNCTUATION2 = "[$,;'()\\/*\"{}=!&+<>\\\\]";       
+    public static final String EUPUNCTUATION1 = "[$|%,;'()\\/*\"{}=!&+<>#\\?]|\\[|\\]|[-][-]+";\r
+    public static final String EUPUNCTUATION2 = "[$,;'()\\/*\"{}=!&+<>\\\\]";\r
     // Japanese punctuation\r
     public static final String JPPUNCTUATION1 = "\\u3000|\\u3001|\\u3002|\\u3003|\\u3008|\\u3009|\\u300C|\\u300D";\r
     public static final String JPPUNCTUATION2 = "\\u3013|\\u3014|\\u3015|\\u301C|\\u301D|\\u301E|\\u301F";\r
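For context on how these punctuation constants are consumed: SaxDocFileParser.minimalClean (further down in this commit) strips them out of the text with String.replaceAll before indexing. The following is a minimal, hypothetical sketch of that usage; the demo class name and the sample input string are invented for illustration.

import com.nexwave.nquindexer.IndexerConstants;

public class PunctuationDemo {
    public static void main(String[] args) {
        // Invented sample text; the replaceAll calls mirror minimalClean below.
        String text = "foo, bar (baz) -- qux!";
        text = text.replaceAll(IndexerConstants.EUPUNCTUATION1, " ");
        text = text.replaceAll(IndexerConstants.EUPUNCTUATION2, " ");
        text = text.replaceAll("\\s+", " ").trim();
        System.out.println(text); // roughly: foo bar baz qux
    }
}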
xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java
index 63cbe9c935d4b85f78ec1bcabfd445b92353456d..0f16a865d596bb5c0170abf6d3769241baf12b38 100644 (file)
@@ -10,8 +10,10 @@ import java.util.*;
 
 /**
  * Main class of Stand-alone version of WebHelpIndexer
+ *
  * User: Kasun Gajasinghe, University of Moratuwa, http://kasunbg.blogspot.com
  * Date: Feb 10, 2011
+ *
  * @author Kasun Gajasinghe
  */
 
@@ -26,8 +28,8 @@ public class IndexerMain {
     private String txt_no_words_gathered = "No words have been indexed in";
     private String txt_no_html_files = "No HTML Files found in";
     private String txt_no_args = "No argument given: you must provide an htmlDir to the IndexerMain";
-    
-    private static String txt_no_lang_specified ="Language of the content is not specified. Defaults to English.";
+
+    private static String txt_no_lang_specified = "Language of the content is not specified. Defaults to English.";
 
     //working directories
     private String searchdir = "search";
@@ -51,13 +53,14 @@ public class IndexerMain {
     //Html extension
     private String htmlExtension = "html";
 
-       // OXYGEN PATCH START
-       //Table of contents file name
-       private String tocfile;
-       private boolean stem;
-       // OXYGEN PATCH END
+    // OXYGEN PATCH START
+    //Table of contents file name
+    private String tocfile;
+    private boolean stem;
+    // OXYGEN PATCH END
 
     // Constructors
+
     public IndexerMain(String htmlDir, String indexerLanguage) {
         super();
         setHtmlDir(htmlDir);
@@ -65,7 +68,8 @@ public class IndexerMain {
     }
 
     /**
-     * The content language defaults to English "en" 
+     * The content language defaults to English "en"
+     *
      * @param htmlDir The directory where html files reside.
      */
     public IndexerMain(String htmlDir) {
@@ -142,13 +146,13 @@ public class IndexerMain {
             indexer = new IndexerMain(args[0]);
         } else if (args.length >= 2) {
 
-            indexer = new IndexerMain(args[0], args[1]);                        
-        } else { 
-            throw new  RuntimeException("Please specify the parameters htmlDirectory and " +
-                    "indexerLanguage (optional). \n "+
+            indexer = new IndexerMain(args[0], args[1]);
+        } else {
+            throw new RuntimeException("Please specify the parameters htmlDirectory and " +
+                    "indexerLanguage (optional). \n " +
                     "ex: java -jar webhelpindexer.jar docs/content en \n" +
                     "The program will exit now."
-                    );
+            );
         }
 
         indexer.execute();
@@ -242,15 +246,15 @@ public class IndexerMain {
         // Get the list of all html files with relative paths
         htmlFilesPathRel = nsiDoc.getListFilesRelTo(projectDir);
 
-               // OXYGEN PATCH START. 
-               // Remove the table of content file 
-               Iterator<String> iterator = htmlFilesPathRel.iterator();
-               while (iterator.hasNext()) {
-                       if (iterator.next().endsWith(tocfile + "." + htmlExtension)) {
-                               iterator.remove();
-                       }
-               }
-               // OXYGEN PATCH END
+        // OXYGEN PATCH START.
+        // Remove the table of content file
+        Iterator<String> iterator = htmlFilesPathRel.iterator();
+        while (iterator.hasNext()) {
+            if (iterator.next().endsWith(tocfile + "." + htmlExtension)) {
+                iterator.remove();
+            }
+        }
+        // OXYGEN PATCH END
         if (htmlFiles == null) {
             System.out.println(txt_no_files_found);
             return;
@@ -260,7 +264,7 @@ public class IndexerMain {
         }
 
         // Create the list of the existing html files (index starts at 0)
-               WriteJSFiles.WriteHTMLList(outputDir.concat(File.separator).concat(htmlList), htmlFilesPathRel, stem);
+        WriteJSFiles.WriteHTMLList(outputDir.concat(File.separator).concat(htmlList), htmlFilesPathRel, stem);
 
         // Parse each html file to retrieve the words:
         // ------------------------------------------
@@ -284,34 +288,34 @@ public class IndexerMain {
             // parse each html files
             while (it.hasNext()) {
                 File ftemp = (File) it.next();
-                               // OXYGEN PATCH START. Remove table of content file
-                               if (!ftemp.getAbsolutePath().endsWith(tocfile + "." + htmlExtension)) {
-                               // OXYGEN PATCH END
-                //tempMap.put(key, value);
-                //The HTML file information are added in the list of FileInfoObject
-                                       DocFileInfo docFileInfoTemp = new DocFileInfo(spe.runExtractData(ftemp,indexerLanguage, stem));
-
-                ftemp = docFileInfoTemp.getFullpath();
-                String stemp = ftemp.toString();
-                int i = stemp.indexOf(projectDir);
-                if (i != 0) {
-                    System.out.println("the documentation root does not match with the documentation input!");
-                    return;
+                // OXYGEN PATCH START. Remove table of content file
+                if (!ftemp.getAbsolutePath().endsWith(tocfile + "." + htmlExtension)) {
+                    // OXYGEN PATCH END
+                    //tempMap.put(key, value);
+                    //The HTML file information are added in the list of FileInfoObject
+                    DocFileInfo docFileInfoTemp = new DocFileInfo(spe.runExtractData(ftemp, indexerLanguage, stem));
+
+                    ftemp = docFileInfoTemp.getFullpath();
+                    String stemp = ftemp.toString();
+                    int i = stemp.indexOf(projectDir);
+                    if (i != 0) {
+                        System.out.println("the documentation root does not match with the documentation input!");
+                        return;
+                    }
+                    int ad = 1;
+                    if (stemp.equals(projectDir)) ad = 0;
+                    stemp = stemp.substring(i + projectDir.length() + ad);  //i is redundant (i==0 always)
+                    ftemp = new File(stemp);
+                    docFileInfoTemp.setFullpath(ftemp);
+
+                    filesDescription.add(docFileInfoTemp);
+                    // OXYGEN PATCH START
+                    // Remove the table of content file
+                } else {
+                    it.remove();
                 }
-                int ad = 1;
-                if (stemp.equals(projectDir)) ad = 0;
-                stemp = stemp.substring(i + projectDir.length() + ad);  //i is redundant (i==0 always)
-                ftemp = new File(stemp);
-                docFileInfoTemp.setFullpath(ftemp);
-
-                filesDescription.add(docFileInfoTemp);
-                               // OXYGEN PATCH START
-                               // Remove the table of content file
-                               } else {
-                                       it.remove();
+                // OXYGEN PATCH END
             }
-                               // OXYGEN PATCH END                                     
-                       }
             /*remove empty strings from the map*/
             if (tempDico.containsKey("")) {
                 tempDico.remove("");
@@ -414,14 +418,16 @@ public class IndexerMain {
         return 0;
     }
 
-       // OXYGEN PATCH START
-       // Set table of content file
+    // OXYGEN PATCH START
+    // Set table of content file
+
     public void setTocfile(String tocfile) {
-       this.tocfile = tocfile;
+        this.tocfile = tocfile;
     }
     // If true then generate js files with stemming words
+
     public void setStem(boolean stem) {
-       this.stem = stem;
+        this.stem = stem;
     }
-       // OXYGEN PATCH END
+    // OXYGEN PATCH END
 }
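For reference, the stand-alone indexer reformatted above can also be driven from Java rather than the command line. The harness below is a hypothetical sketch based only on the constructors, the setTocfile/setStem setters and the execute() call visible in this diff (it assumes execute() is accessible from outside the class); the directory, language and TOC basename are placeholders, equivalent to the documented "java -jar webhelpindexer.jar docs/content en" invocation.

import com.nexwave.nquindexer.IndexerMain;

public class RunWebhelpIndexer {
    public static void main(String[] args) {
        // Placeholder html directory and content language.
        IndexerMain indexer = new IndexerMain("docs/content", "en");
        indexer.setTocfile("index"); // hypothetical TOC page basename to exclude from indexing
        indexer.setStem(true);       // generate the js index files with stemmed words
        indexer.execute();
    }
}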
xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java
index a24cc18555829a64ef97fa3f9e186721e8bae6f8..a415d268e9113a5fd337501b24fda8ac548c2f4e 100755 (executable)
@@ -14,164 +14,166 @@ import org.xml.sax.SAXParseException;
 /**\r
  * Generic parser for populating a DocFileInfo object.\r
  *\r
- * @version 2.0 2010-08-14\r
- *\r
  * @author N. Quaine\r
  * @author Kasun Gajasinghe\r
+ * @version 2.0 2010-08-14\r
  */\r
 public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {\r
 \r
-       //members\r
-       protected DocFileInfo fileDesc = null;\r
-       protected String projectDir = null;\r
-       protected StringBuffer strbf = null;\r
-       private String currentElName = "";\r
-       private StringBuffer tempVal = null;\r
-       private boolean shortdescBool = false;\r
-       private int shortTagCpt = 0;\r
-\r
-       // OXYGEN PATCH. Keep the stack of elements\r
-       Stack<String> stack = new Stack<String>();\r
-       //methods\r
-       /**\r
-        * Constructor\r
-        */\r
-       public SaxDocFileParser () {\r
-\r
-       }\r
-\r
-       /**\r
-        * Initializer\r
-        */\r
-       public int init(String inputDir){\r
-               return 0;\r
-       }\r
-\r
-       /**\r
-        * Parses the file to extract all the words for indexing and\r
-        * some data characterizing the file.\r
-        * @param file contains the fullpath of the document to parse\r
-        * @return a DitaFileInfo object filled with data describing the file\r
-        */\r
-       public DocFileInfo runExtractData(File file) {\r
-               //initialization\r
-               fileDesc = new DocFileInfo(file);\r
-               strbf = new StringBuffer("");\r
-\r
-               // Fill strbf by parsing the file\r
-               parseDocument(file);\r
-\r
-               return fileDesc;\r
-       }\r
-\r
-       public void parseDocument (File file) {\r
+    //members\r
+    protected DocFileInfo fileDesc = null;\r
+    protected String projectDir = null;\r
+    protected StringBuffer strbf = null;\r
+    private String currentElName = "";\r
+    private StringBuffer tempVal = null;\r
+    private boolean shortdescBool = false;\r
+    private int shortTagCpt = 0;\r
+\r
+    // OXYGEN PATCH. Keep the stack of elements\r
+    Stack<String> stack = new Stack<String>();\r
+    //methods\r
+\r
+    /**\r
+     * Constructor\r
+     */\r
+    public SaxDocFileParser() {\r
+\r
+    }\r
+\r
+    /**\r
+     * Initializer\r
+     */\r
+    public int init(String inputDir) {\r
+        return 0;\r
+    }\r
+\r
+    /**\r
+     * Parses the file to extract all the words for indexing and\r
+     * some data characterizing the file.\r
+     *\r
+     * @param file contains the fullpath of the document to parse\r
+     * @return a DitaFileInfo object filled with data describing the file\r
+     */\r
+    public DocFileInfo runExtractData(File file) {\r
+        //initialization\r
+        fileDesc = new DocFileInfo(file);\r
+        strbf = new StringBuffer("");\r
+\r
+        // Fill strbf by parsing the file\r
+        parseDocument(file);\r
+\r
+        return fileDesc;\r
+    }\r
+\r
+    public void parseDocument(File file) {\r
 //        System.out.println(System.getProperty("org.xml.sax.driver"));\r
 //        System.out.println(System.getProperty("javax.xml.parsers.SAXParserFactory"));\r
 \r
-               //get a factory\r
-               javax.xml.parsers.SAXParserFactory spf = javax.xml.parsers.SAXParserFactory.newInstance();\r
+        //get a factory\r
+        javax.xml.parsers.SAXParserFactory spf = javax.xml.parsers.SAXParserFactory.newInstance();\r
 \r
-               spf.setValidating(false);\r
+        spf.setValidating(false);\r
         addContent = false;\r
-               divCount = 0;\r
-               try {\r
-                       //get a new instance of parser\r
-                       javax.xml.parsers.SAXParser sp = spf.newSAXParser();\r
-                       // deactivate the validation\r
-                       sp.getXMLReader().setFeature("http://xml.org/sax/features/external-general-entities", false);\r
-                       sp.getXMLReader().setFeature( "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);\r
+        divCount = 0;\r
+        try {\r
+            //get a new instance of parser\r
+            javax.xml.parsers.SAXParser sp = spf.newSAXParser();\r
+            // deactivate the validation\r
+            sp.getXMLReader().setFeature("http://xml.org/sax/features/external-general-entities", false);\r
+            sp.getXMLReader().setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);\r
 \r
             //parse the file and also register this class for call backs\r
-                       //System.out.println("Parsing: " + file);\r
+            //System.out.println("Parsing: " + file);\r
 \r
-                       long start = System.currentTimeMillis();\r
-                       //System.out.println("about to parse " + file.getName() + " >>> " + start);\r
+            long start = System.currentTimeMillis();\r
+            //System.out.println("about to parse " + file.getName() + " >>> " + start);\r
 \r
-                       String content = RemoveValidationPI (file);\r
-                       if (content != null){\r
-                               InputSource is = new InputSource(new StringReader(content));\r
-                               is.setSystemId(file.toURI().toURL().toString());\r
-                           sp.parse(is, this);\r
-                       }\r
+            String content = RemoveValidationPI(file);\r
+            if (content != null) {\r
+                InputSource is = new InputSource(new StringReader(content));\r
+                is.setSystemId(file.toURI().toURL().toString());\r
+                sp.parse(is, this);\r
+            }\r
 \r
-                       long finish = System.currentTimeMillis();\r
-                       //System.out.println("done parsing " + file.getName() + " >>> " + finish);\r
-                       //System.out.println("time = " + (finish - start) + " milliseconds");\r
+            long finish = System.currentTimeMillis();\r
+            //System.out.println("done parsing " + file.getName() + " >>> " + finish);\r
+            //System.out.println("time = " + (finish - start) + " milliseconds");\r
 \r
-               }catch(SAXParseException spe){\r
+        } catch (SAXParseException spe) {\r
             System.out.println("SaxParseException: The indexing file contains incorrect xml syntax.");\r
             spe.printStackTrace();\r
-        }catch(org.xml.sax.SAXException se) {\r
-                       System.out.println("SaxException. You may need to include Xerces in your classpath. " +\r
+        } catch (org.xml.sax.SAXException se) {\r
+            System.out.println("SaxException. You may need to include Xerces in your classpath. " +\r
                     "See documentation for details");\r
-                       se.printStackTrace();\r
-               }catch(javax.xml.parsers.ParserConfigurationException pce) {\r
-                       pce.printStackTrace();\r
-               }catch (IOException ie) {\r
-                       ie.printStackTrace();\r
-               }\r
-       }\r
+            se.printStackTrace();\r
+        } catch (javax.xml.parsers.ParserConfigurationException pce) {\r
+            pce.printStackTrace();\r
+        } catch (IOException ie) {\r
+            ie.printStackTrace();\r
+        }\r
+    }\r
 \r
     private boolean addContent = false;\r
     private boolean addHeaderInfo = false;\r
-    private boolean doNotIndex=false;\r
+    private boolean doNotIndex = false;\r
     private int divCount = 0;\r
-       //SAX parser Event Handlers:\r
-       public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) throws org.xml.sax.SAXException {\r
+    //SAX parser Event Handlers:\r
+\r
+    public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) throws org.xml.sax.SAXException {\r
 \r
-               //dwc: capture current element name\r
-               // START OXYGEN PATCH, add current element in stack\r
-               stack.add(qName);\r
-               // END OXYGEN PATCH\r
-               currentElName = qName;\r
+        //dwc: capture current element name\r
+        // START OXYGEN PATCH, add current element in stack\r
+        stack.add(qName);\r
+        // END OXYGEN PATCH\r
+        currentElName = qName;\r
 \r
-               // dwc: Adding contents of some meta tags to the index\r
-               if((qName.equalsIgnoreCase("meta")) ) {\r
+        // dwc: Adding contents of some meta tags to the index\r
+        if ((qName.equalsIgnoreCase("meta"))) {\r
             addHeaderInfo = true;\r
-                       String attrName = attributes.getValue("name");\r
-                       // OXYGEN PATCH START EXM-20576 - add scoring for keywords\r
-                       if(attrName != null && (attrName.equalsIgnoreCase("keywords")\r
-                               || attrName.equalsIgnoreCase("description")\r
-                               || attrName.equalsIgnoreCase("indexterms")\r
-                               )){\r
-                           if (attrName.equalsIgnoreCase("keywords")) {\r
-                               String[] keywords = attributes.getValue("content").split(", ");\r
+            String attrName = attributes.getValue("name");\r
+            // OXYGEN PATCH START EXM-20576 - add scoring for keywords\r
+            if (attrName != null && (attrName.equalsIgnoreCase("keywords")\r
+                    || attrName.equalsIgnoreCase("description")\r
+                    || attrName.equalsIgnoreCase("indexterms")\r
+            )) {\r
+                if (attrName.equalsIgnoreCase("keywords")) {\r
+                    String[] keywords = attributes.getValue("content").split(", ");\r
                     for (String keyword : keywords) {\r
                         strbf.append(" ").append(keyword).append("@@@elem_meta_keywords@@@ ");\r
                     }\r
-                           } else if (attrName.equalsIgnoreCase("indexterms")) {\r
-                               String[] indexterms = attributes.getValue("content").split(", ");\r
+                } else if (attrName.equalsIgnoreCase("indexterms")) {\r
+                    String[] indexterms = attributes.getValue("content").split(", ");\r
                     for (String indexterm : indexterms) {\r
                         strbf.append(" ").append(indexterm).append("@@@elem_meta_indexterms@@@ ");\r
                     }\r
-                           } else {\r
-                               strbf.append(" ").append(attributes.getValue("content") ).append(" ");\r
-                           }\r
-                       }\r
-                       // OXYGEN PATCH END EXM-20576 - add scoring for indexterms\r
-                       // dwc: adding this to make the docbook <abstract> element\r
-                       // (which becomes <meta name="description".../> in html)\r
-                       // into the brief description that shows up in search\r
-                       // results.\r
-                       if(attrName != null && (attrName.equalsIgnoreCase("description"))){\r
-                               fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(attributes.getValue("content").replace('\n', ' ')));\r
-                       }\r
-               } // dwc: End addition\r
+                } else {\r
+                    strbf.append(" ").append(attributes.getValue("content")).append(" ");\r
+                }\r
+            }\r
+            // OXYGEN PATCH END EXM-20576 - add scoring for indexterms\r
+            // dwc: adding this to make the docbook <abstract> element\r
+            // (which becomes <meta name="description".../> in html)\r
+            // into the brief description that shows up in search\r
+            // results.\r
+            if (attrName != null && (attrName.equalsIgnoreCase("description"))) {\r
+                fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(attributes.getValue("content").replace('\n', ' ')));\r
+            }\r
+        } // dwc: End addition\r
 \r
         // dwc: commenting out DITA specific lines\r
-               if((qName.equalsIgnoreCase("title")) || (qName.equalsIgnoreCase("shortdesc"))) {\r
-                       tempVal = new StringBuffer();\r
-               }\r
+        if ((qName.equalsIgnoreCase("title")) || (qName.equalsIgnoreCase("shortdesc"))) {\r
+            tempVal = new StringBuffer();\r
+        }\r
 \r
         addHeaderInfo = qName.equalsIgnoreCase("meta") || qName.equalsIgnoreCase("title") || qName.equalsIgnoreCase("shortdesc");\r
 \r
         String elementId = attributes.getValue("id");\r
-        if("content".equals(elementId)) addContent = true;\r
+        if ("content".equals(elementId)) addContent = true;\r
 \r
-        if(addContent) {\r
+        if (addContent) {\r
             //counts div tags starting from "content" div(inclusive). This will be used to track the end of content "div" tag.\r
             //see #endElement()\r
-            if(qName.equalsIgnoreCase("div")){\r
+            if (qName.equalsIgnoreCase("div")) {\r
                 divCount++;\r
             }\r
 \r
@@ -191,74 +193,78 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
             String accessKey = attributes.getValue("accesskey");\r
             doNotIndex = accessKey != null && ("n".equals(accessKey) || "p".equals(accessKey) || "h".equals(accessKey));\r
         }\r
-               strbf.append(" ");\r
-       }\r
+        strbf.append(" ");\r
+    }\r
 \r
-       //triggers when there's character data inside an element.\r
-       public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException {\r
+    //triggers when there's character data inside an element.\r
 \r
-               // index certain elements. E.g. Use this to implement a\r
-               // "titles only" index,\r
+    public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException {\r
+\r
+        // index certain elements. E.g. Use this to implement a\r
+        // "titles only" index,\r
 \r
         //OXYGEN PATCH, gather more keywords.\r
-               if(\r
+        if (\r
 //                             (addContent || addHeaderInfo) && \r
-                               !doNotIndex && !currentElName.equalsIgnoreCase("script")){\r
-                       String text = new String(ch,start,length);\r
-                       // START OXYGEN PATCH, append a marker after each word\r
-                       // The marker is used to compute the scoring\r
-                       // Create the marker\r
-                       String originalText = text.replaceAll("\\s+"," ");\r
-                       text = text.trim();\r
-                       // Do a minimal clean\r
-                       text = minimalClean(text, null, null);\r
-                       text = text.replaceAll("\\s+"," ");\r
-                       String marker = "@@@elem_" + stack.peek() + "@@@ ";\r
-                       Matcher m = Pattern.compile("(\\w|-|:)+").matcher(text);\r
-                       if (text.trim().length() > 0 && m.find()) {\r
-                           String copyText = new String(originalText);\r
-                           text = duplicateWords(copyText, text, "-");\r
-                           copyText = new String(originalText);\r
-                           text = duplicateWords(copyText, text, ":");\r
-                           copyText = new String(originalText);\r
-                           text = duplicateWords(copyText, text, ".");\r
-                               // Replace whitespace with the marker\r
-                               text = text.replace(" ", marker);\r
-                               text = text + marker;\r
-                       }\r
-                       // END OXYGEN PATCH\r
-                       strbf.append(text);\r
+                !doNotIndex && !currentElName.equalsIgnoreCase("script")) {\r
+            String text = new String(ch, start, length);\r
+            // START OXYGEN PATCH, append a marker after each word\r
+            // The marker is used to compute the scoring\r
+            // Create the marker\r
+            String originalText = text.replaceAll("\\s+", " ");\r
+            text = text.trim();\r
+            // Do a minimal clean\r
+            text = minimalClean(text, null, null);\r
+            text = text.replaceAll("\\s+", " ");\r
+            String marker = "@@@elem_" + stack.peek() + "@@@ ";\r
+            Matcher m = Pattern.compile("(\\w|-|:)+").matcher(text);\r
+            if (text.trim().length() > 0 && m.find()) {\r
+                String copyText = new String(originalText);\r
+                text = duplicateWords(copyText, text, "-");\r
+                copyText = new String(originalText);\r
+                text = duplicateWords(copyText, text, ":");\r
+                copyText = new String(originalText);\r
+                text = duplicateWords(copyText, text, ".");\r
+                // Replace whitespace with the marker\r
+                text = text.replace(" ", marker);\r
+                text = text + marker;\r
+            }\r
+            // END OXYGEN PATCH\r
+            strbf.append(text);\r
 //                     System.out.println("=== marked text: " + text);\r
-                       // START OXYGEN PATCH, append the original text\r
-                       if (tempVal != null) { tempVal.append(originalText);}\r
-                       // END OXYGEN PATCH\r
-               }\r
-       }\r
-\r
-       // START OXYGEN PATCH EXM-20414\r
-       private String duplicateWords(String sourceText, String acumulator, String separator) {\r
+            // START OXYGEN PATCH, append the original text\r
+            if (tempVal != null) {\r
+                tempVal.append(originalText);\r
+            }\r
+            // END OXYGEN PATCH\r
+        }\r
+    }\r
+\r
+    // START OXYGEN PATCH EXM-20414\r
+\r
+    private String duplicateWords(String sourceText, String acumulator, String separator) {\r
 //         System.out.println("sourceText: " + sourceText + "   separator: " + separator);\r
-           int index = sourceText.indexOf(separator);\r
-           while (index >= 0) {\r
-               int indexSpaceAfter = sourceText.indexOf(" ", index);\r
-               String substring = null;\r
-               if (indexSpaceAfter >= 0) {\r
-                   substring = sourceText.substring(0, indexSpaceAfter);\r
-                   sourceText = sourceText.substring(indexSpaceAfter);\r
-               } else {\r
-                   substring = sourceText;\r
-                   sourceText = "";\r
-               }\r
-\r
-               int indexSpaceBefore = substring.lastIndexOf(" ");\r
-               if (indexSpaceBefore >= 0) {\r
-                   substring = substring.substring(indexSpaceBefore + 1);\r
-               }\r
-               if (separator.indexOf(".") >= 0) {\r
-                   separator = separator.replaceAll("\\.", "\\\\.");\r
+        int index = sourceText.indexOf(separator);\r
+        while (index >= 0) {\r
+            int indexSpaceAfter = sourceText.indexOf(" ", index);\r
+            String substring = null;\r
+            if (indexSpaceAfter >= 0) {\r
+                substring = sourceText.substring(0, indexSpaceAfter);\r
+                sourceText = sourceText.substring(indexSpaceAfter);\r
+            } else {\r
+                substring = sourceText;\r
+                sourceText = "";\r
+            }\r
+\r
+            int indexSpaceBefore = substring.lastIndexOf(" ");\r
+            if (indexSpaceBefore >= 0) {\r
+                substring = substring.substring(indexSpaceBefore + 1);\r
+            }\r
+            if (separator.indexOf(".") >= 0) {\r
+                separator = separator.replaceAll("\\.", "\\\\.");\r
 //                 System.out.println("++++++++++ separator: " + separator);\r
-               }\r
-               String[] tokens = substring.split(separator);\r
+            }\r
+            String[] tokens = substring.split(separator);\r
 \r
             for (String token : tokens) {\r
                 acumulator = acumulator + " " + token;\r
@@ -266,169 +272,167 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
             }\r
 \r
             index = sourceText.indexOf(separator);\r
-           }\r
-\r
-           return acumulator;\r
-       }\r
-       // END OXYGEN PATCH EXM-20414\r
-       public void endElement(String uri, String localName, String qName) throws org.xml.sax.SAXException {\r
-               // START OXYGEN PATCH, remove element from stack\r
-               stack.pop();\r
-               // END OXYGEN PATCH\r
-               if(qName.equalsIgnoreCase("title")) {\r
-                       //add it to the list\r
-                       //myEmpls.add(tempEmp);\r
-                       fileDesc.setTitle(BlankRemover.rmWhiteSpace(tempVal.toString()));\r
-                       tempVal = null;\r
-               }\r
-               else if (shortdescBool) {\r
-                       shortTagCpt --;\r
-                       if (shortTagCpt == 0) {\r
-                               String shortdesc = tempVal.toString().replace('\n', ' ');\r
-                               if(shortdesc.trim().length() > 0) {\r
-                                       fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(shortdesc));\r
-                               }\r
-                       tempVal = null;\r
-                       shortdescBool = false;\r
-                       }\r
-               }\r
-\r
-        if(qName.equalsIgnoreCase("div") && addContent){\r
+        }\r
+\r
+        return acumulator;\r
+    }\r
+    // END OXYGEN PATCH EXM-20414\r
+\r
+    public void endElement(String uri, String localName, String qName) throws org.xml.sax.SAXException {\r
+        // START OXYGEN PATCH, remove element from stack\r
+        stack.pop();\r
+        // END OXYGEN PATCH\r
+        if (qName.equalsIgnoreCase("title")) {\r
+            //add it to the list\r
+            //myEmpls.add(tempEmp);\r
+            fileDesc.setTitle(BlankRemover.rmWhiteSpace(tempVal.toString()));\r
+            tempVal = null;\r
+        } else if (shortdescBool) {\r
+            shortTagCpt--;\r
+            if (shortTagCpt == 0) {\r
+                String shortdesc = tempVal.toString().replace('\n', ' ');\r
+                if (shortdesc.trim().length() > 0) {\r
+                    fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(shortdesc));\r
+                }\r
+                tempVal = null;\r
+                shortdescBool = false;\r
+            }\r
+        }\r
+\r
+        if (qName.equalsIgnoreCase("div") && addContent) {\r
             divCount--;\r
             if (divCount == 0) {\r
                 addContent = false;\r
             }\r
         }\r
-       }\r
+    }\r
+\r
+    public void processingInstruction(String target, String data) throws org.xml.sax.SAXException {\r
+        //do nothing\r
 \r
-       public void processingInstruction(String target, String data) throws org.xml.sax.SAXException {\r
-               //do nothing\r
+    }\r
 \r
-       }\r
+    /*public InputSource resolveEntity(String publicId, String systemId)\r
+         throws IOException, SAXException {\r
 \r
-       /*public InputSource resolveEntity(String publicId, String systemId)\r
-               throws IOException, SAXException {\r
+         // use the catalog to solve the doctype\r
+         System.out.println("entities " + publicId + systemId);\r
+         return null;\r
+     }*/\r
 \r
-               // use the catalog to solve the doctype\r
-               System.out.println("entities " + publicId + systemId);\r
-               return null;\r
-       }*/\r
-       public org.xml.sax.InputSource resolveEntity(String publicId, String systemId)\r
-       throws org.xml.sax.SAXException, IOException {\r
-               //System.out.println("Entities " + publicId + "and" + systemId);\r
-               // use dita ot (dost.jar) for resolving dtd paths using the calatog\r
+    public org.xml.sax.InputSource resolveEntity(String publicId, String systemId)\r
+            throws org.xml.sax.SAXException, IOException {\r
+        //System.out.println("Entities " + publicId + "and" + systemId);\r
+        // use dita ot (dost.jar) for resolving dtd paths using the calatog\r
 \r
-       return null;\r
-       }\r
+        return null;\r
+    }\r
 \r
     /**\r
      * Removes the validation in html files, such as xml version and DTDs\r
+     *\r
      * @param file the html file\r
      * @return int: returns 0 if no IOException occurs, else 1.\r
      */\r
-       public String RemoveValidationPI (File file) {\r
+    public String RemoveValidationPI(File file) {\r
         StringBuilder sb = new StringBuilder();\r
-         //The content that needs to be indexed after removing validation will be written to sb. This StringBuilder will\r
-         //  be the source to index the content of the particular html page.\r
-               try {\r
-                       BufferedReader br = new BufferedReader(\r
-                       new InputStreamReader(\r
-                        new FileInputStream(file),"UTF-8"));\r
-\r
-                       while(true)\r
-                       {\r
-                               int i1, i2;\r
-                               boolean ok = true;\r
-                               try {\r
-\r
-                                       String line = br.readLine();\r
-\r
-                                       if (line == null) {\r
-                                               break;\r
-                                       }\r
-                                       //ok = line.matches("(.)*\\x26nbsp\\x3B(.)*");\r
-\r
-                                       line = line.replaceAll("\\x26nbsp\\x3B", "&#160;");\r
-\r
-                                       if (!line.contains("<!DOCTYPE html PUBLIC")) {\r
-                                               //dwc: This doesn't really apply to me. I already omit the xml pi for other reasons.\r
-                                               if (line.contains("<?xml version")) {\r
-                                                       line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");\r
-                                               }\r
+        //The content that needs to be indexed after removing validation will be written to sb. This StringBuilder will\r
+        //  be the source to index the content of the particular html page.\r
+        try {\r
+            BufferedReader br = new BufferedReader(\r
+                    new InputStreamReader(\r
+                            new FileInputStream(file), "UTF-8"));\r
+\r
+            while (true) {\r
+                int i1, i2;\r
+                boolean ok = true;\r
+                try {\r
+\r
+                    String line = br.readLine();\r
+\r
+                    if (line == null) {\r
+                        break;\r
+                    }\r
+                    //ok = line.matches("(.)*\\x26nbsp\\x3B(.)*");\r
+\r
+                    line = line.replaceAll("\\x26nbsp\\x3B", "&#160;");\r
+\r
+                    if (!line.contains("<!DOCTYPE html PUBLIC")) {\r
+                        //dwc: This doesn't really apply to me. I already omit the xml pi for other reasons.\r
+                        if (line.contains("<?xml version")) {\r
+                            line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E", "\n");\r
+                        }\r
 \r
                         sb.append(line).append("\n");\r
-                                       } else\r
-                                       {\r
-                                               //dwc: What is this trying to do? Nuke the DOCTYPE? Why?\r
-                                               i1 = line.indexOf("<!DOCTYPE");\r
-                                               i2 = line.indexOf(">", i1);\r
-                                               while (i2 < 0) {\r
-\r
-                                                       line = line.concat(br.readLine());\r
-                                                       i2 = line.indexOf(">", i1);\r
-                                               }\r
-                                               String temp = line.substring(i1, i2);\r
-\r
-                                               //ok = line.matches("(.)*\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E(.)*");\r
-                                               if (line.contains("<?xml version")) {\r
-                                                       line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");\r
-                                               }\r
-                                               line = line.replaceAll("\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E","\n");\r
+                    } else {\r
+                        //dwc: What is this trying to do? Nuke the DOCTYPE? Why?\r
+                        i1 = line.indexOf("<!DOCTYPE");\r
+                        i2 = line.indexOf(">", i1);\r
+                        while (i2 < 0) {\r
+\r
+                            line = line.concat(br.readLine());\r
+                            i2 = line.indexOf(">", i1);\r
+                        }\r
+                        String temp = line.substring(i1, i2);\r
+\r
+                        //ok = line.matches("(.)*\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E(.)*");\r
+                        if (line.contains("<?xml version")) {\r
+                            line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E", "\n");\r
+                        }\r
+                        line = line.replaceAll("\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E", "\n");\r
 \r
                         sb.append(line);\r
-                                       }\r
-                               }\r
-                               catch (IOException e)\r
-                               {\r
-                                       break;\r
-                               }\r
-                       }\r
-\r
-                       br.close();\r
-               }\r
-               catch (IOException e)\r
-               {\r
-                       return null;\r
-               }\r
-\r
-               return sb.toString(); // return status\r
-\r
-       }\r
-\r
-       // START OXYGEN PATCH, moved from subclass\r
-       protected String minimalClean(String str, StringBuffer tempStrBuf, StringBuffer tempCharBuf) {\r
-               String tempPunctuation = null;\r
-               if (tempCharBuf!= null) {\r
-                       tempPunctuation = new String(tempCharBuf);\r
-               }\r
-\r
-               str = str.replaceAll("\\s+", " ");\r
-               str = str.replaceAll("->", " ");\r
-               str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " ");\r
-               str = str.replaceAll(IndexerConstants.EUPUNCTUATION2, " ");\r
-               str = str.replaceAll(IndexerConstants.JPPUNCTUATION1, " ");\r
-               str = str.replaceAll(IndexerConstants.JPPUNCTUATION2, " ");\r
-               str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " ");\r
-               if (tempPunctuation != null && tempPunctuation.length() > 0)\r
-               {\r
-                       str = str.replaceAll(tempPunctuation, " ");\r
-               }\r
-\r
-               if (tempStrBuf != null) {\r
-                       //remove useless words\r
-                       str = str.replaceAll(tempStrBuf.toString(), " ");\r
-               }\r
-\r
-               // Redo punctuation after removing some words: (TODO: useful?)\r
-               str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " ");\r
-               str = str.replaceAll(IndexerConstants.EUPUNCTUATION2, " ");\r
-               str = str.replaceAll(IndexerConstants.JPPUNCTUATION1, " ");\r
-               str = str.replaceAll(IndexerConstants.JPPUNCTUATION2, " ");\r
-               str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " ");\r
-               if (tempPunctuation != null && tempPunctuation.length() > 0)\r
-               {\r
-                       str = str.replaceAll(tempPunctuation, " ");\r
-               }               return str;\r
-       }\r
-       // END OXYGEN PATCH\r
+                    }\r
+                }\r
+                catch (IOException e) {\r
+                    break;\r
+                }\r
+            }\r
+\r
+            br.close();\r
+        }\r
+        catch (IOException e) {\r
+            return null;\r
+        }\r
+\r
+        return sb.toString(); // return status\r
+\r
+    }\r
+\r
+    // START OXYGEN PATCH, moved from subclass\r
+\r
+    protected String minimalClean(String str, StringBuffer tempStrBuf, StringBuffer tempCharBuf) {\r
+        String tempPunctuation = null;\r
+        if (tempCharBuf != null) {\r
+            tempPunctuation = new String(tempCharBuf);\r
+        }\r
+\r
+        str = str.replaceAll("\\s+", " ");\r
+        str = str.replaceAll("->", " ");\r
+        str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " ");\r
+        str = str.replaceAll(IndexerConstants.EUPUNCTUATION2, " ");\r
+        str = str.replaceAll(IndexerConstants.JPPUNCTUATION1, " ");\r
+        str = str.replaceAll(IndexerConstants.JPPUNCTUATION2, " ");\r
+        str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " ");\r
+        if (tempPunctuation != null && tempPunctuation.length() > 0) {\r
+            str = str.replaceAll(tempPunctuation, " ");\r
+        }\r
+\r
+        if (tempStrBuf != null) {\r
+            //remove useless words\r
+            str = str.replaceAll(tempStrBuf.toString(), " ");\r
+        }\r
+\r
+        // Redo punctuation after removing some words: (TODO: useful?)\r
+        str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " ");\r
+        str = str.replaceAll(IndexerConstants.EUPUNCTUATION2, " ");\r
+        str = str.replaceAll(IndexerConstants.JPPUNCTUATION1, " ");\r
+        str = str.replaceAll(IndexerConstants.JPPUNCTUATION2, " ");\r
+        str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " ");\r
+        if (tempPunctuation != null && tempPunctuation.length() > 0) {\r
+            str = str.replaceAll(tempPunctuation, " ");\r
+        }\r
+        return str;\r
+    }\r
+    // END OXYGEN PATCH\r
 }\r
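The parser above appends an "@@@elem_<name>@@@" marker after every indexed word so that SaxHTMLIndex (next file) can weight the word by the HTML element it came from. Below is a minimal, hypothetical sketch of decoding one such token, mirroring the substring logic of getWordAndScoring in that class; the token value and the two scoring values shown (50 for h1, 1 for normal text) are taken from this diff, everything else is invented for illustration.

public class MarkerDemo {
    public static void main(String[] args) {
        // A token as produced by SaxDocFileParser.characters(): word + element marker.
        String token = "docbook@@@elem_h1@@@";
        String word = token.substring(0, token.indexOf("@@@"));
        String elementName = token.substring(
                token.indexOf("@@@elem_") + "@@@elem_".length(), token.lastIndexOf("@@@"));
        // Scoring mirrors SCORING_FOR_H1 / SCORING_FOR_NORMAL_TEXT in SaxHTMLIndex.
        int scoring = "h1".equalsIgnoreCase(elementName) ? 50 : 1;
        System.out.println(word + " -> " + scoring); // prints: docbook -> 50
    }
}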
xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java
index c57bbf7cd2b258b388a8c2c2e54924425d5cc81a..ba436e6d55197c8c4d4a53a32ea8122a30675d6c 100755 (executable)
@@ -15,6 +15,7 @@ import com.nexwave.stemmer.snowball.ext.EnglishStemmer;
 import com.nexwave.stemmer.snowball.ext.FrenchStemmer;\r
 import com.nexwave.stemmer.snowball.ext.GermanStemmer;\r
 \r
+//client-side support is yet to come for these stemmers\r
 import com.nexwave.stemmer.snowball.ext.danishStemmer;\r
 import com.nexwave.stemmer.snowball.ext.dutchStemmer;\r
 import com.nexwave.stemmer.snowball.ext.finnishStemmer;\r
@@ -43,99 +44,106 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute;
  * NOTE: This indexes only the content under a tag with ID "content".\r
  * Wrap html content with a div tag with id "content" to index relevant parts of your page.\r
  *\r
- * @version 2.0 2010\r
- *\r
  * @author N. Quaine\r
  * @author Kasun Gajasinghe <http://kasunbg.blogspot.com>\r
+ * @version 2.0 2010\r
  */\r
-public class SaxHTMLIndex extends SaxDocFileParser{\r
+public class SaxHTMLIndex extends SaxDocFileParser {\r
 \r
     //KasunBG: apparently tempDico stores all the keywords and a pointer to the files containing the index in a Map\r
     //example: ("keyword1", "0,2,4"), ("docbook", "1,2,5") \r
-       private Map<String,String> tempDico;\r
-       private int i = 0;\r
-       private ArrayList <String> cleanUpList = null;\r
-       private ArrayList <String> cleanUpPunctuation = null;\r
-\r
-       // START OXYGEN PATCH, scoring for HTML elements\r
-       private int SCORING_FOR_H1 = 50;\r
-       private int SCORING_FOR_H2 = 45;\r
-       private int SCORING_FOR_H3 = 40;\r
-       private int SCORING_FOR_H4 = 35;\r
-       private int SCORING_FOR_H5 = 30;\r
-       private int SCORING_FOR_H6 = 25;\r
-       private int SCORING_FOR_BOLD = 5;\r
-       private int SCORING_FOR_ITALIC = 3;\r
-       private int SCORING_FOR_NORMAL_TEXT = 1;\r
-       private int SCORING_FOR_KEYWORD = 100;\r
-       private int SCORING_FOR_INDEXTERM = 75;\r
-       \r
-       /**\r
-        * The list with the word and scoring object\r
-        */\r
-       private List<WordAndScoring> wsList = null;\r
-\r
-       /**\r
-        * Used for Oxygen TestCases\r
-        * @return the wsList\r
-        */\r
-       public List<WordAndScoring> getWsList() {\r
-               return wsList;\r
-       }\r
-       // END OXYGEN PATCH\r
-       //methods\r
-       /**\r
-        * Constructor\r
-        */\r
-       public SaxHTMLIndex () {\r
-               super();\r
-       }\r
-       /**\r
-        * Constructor\r
+    private Map<String, String> tempDico;\r
+    private int i = 0;\r
+    private ArrayList<String> cleanUpList = null;\r
+    private ArrayList<String> cleanUpPunctuation = null;\r
+\r
+    // START OXYGEN PATCH, scoring for HTML elements\r
+    private int SCORING_FOR_H1 = 50;\r
+    private int SCORING_FOR_H2 = 45;\r
+    private int SCORING_FOR_H3 = 40;\r
+    private int SCORING_FOR_H4 = 35;\r
+    private int SCORING_FOR_H5 = 30;\r
+    private int SCORING_FOR_H6 = 25;\r
+    private int SCORING_FOR_BOLD = 5;\r
+    private int SCORING_FOR_ITALIC = 3;\r
+    private int SCORING_FOR_NORMAL_TEXT = 1;\r
+    private int SCORING_FOR_KEYWORD = 100;\r
+    private int SCORING_FOR_INDEXTERM = 75;\r
+\r
+    /**\r
+     * The list with the word and scoring object\r
+     */\r
+    private List<WordAndScoring> wsList = null;\r
+\r
+    /**\r
+     * Used for Oxygen TestCases\r
+     *\r
+     * @return the wsList\r
+     */\r
+    public List<WordAndScoring> getWsList() {\r
+        return wsList;\r
+    }\r
+    // END OXYGEN PATCH\r
+    //methods\r
+\r
+    /**\r
+     * Constructor\r
+     */\r
+    public SaxHTMLIndex() {\r
+        super();\r
+    }\r
+\r
+    /**\r
+     * Constructor\r
+     *\r
      * @param cleanUpStrings\r
      */\r
-       public SaxHTMLIndex (ArrayList <String> cleanUpStrings) {\r
-               super();\r
-               cleanUpList = cleanUpStrings;\r
-       }\r
-       /**\r
-        * Constructor\r
+    public SaxHTMLIndex(ArrayList<String> cleanUpStrings) {\r
+        super();\r
+        cleanUpList = cleanUpStrings;\r
+    }\r
+\r
+    /**\r
+     * Constructor\r
+     *\r
      * @param cleanUpStrings\r
      * @param cleanUpChars\r
      */\r
-       public SaxHTMLIndex (ArrayList <String> cleanUpStrings, ArrayList <String> cleanUpChars) {\r
-               super();\r
-               cleanUpList = cleanUpStrings;\r
-               cleanUpPunctuation = cleanUpChars;\r
-       }\r
-\r
-       /**\r
-        * Initializer\r
+    public SaxHTMLIndex(ArrayList<String> cleanUpStrings, ArrayList<String> cleanUpChars) {\r
+        super();\r
+        cleanUpList = cleanUpStrings;\r
+        cleanUpPunctuation = cleanUpChars;\r
+    }\r
+\r
+    /**\r
+     * Initializer\r
+     *\r
      * @param tempMap\r
      */\r
-       public int init(Map<String,String> tempMap){\r
-               tempDico = tempMap;\r
-               return 0;\r
-       }\r
-\r
-       /**\r
-        * Parses the file to extract all the words for indexing and\r
-        * some data characterizing the file.\r
-        * @param file contains the fullpath of the document to parse\r
+    public int init(Map<String, String> tempMap) {\r
+        tempDico = tempMap;\r
+        return 0;\r
+    }\r
+\r
+    /**\r
+     * Parses the file to extract all the words for indexing and\r
+     * some data characterizing the file.\r
+     *\r
+     * @param file            contains the fullpath of the document to parse\r
      * @param indexerLanguage this will be used to tell the program which stemmer to be used.\r
-     * @param stem if true then generate js files with words stemmed\r
-        * @return a DitaFileInfo object filled with data describing the file\r
-        */\r
-       public DocFileInfo runExtractData(File file, String indexerLanguage, boolean stem) {\r
-               //initialization\r
-               fileDesc = new DocFileInfo(file);\r
-               strbf = new StringBuffer("");\r
-\r
-               // Fill strbf by parsing the file\r
-               parseDocument(file);\r
-\r
-               String str = cleanBuffer(strbf);\r
-        str = str.replaceAll("\\s+"," ");   //there's still redundant spaces in the middle\r
+     * @param stem            if true then generate js files with words stemmed\r
+     * @return a DitaFileInfo object filled with data describing the file\r
+     */\r
+    public DocFileInfo runExtractData(File file, String indexerLanguage, boolean stem) {\r
+        //initialization\r
+        fileDesc = new DocFileInfo(file);\r
+        strbf = new StringBuffer("");\r
+\r
+        // Fill strbf by parsing the file\r
+        parseDocument(file);\r
+\r
+        String str = cleanBuffer(strbf);\r
+        str = str.replaceAll("\\s+", " ");   //there's still redundant spaces in the middle\r
 //             System.out.println(file.toString()+" "+ str +"\n");\r
         // START OXYGEN PATCH\r
 //             String[] items = str.split("\\s");      //contains all the words in the array\r
@@ -151,12 +159,12 @@ public class SaxHTMLIndex extends SaxDocFileParser{
         // START OXYGEN PATCH, create the words and scoring list\r
 //        String[] tokenizedItems;\r
         // END OXYGEN PATCH\r
-        if(indexerLanguage.equalsIgnoreCase("ja") || indexerLanguage.equalsIgnoreCase("zh")\r
-                || indexerLanguage.equalsIgnoreCase("ko")){\r
-                LinkedList<String> tokens = new LinkedList<String>();\r
-            try{\r
-               //EXM-21501 Oxygen patch, replace the extra "@@@"s.\r
-               str = str.replaceAll("@@@([^\\s]*)@@@", "");\r
+        if (indexerLanguage.equalsIgnoreCase("ja") || indexerLanguage.equalsIgnoreCase("zh")\r
+                || indexerLanguage.equalsIgnoreCase("ko")) {\r
+            LinkedList<String> tokens = new LinkedList<String>();\r
+            try {\r
+                //EXM-21501 Oxygen patch, replace the extra "@@@"s.\r
+                str = str.replaceAll("@@@([^\\s]*)@@@", "");\r
                 CJKAnalyzer analyzer = new CJKAnalyzer(org.apache.lucene.util.Version.LUCENE_30);\r
                 Reader reader = new StringReader(str);\r
                 TokenStream stream = analyzer.tokenStream("", reader);\r
@@ -179,29 +187,29 @@ public class SaxHTMLIndex extends SaxDocFileParser{
                         }\r
 \r
                     }\r
-                               if (!found) {\r
-                                       wsList.add(ws);\r
-                               }\r
+                    if (!found) {\r
+                        wsList.add(ws);\r
+                    }\r
                 }\r
                 // START OXYGEN PATCH\r
                 //tokenizedItems = tokens.toArray(new String[tokens.size()]);\r
                 // END OXYGEN PATCH\r
 \r
-            }catch (IOException ex){\r
-               // START OXYGEN PATCH\r
+            } catch (IOException ex) {\r
+                // START OXYGEN PATCH\r
 //                tokenizedItems = items;\r
-               // END OXYGEN PATCH\r
+                // END OXYGEN PATCH\r
                 System.out.println("Error tokenizing content using CJK Analyzer. IOException");\r
                 ex.printStackTrace();\r
             }\r
         } else {\r
             SnowballStemmer stemmer;\r
-            if(indexerLanguage.equalsIgnoreCase("en")){\r
-                 stemmer = new EnglishStemmer();\r
-            } else if (indexerLanguage.equalsIgnoreCase("de")){\r
-                stemmer= new GermanStemmer();\r
-            } else if (indexerLanguage.equalsIgnoreCase("fr")){\r
-                stemmer= new FrenchStemmer();\r
+            if (indexerLanguage.equalsIgnoreCase("en")) {\r
+                stemmer = new EnglishStemmer();\r
+            } else if (indexerLanguage.equalsIgnoreCase("de")) {\r
+                stemmer = new GermanStemmer();\r
+            } else if (indexerLanguage.equalsIgnoreCase("fr")) {\r
+                stemmer = new FrenchStemmer();\r
             } else {\r
                 stemmer = null;//Languages which stemming is not yet supported.So, No stemmers will be used.\r
             }\r
@@ -210,10 +218,10 @@ public class SaxHTMLIndex extends SaxDocFileParser{
             StringTokenizer st = new StringTokenizer(str, " ");\r
             // Tokenize the string and populate the words and scoring list\r
             while (st.hasMoreTokens()) {\r
-                       String token  = st.nextToken();\r
-                       WordAndScoring ws = getWordAndScoring(token, stemmer, stem);\r
-                       if (ws != null) {\r
-                               boolean found = false;\r
+                String token = st.nextToken();\r
+                WordAndScoring ws = getWordAndScoring(token, stemmer, stem);\r
+                if (ws != null) {\r
+                    boolean found = false;\r
                     for (WordAndScoring aWsList : wsList) {\r
                         // If the stem of the current word is already in list,\r
                         // do not add the word in the list, just recompute scoring\r
@@ -224,11 +232,11 @@ public class SaxHTMLIndex extends SaxDocFileParser{
                             break;\r
                         }\r
                     }\r
-                               if (!found) {\r
-                                       wsList.add(ws);\r
-                               }\r
-                       }                       \r
-               }        \r
+                    if (!found) {\r
+                        wsList.add(ws);\r
+                    }\r
+                }\r
+            }\r
 //            if(stemmer != null)             //If a stemmer available\r
 //                tokenizedItems = stemmer.doStem(items.toArray(new String[0]));\r
 //            else                            //if no stemmer available for the particular language\r
@@ -237,7 +245,7 @@ public class SaxHTMLIndex extends SaxDocFileParser{
 \r
         }\r
 \r
-       /* for(String stemmedItem: tokenizedItems){\r
+        /* for(String stemmedItem: tokenizedItems){\r
             System.out.print(stemmedItem+"| ");\r
         }*/\r
 \r
@@ -250,140 +258,142 @@ public class SaxHTMLIndex extends SaxDocFileParser{
         Iterator<WordAndScoring> it = wsList.iterator();\r
         WordAndScoring s;\r
         while (it.hasNext()) {\r
-               s = it.next();\r
-               // Do not add results from 'toc.html'\r
-               if (s != null && tempDico.containsKey(s.getStem())) {\r
-                       String temp = tempDico.get(s.getStem());\r
-                       temp = temp.concat(",").concat(Integer.toString(i))\r
-                       // Concat also the scoring for the stem\r
-                       .concat("*").concat(Integer.toString(s.getScoring()))\r
-                       ;\r
-                       //System.out.println("temp="+s+"="+temp);\r
-                       tempDico.put(s.getStem(), temp);\r
-               }else if (s != null) {\r
-                    String temp = null;\r
-                    temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring()));\r
-                    tempDico.put(s.getStem(), temp);\r
+            s = it.next();\r
+            // Do not add results from 'toc.html'\r
+            if (s != null && tempDico.containsKey(s.getStem())) {\r
+                String temp = tempDico.get(s.getStem());\r
+                temp = temp.concat(",").concat(Integer.toString(i))\r
+                        // Concat also the scoring for the stem\r
+                        .concat("*").concat(Integer.toString(s.getScoring()))\r
+                        ;\r
+                //System.out.println("temp="+s+"="+temp);\r
+                tempDico.put(s.getStem(), temp);\r
+            } else if (s != null) {\r
+                String temp = null;\r
+                temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring()));\r
+                tempDico.put(s.getStem(), temp);\r
             }\r
-               // END OXYGEN PATCH\r
+            // END OXYGEN PATCH\r
         }\r
 \r
         i++;\r
-               return fileDesc;\r
-       }\r
-\r
-       // START OXYGEN PATCH\r
-       /**\r
-        * Get the word, stem and scoring for the given token.\r
-        * @param token The token to parse.\r
-        * @param stemmer The stemmer.\r
-        * @param doStemming If true then generate js files with words stemmed.\r
-        * @return the word, stem and scoring for the given token.\r
-        */\r
-       private WordAndScoring getWordAndScoring(String token, SnowballStemmer stemmer, boolean doStemming) {\r
-               WordAndScoring wordScoring = null;\r
-               if (token.indexOf("@@@") != -1 && token.indexOf("@@@") != token.lastIndexOf("@@@")) {\r
-                       // Extract the word from token\r
-                       String word = token.substring(0, token.indexOf("@@@"));\r
-                       if (word.length() > 0) {\r
-                               // Extract the element name from token\r
-                               String elementName = token.substring(token.indexOf("@@@elem_") + "@@@elem_".length(), token.lastIndexOf("@@@"));\r
-                               // Compute scoring\r
-                               int scoring = SCORING_FOR_NORMAL_TEXT;\r
-                               if ("h1".equalsIgnoreCase(elementName)) {\r
-                                       scoring = SCORING_FOR_H1;\r
-                               } else if ("h2".equalsIgnoreCase(elementName)) {\r
-                                       scoring = SCORING_FOR_H2;\r
-                               } else if ("h3".equalsIgnoreCase(elementName)) {\r
-                                       scoring = SCORING_FOR_H3;\r
-                               } else if ("h4".equalsIgnoreCase(elementName)) {\r
-                                       scoring = SCORING_FOR_H4;\r
-                               }  else if ("h5".equalsIgnoreCase(elementName)) {\r
-                                       scoring = SCORING_FOR_H5;\r
-                               } else if ("h6".equalsIgnoreCase(elementName)) {\r
-                                       scoring = SCORING_FOR_H6;\r
-                               } else if ("em".equalsIgnoreCase(elementName)) {\r
-                                       scoring = SCORING_FOR_ITALIC;\r
-                               } else if ("strong".equalsIgnoreCase(elementName)) {\r
-                                       scoring = SCORING_FOR_BOLD;\r
-                               } else if ("meta_keywords".equalsIgnoreCase(elementName)) {\r
-                                       scoring = SCORING_FOR_KEYWORD;\r
-                               } else if ("meta_indexterms".equalsIgnoreCase(elementName)) {\r
-                                       scoring = SCORING_FOR_INDEXTERM;\r
-                               }\r
-                               // Get the stemmed word\r
-                               String stemWord = word;\r
-                               if (stemmer != null && doStemming) {\r
-                                        stemWord = stemmer.doStem(word);\r
-                               }\r
-                               wordScoring = new WordAndScoring(word, stemWord, scoring);\r
-                       }\r
-               } else {\r
-                       // The token contains only the word\r
-                       String stemWord = token;\r
-                       // Stem the word\r
-                       if (stemmer != null && doStemming) {\r
-                                stemWord = stemmer.doStem(token);\r
-                       }\r
-                       wordScoring = new WordAndScoring(token, stemWord, SCORING_FOR_NORMAL_TEXT);\r
-               }\r
-               return wordScoring;\r
-       }\r
-       // END OXYGEN PATCH\r
-\r
-       /**\r
-        * Cleans the string buffer containing all the text retrieved from\r
-        * the html file:  remove punctuation, clean white spaces, remove the words\r
-        * which you do not want to index.\r
-        * NOTE: You may customize this function:\r
-        * This version takes into account english and japanese. Depending on your\r
-        * needs,\r
-        * you may have to add/remove some characters/words through props files\r
-        *    or by modifying tte default code,\r
-        * you may want to separate the language processing (doc only in japanese,\r
-        * doc only in english, check the language metadata ...).\r
-        */\r
-       private String cleanBuffer (StringBuffer strbf) {\r
-               String str = strbf.toString().toLowerCase();\r
-               StringBuffer tempStrBuf = new StringBuffer("");\r
-               StringBuffer tempCharBuf = new StringBuffer("");\r
-               if ((cleanUpList == null) || (cleanUpList.isEmpty())){\r
-                       // Default clean-up\r
-\r
-                       // Should perhaps eliminate the words at the end of the table?\r
-                       tempStrBuf.append("(?i)\\bthe\\b|\\ba\\b|\\ban\\b|\\bto\\b|\\band\\b|\\bor\\b");//(?i) ignores the case\r
-                       tempStrBuf.append("|\\bis\\b|\\bare\\b|\\bin\\b|\\bwith\\b|\\bbe\\b|\\bcan\\b");\r
-                       tempStrBuf.append("|\\beach\\b|\\bhas\\b|\\bhave\\b|\\bof\\b|\\b\\xA9\\b|\\bnot\\b");\r
-                       tempStrBuf.append("|\\bfor\\b|\\bthis\\b|\\bas\\b|\\bit\\b|\\bhe\\b|\\bshe\\b");\r
-                       tempStrBuf.append("|\\byou\\b|\\bby\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");\r
-                       tempStrBuf.append("|\\b-or-\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");\r
+        return fileDesc;\r
+    }\r
+\r
+    // START OXYGEN PATCH\r
+\r
+    /**\r
+     * Get the word, stem and scoring for the given token.\r
+     *\r
+     * @param token      The token to parse.\r
+     * @param stemmer    The stemmer.\r
+     * @param doStemming If true, generate js files with stemmed words.\r
+     * @return the word, stem and scoring for the given token.\r
+     */\r
+    private WordAndScoring getWordAndScoring(String token, SnowballStemmer stemmer, boolean doStemming) {\r
+        WordAndScoring wordScoring = null;\r
+        if (token.indexOf("@@@") != -1 && token.indexOf("@@@") != token.lastIndexOf("@@@")) {\r
+            // Extract the word from token\r
+            String word = token.substring(0, token.indexOf("@@@"));\r
+            if (word.length() > 0) {\r
+                // Extract the element name from token\r
+                String elementName = token.substring(token.indexOf("@@@elem_") + "@@@elem_".length(), token.lastIndexOf("@@@"));\r
+                // Compute scoring\r
+                int scoring = SCORING_FOR_NORMAL_TEXT;\r
+                if ("h1".equalsIgnoreCase(elementName)) {\r
+                    scoring = SCORING_FOR_H1;\r
+                } else if ("h2".equalsIgnoreCase(elementName)) {\r
+                    scoring = SCORING_FOR_H2;\r
+                } else if ("h3".equalsIgnoreCase(elementName)) {\r
+                    scoring = SCORING_FOR_H3;\r
+                } else if ("h4".equalsIgnoreCase(elementName)) {\r
+                    scoring = SCORING_FOR_H4;\r
+                } else if ("h5".equalsIgnoreCase(elementName)) {\r
+                    scoring = SCORING_FOR_H5;\r
+                } else if ("h6".equalsIgnoreCase(elementName)) {\r
+                    scoring = SCORING_FOR_H6;\r
+                } else if ("em".equalsIgnoreCase(elementName)) {\r
+                    scoring = SCORING_FOR_ITALIC;\r
+                } else if ("strong".equalsIgnoreCase(elementName)) {\r
+                    scoring = SCORING_FOR_BOLD;\r
+                } else if ("meta_keywords".equalsIgnoreCase(elementName)) {\r
+                    scoring = SCORING_FOR_KEYWORD;\r
+                } else if ("meta_indexterms".equalsIgnoreCase(elementName)) {\r
+                    scoring = SCORING_FOR_INDEXTERM;\r
+                }\r
+                // Get the stemmed word\r
+                String stemWord = word;\r
+                if (stemmer != null && doStemming) {\r
+                    stemWord = stemmer.doStem(word);\r
+                }\r
+                wordScoring = new WordAndScoring(word, stemWord, scoring);\r
+            }\r
+        } else {\r
+            // The token contains only the word\r
+            String stemWord = token;\r
+            // Stem the word\r
+            if (stemmer != null && doStemming) {\r
+                stemWord = stemmer.doStem(token);\r
+            }\r
+            wordScoring = new WordAndScoring(token, stemWord, SCORING_FOR_NORMAL_TEXT);\r
+        }\r
+        return wordScoring;\r
+    }\r
+    // END OXYGEN PATCH\r
+\r
+    /**\r
+     * Cleans the string buffer containing all the text retrieved from\r
+     * the html file: removes punctuation, cleans white space, and removes the\r
+     * words which should not be indexed.\r
+     * NOTE: You may customize this function.\r
+     * This version takes into account English and Japanese. Depending on your\r
+     * needs, you may have to add/remove some characters/words through props\r
+     * files or by modifying the default code, or you may want to separate the\r
+     * language processing (doc only in Japanese, doc only in English, check\r
+     * the language metadata ...).\r
+     */\r
+    private String cleanBuffer(StringBuffer strbf) {\r
+        String str = strbf.toString().toLowerCase();\r
+        StringBuffer tempStrBuf = new StringBuffer("");\r
+        StringBuffer tempCharBuf = new StringBuffer("");\r
+        if ((cleanUpList == null) || (cleanUpList.isEmpty())) {\r
+            // Default clean-up\r
+\r
+            // Should perhaps eliminate the words at the end of the table?\r
+            tempStrBuf.append("(?i)\\bthe\\b|\\ba\\b|\\ban\\b|\\bto\\b|\\band\\b|\\bor\\b");//(?i) ignores the case\r
+            tempStrBuf.append("|\\bis\\b|\\bare\\b|\\bin\\b|\\bwith\\b|\\bbe\\b|\\bcan\\b");\r
+            tempStrBuf.append("|\\beach\\b|\\bhas\\b|\\bhave\\b|\\bof\\b|\\b\\xA9\\b|\\bnot\\b");\r
+            tempStrBuf.append("|\\bfor\\b|\\bthis\\b|\\bas\\b|\\bit\\b|\\bhe\\b|\\bshe\\b");\r
+            tempStrBuf.append("|\\byou\\b|\\bby\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");\r
+            tempStrBuf.append("|\\b-or-\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");\r
             tempStrBuf.append("|\\bI\\b|\\bme\\b|\\bmy\\b");\r
 \r
-                       str = str.replaceFirst("Copyright ï¿½ 1998-2007 NexWave Solutions.", " ");\r
+            str = str.replaceFirst("Copyright ï¿½ 1998-2007 NexWave Solutions.", " ");\r
 \r
 \r
-                       //nqu 25.01.2008 str = str.replaceAll("\\b.\\b|\\\\", " ");\r
-                       // remove contiguous white charaters\r
-                       //nqu 25.01.2008 str = str.replaceAll("\\s+", " ");\r
-               }else {\r
-                       // Clean-up using the props files\r
-                       tempStrBuf.append("\\ba\\b");\r
+            //nqu 25.01.2008 str = str.replaceAll("\\b.\\b|\\\\", " ");\r
+            // remove contiguous white characters\r
+            //nqu 25.01.2008 str = str.replaceAll("\\s+", " ");\r
+        } else {\r
+            // Clean-up using the props files\r
+            tempStrBuf.append("\\ba\\b");\r
             for (String aCleanUp : cleanUpList) {\r
-                tempStrBuf.append("|\\b").append(aCleanUp ).append("\\b");\r
+                tempStrBuf.append("|\\b").append(aCleanUp).append("\\b");\r
             }\r
-               }\r
-               if ((cleanUpPunctuation != null) && (!cleanUpPunctuation.isEmpty())){\r
-                       tempCharBuf.append("\\u3002");\r
+        }\r
+        if ((cleanUpPunctuation != null) && (!cleanUpPunctuation.isEmpty())) {\r
+            tempCharBuf.append("\\u3002");\r
             for (String aCleanUpPunctuation : cleanUpPunctuation) {\r
                 tempCharBuf.append("|").append(aCleanUpPunctuation);\r
             }\r
-               }\r
+        }\r
 \r
-               str = minimalClean(str, tempStrBuf, tempCharBuf);\r
-               return str;\r
-       }\r
+        str = minimalClean(str, tempStrBuf, tempCharBuf);\r
+        return str;\r
+    }\r
 \r
-       // OXYGEN PATCH, moved method in superclass\r
+    // OXYGEN PATCH, moved method in superclass\r
 //     private String minimalClean(String str, StringBuffer tempStrBuf, StringBuffer tempCharBuf) {\r
 //             String tempPunctuation = new String(tempCharBuf);\r
 //\r
@@ -413,6 +423,6 @@ public class SaxHTMLIndex extends SaxDocFileParser{
 //                     str = str.replaceAll(tempPunctuation, " ");\r
 //             }               return str;\r
 //     }\r
-       // END OXYGEN PATCH\r
+    // END OXYGEN PATCH\r
 \r
 }\r
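
For reference, a minimal sketch (not part of this commit; the token value and class name are invented) of how getWordAndScoring above splits a marked-up token of the form word@@@elem_<element>@@@ into its word and element name before one of the SCORING_FOR_* constants is assigned:

// Illustrative sketch only: mirrors the token parsing done in getWordAndScoring.
public class TokenParseSketch {
    public static void main(String[] args) {
        String token = "install@@@elem_h1@@@";   // hypothetical token produced by the parser
        String word = token.substring(0, token.indexOf("@@@"));
        String elementName = token.substring(
                token.indexOf("@@@elem_") + "@@@elem_".length(),
                token.lastIndexOf("@@@"));
        // "h1" maps to SCORING_FOR_H1, plain text tokens to SCORING_FOR_NORMAL_TEXT, etc.
        System.out.println(word + " / " + elementName);   // prints: install / h1
    }
}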
index 1aff3e93329450fbccaebc3596904fbc78ca71df..ed3ef269f54fa36b07751d8f4bbb0642d664bc50 100755 (executable)
@@ -4,9 +4,9 @@ package com.nexwave.nquindexer;
 */\r
 /**\r
  * For running tests with the indexertask.\r
- * \r
+ *\r
  * @version 2.0 2010-08-14\r
- * \r
+ *\r
  * @author N. Quaine\r
  * @author Kasun Gajasinghe\r
  *//*\r
@@ -15,9 +15,9 @@ package com.nexwave.nquindexer;
        public static IndexerTask IT = null; \r
        */\r
 /**\r
       * @param args\r
-        * @throws InterruptedException \r
       *//*\r
+ * @param args\r
+ * @throws InterruptedException\r
+ *//*\r
 \r
        public static void main(String[] args) throws InterruptedException {\r
         if (args.length != 0) {\r
index 329a21d34dcf80e7aa8f7175a9348dffadd07794..c34d4b8195e28102c781d6801e7ff23667ef094d 100755 (executable)
@@ -18,9 +18,9 @@ import com.nexwave.nsidita.DocFileInfo;
  * Outputs the js files with:
  * - the list of html files and their description
  * - the words retrieved from the html files and their location
- *
+ * <p/>
  * 20110803: Adding improvements from Radu/Oxygen.
- * 
+ *
  * @author N. Quaine
  * @author Kasun Gajasinghe
  * @version 2.0 2010-08-13
@@ -30,12 +30,14 @@ public class WriteJSFiles {
     private static String txt_VM_encoding_not_supported = "This VM does not support the specified encoding.";
     private static String txt_indices_location = "The created index files are located in ";
 
-       /** Create a javascript array listing the html files with their paths relative to the project root
-     * @param fileO path and name of the file in which to output the list of html files
-     * @param list  of the html files, relative to the doc root directory
-        * @param doStem If true then js files will generate words stemmed
+    /**
+     * Create a javascript array listing the html files with their paths relative to the project root
+     *
+     * @param fileO  path and name of the file in which to output the list of html files
+     * @param list   of the html files, relative to the doc root directory
+     * @param doStem If true, the generated js files contain stemmed words
      */
-       public static void WriteHTMLList (String fileO,ArrayList<String> list, boolean doStem) {
+    public static void WriteHTMLList(String fileO, ArrayList<String> list, boolean doStem) {
         int i = 0;
         Iterator it;
 
@@ -64,7 +66,7 @@ public class WriteJSFiles {
                 i++;
             }
 
-               out.write("var doStem = " + doStem + "");
+            out.write("var doStem = " + doStem + "");
             out.flush();  // Don't forget to flush!
             out.close();
 //             System.out.println("the array of html is in " + fileO);
@@ -79,8 +81,10 @@ public class WriteJSFiles {
 
     }
 
-       /** Create a javascript array listing the html files with 
+    /**
+     * Create a javascript array listing the html files with
      * their paths relative to project root, their titles and shortdescs
+     *
      * @param fileO path and name of the file in which to output the list of html files
      * @param list  of the html files, relative to the doc root directory
      */
@@ -120,27 +124,27 @@ public class WriteJSFiles {
                 if (tempTitle != null) {
                     tempTitle = tempTitle.replaceAll("\\s+", " ");
                     tempTitle = tempTitle.replaceAll("['�\"]", " ");
-                               //EXM-21239 Escape "\"
-                               tempTitle = tempTitle.replaceAll("\\\\", "\\\\\\\\");
+                    //EXM-21239 Escape "\"
+                    tempTitle = tempTitle.replaceAll("\\\\", "\\\\\\\\");
                 }
                 if (tempShortdesc != null) {
                     tempShortdesc = tempShortdesc.replaceAll("\\s+", " ");
                     tempShortdesc = tempShortdesc.replaceAll("['�\"]", " ");
-                               //EXM-21239 Escape "\"
-                               tempShortdesc = tempShortdesc.replaceAll("\\\\", "\\\\\\\\");
+                    //EXM-21239 Escape "\"
+                    tempShortdesc = tempShortdesc.replaceAll("\\\\", "\\\\\\\\");
                 }
-                       if (tempShortdesc != null) {
-                               String stripNonAlphabeticalChars = stripNonAlphabeticalChars(tempShortdesc);                    
-                               //stripNonAlphabeticalChars = stripWords(stripNonAlphabeticalChars);                            
-                               stripNonAlphabeticalChars = stripNonAlphabeticalChars + "...";                  
-                               out.write("fil[\""+i+"\"]"+"= \""+tempPath+"@@@"+tempTitle+"@@@"+stripNonAlphabeticalChars+"\";\n");
-                i++;
-                       }else{
-                               out.write("fil[\""+i+"\"]"+"= \""+tempPath+"@@@"+tempTitle+"@@@null"+"\";\n");
-                               i++;
+                if (tempShortdesc != null) {
+                    String stripNonAlphabeticalChars = stripNonAlphabeticalChars(tempShortdesc);
+                    //stripNonAlphabeticalChars = stripWords(stripNonAlphabeticalChars);
+                    stripNonAlphabeticalChars = stripNonAlphabeticalChars + "...";
+                    out.write("fil[\"" + i + "\"]" + "= \"" + tempPath + "@@@" + tempTitle + "@@@" + stripNonAlphabeticalChars + "\";\n");
+                    i++;
+                } else {
+                    out.write("fil[\"" + i + "\"]" + "= \"" + tempPath + "@@@" + tempTitle + "@@@null" + "\";\n");
+                    i++;
 
 
-                       }
+                }
             }
 
             out.flush();  // Don't forget to flush!
@@ -156,7 +160,9 @@ public class WriteJSFiles {
 
     }
 
-       /** Create javascript index files alphabetically.
+    /**
+     * Create javascript index files alphabetically.
+     *
      * @param fileOutStr      contains the path and the suffix of the index files to create.
      *                        The first letter of the key is added to the given suffix. For example: e.g. a.js, b.js etc...
      * @param indexMap        its keys are the indexed words and
@@ -228,30 +234,31 @@ public class WriteJSFiles {
     }
 
 
-       /**
-        * Remove all non alphabetical chars from the end of a text.
-        * @param input The text who will be striped.
-        * @return The striped text.
-        */
-       private static String stripNonAlphabeticalChars(String input) {
-               String output = input;
-               for (int i = input.length() - 1; i > 0 ; i--) {
-                       char charAt = input.charAt(i);
-                       int k = (int)charAt;
-                       if ((k > 65 && k < 91) || (k > 97 && k < 123) || (k > 48 && k < 58)) {
-                               return output;
-                       } else {
-                               output = output.substring(0, output.length() - 1);
-                       }
-               }
-               return output;
-       }
-       
-       private static String stripWords(String input) {
-               int idx = input.lastIndexOf(" ");
-               if (idx != -1) {
-                       return input.substring(0, idx);
-               } else {
+    /**
+     * Remove all non-alphabetical chars from the end of a text.
+     *
+     * @param input The text that will be stripped.
+     * @return The stripped text.
+     */
+    private static String stripNonAlphabeticalChars(String input) {
+        String output = input;
+        for (int i = input.length() - 1; i > 0; i--) {
+            char charAt = input.charAt(i);
+            int k = (int) charAt;
+            if ((k > 65 && k < 91) || (k > 97 && k < 123) || (k > 48 && k < 58)) {
+                return output;
+            } else {
+                output = output.substring(0, output.length() - 1);
+            }
+        }
+        return output;
+    }
+
+    private static String stripWords(String input) {
+        int idx = input.lastIndexOf(" ");
+        if (idx != -1) {
+            return input.substring(0, idx);
+        } else {
                        return input;
                }
        }
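
For reference, the fil[...] entries written above follow a "path@@@title@@@shortdesc" pattern. A minimal sketch of one such generated line (illustrative values only, not part of this commit):

// Illustrative sketch: reproduces the format of a single fil[] entry.
public class FilEntrySketch {
    public static void main(String[] args) {
        int i = 0;
        String tempPath = "tasks/install.html";              // hypothetical values
        String tempTitle = "Installing";
        String tempShortdesc = "How to install the product";
        System.out.print("fil[\"" + i + "\"]" + "= \"" + tempPath + "@@@"
                + tempTitle + "@@@" + tempShortdesc + "..." + "\";\n");
        // prints: fil["0"]= "tasks/install.html@@@Installing@@@How to install the product...";
    }
}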
index 5c487e9f3c454d95b6170df5de2aae1aeeeed8e5..3eb9a0cc4be13b094d692c19648f08f379a96201 100755 (executable)
@@ -2,32 +2,31 @@ package com.nexwave.nsidita;
 \r
 //import java.util.regex.;\r
 \r
-public class BlankRemover\r
-{\r
+public class BlankRemover {\r
 \r
     /* remove leading whitespace */\r
     public static String ltrim(String source) {\r
-        return (source==null)? null : source.replaceAll("^[\\s\u00A0]+", "");\r
+        return (source == null) ? null : source.replaceAll("^[\\s\u00A0]+", "");\r
     }\r
 \r
     /* remove trailing whitespace */\r
     public static String rtrim(String source) {\r
-       \r
-        return (source==null)? null : source.replaceAll("[\\s\u00A0]+$", "");\r
+\r
+        return (source == null) ? null : source.replaceAll("[\\s\u00A0]+$", "");\r
     }\r
 \r
     /* replace multiple whitespace between words with single blank */\r
     public static String itrim(String source) {\r
-        return (source==null)? null : source.replaceAll("\\b[\\s\u00A0]{2,}\\b", " ");\r
+        return (source == null) ? null : source.replaceAll("\\b[\\s\u00A0]{2,}\\b", " ");\r
     }\r
 \r
     /* remove all superfluous whitespace in source string */\r
     public static String rmWhiteSpace(String source) {\r
-               //System.out.println("Trimmed: '" + itrim(ltrim(rtrim(source))) + "'");\r
-        return (source==null)? null : itrim(ltrim(rtrim(source)));\r
+        //System.out.println("Trimmed: '" + itrim(ltrim(rtrim(source))) + "'");\r
+        return (source == null) ? null : itrim(ltrim(rtrim(source)));\r
     }\r
 \r
-    public static String lrtrim(String source){\r
-        return (source==null)? null : ltrim(rtrim(source));\r
+    public static String lrtrim(String source) {\r
+        return (source == null) ? null : ltrim(rtrim(source));\r
     }\r
 }\r
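
A minimal usage sketch of the whitespace helpers above (illustrative only, not part of this commit):

import com.nexwave.nsidita.BlankRemover;

public class BlankRemoverDemo {
    public static void main(String[] args) {
        // Two non-breaking spaces (\u00A0) between "with" and "extra" are also collapsed.
        String s = "  some   text   with\u00A0\u00A0extra   blanks  ";
        System.out.println("'" + BlankRemover.rmWhiteSpace(s) + "'");
        // prints: 'some text with extra blanks'
    }
}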
index e24cda8e020183b4ec39558560d790441d3acda3..1ab29d887cf53c4fa51e7a9a58f80cb12cbfd05a 100755 (executable)
@@ -7,98 +7,101 @@ import java.util.Iterator;
 import java.util.regex.*;
 
 public class DirList {
-       
-       ArrayList<File> listFiles = null;
-       ArrayList<String> listFilesRelTo = null;
-       String [] topicFiles = null;
-       public static final int MAX_DEPTH = 10;
-    
-  public DirList(File inputDir, String regexp, int depth) {
-    try {
-      
-      listFiles = new ArrayList<File> ();
-       
-    // not yet implemented     
-      if(regexp == null) {
-          for (File f: inputDir.listFiles()) {
-                 if (!f.isDirectory()){
-                         listFiles.add(f);
-                 }else {
-                         if (depth < MAX_DEPTH ) {
-                               DirList nsiDoc = new DirList(f,regexp,depth+1);
-                               listFiles.addAll(new ArrayList<File>(nsiDoc.getListFiles()));
-                         }
-                 }
-          }
-      }
-      else {
-          for (File f: inputDir.listFiles(new DirFilter(regexp))) {
-                 listFiles.add(f);
-          }
+
+    ArrayList<File> listFiles = null;
+    ArrayList<String> listFilesRelTo = null;
+    String[] topicFiles = null;
+    public static final int MAX_DEPTH = 10;
+
+    public DirList(File inputDir, String regexp, int depth) {
+        try {
+
+            listFiles = new ArrayList<File>();
+
+            // not yet implemented
+            if (regexp == null) {
+                for (File f : inputDir.listFiles()) {
+                    if (!f.isDirectory()) {
+                        listFiles.add(f);
+                    } else {
+                        if (depth < MAX_DEPTH) {
+                            DirList nsiDoc = new DirList(f, regexp, depth + 1);
+                            listFiles.addAll(new ArrayList<File>(nsiDoc.getListFiles()));
+                        }
+                    }
+                }
+            } else {
+                for (File f : inputDir.listFiles(new DirFilter(regexp))) {
+                    listFiles.add(f);
+                }
 // Patch from Oxygen to address problem where directories
 // containing . were not traversed.
-          for (File f: inputDir.listFiles(new DirFilter(".*"))) {
-                 if (f.isDirectory()){
-                         if (depth < MAX_DEPTH ) {
-                               DirList nsiDoc = new DirList(f,regexp, depth+1);
-                               listFiles.addAll(new ArrayList<File>(nsiDoc.getListFiles()));
-                         }
-                 }
-          }
-      }
-    } 
-    catch(Exception e) {
-       // TODO gerer exception
-     e.printStackTrace();
+                for (File f : inputDir.listFiles(new DirFilter(".*"))) {
+                    if (f.isDirectory()) {
+                        if (depth < MAX_DEPTH) {
+                            DirList nsiDoc = new DirList(f, regexp, depth + 1);
+                            listFiles.addAll(new ArrayList<File>(nsiDoc.getListFiles()));
+                        }
+                    }
+                }
+            }
+        }
+        catch (Exception e) {
+            // TODO handle exception
+            e.printStackTrace();
+        }
+    }
+
+    public ArrayList<File> getListFiles() {
+        return this.listFiles;
+    }
+
+    /**
+     * Calculate the path of the files already listed relative to projectDir
+     *
+     * @param projectDir Root from where to calculate the relative path
+     * @return The list of files with their relative path
+     */
+    public ArrayList<String> getListFilesRelTo(String projectDir) {
+        Iterator it;
+
+        if (this.listFiles == null) return null;
+
+        listFilesRelTo = new ArrayList<String>();
+        it = this.listFiles.iterator();
+        while (it.hasNext()) {
+            File ftemp = (File) it.next();
+            String stemp = ftemp.getPath();
+            int i = stemp.indexOf(projectDir);
+            if (i != 0) {
+                System.out.println("the documentation root does not match with the documentation input!");
+                return null;
+            }
+            int ad = 1;
+            if (stemp.equals(projectDir)) ad = 0;
+            stemp = stemp.substring(i + projectDir.length() + ad);
+            listFilesRelTo.add(stemp);
+        }
+        return this.listFilesRelTo;
     }
-  }
-  
-  public ArrayList<File> getListFiles() {
-         return this.listFiles;
-  }
- /**
-  * Calculate the path of the files already listed relative to projectDir
-  * @param projectDir Root from where to calculate the relative path
-  * @return The list of files with their relative path
-  */ 
-  public ArrayList<String> getListFilesRelTo(String projectDir) {
-       Iterator it;
-       
-       if (this.listFiles == null) return null;
-       
-       listFilesRelTo =  new ArrayList<String>();
-       it = this.listFiles.iterator ( ) ;
-       while ( it.hasNext ( ) ) {
-               File ftemp = (File) it.next();
-               String stemp = ftemp.getPath();
-               int i = stemp.indexOf(projectDir);
-               if ( i != 0 ) {
-                       System.out.println("the documentation root does not match with the documentation input!");
-                       return null;
-               }
-               int ad = 1;
-               if (stemp.equals(projectDir)) ad = 0; 
-               stemp = stemp.substring(i+projectDir.length()+ad);
-               listFilesRelTo.add(stemp);
-       }
-       return this.listFilesRelTo;
-  }
 
 }
 
 class DirFilter implements FilenameFilter {
-       private Pattern pattern;
-       public DirFilter(String regex) {
-           pattern = Pattern.compile(regex);
-         }
-         public boolean accept(File dir, String name) {
-                 String thisname = new File(name).getName();
-                 //System.out.println("Testing: "+ thisname);
-                 if(thisname.equals("index.html") || thisname.equals("ix01.html")){
-                         return false;
-                 }else{
-                         // Strip path information, search for regex:
-                         return pattern.matcher(new File(name).getName()).matches();
-                 }
-         }
+    private Pattern pattern;
+
+    public DirFilter(String regex) {
+        pattern = Pattern.compile(regex);
+    }
+
+    public boolean accept(File dir, String name) {
+        String thisname = new File(name).getName();
+        //System.out.println("Testing: "+ thisname);
+        if (thisname.equals("index.html") || thisname.equals("ix01.html")) {
+            return false;
+        } else {
+            // Strip path information, search for regex:
+            return pattern.matcher(new File(name).getName()).matches();
+        }
+    }
 } 
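
A minimal usage sketch of DirList above (illustrative only, not part of this commit; the directory and regular expression are assumptions):

import java.io.File;
import com.nexwave.nsidita.DirList;

public class DirListDemo {
    public static void main(String[] args) {
        File htmlDir = new File("docs/webhelp");                 // hypothetical doc directory
        DirList dirList = new DirList(htmlDir, "^.*\\.html?$", 1);
        // Paths relative to the given root; index.html and ix01.html are skipped by DirFilter.
        for (String rel : dirList.getListFilesRelTo(htmlDir.getPath())) {
            System.out.println(rel);
        }
    }
}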
index a34fe0d6e3bf24f62b277dcd9574c6e24f15e663..446a31678a365f1ab3a403f3005ad432dc165c78 100755 (executable)
@@ -1,62 +1,63 @@
 package com.nexwave.nsidita;\r
 \r
 import java.io.File;\r
+\r
 /**\r
  * Object for describing a dita or html file.\r
- * \r
- * @version 2.0 2010-08-14\r
- * \r
+ *\r
  * @author N. Quaine\r
+ * @version 2.0 2010-08-14\r
  */\r
 public class DocFileInfo {\r
-       File fullpath = null;\r
-       String title = null;\r
-       String shortdesc = null;\r
-       String relpathToDocRep = null; //relative path to doc repository (ex: tasks/nexbuilder)\r
-       String deltaPathToDocRep = null; // distance from the doc repository (ex: ../..)\r
-\r
-       // default constructor\r
-       public DocFileInfo() {\r
-       }\r
-       \r
-       public DocFileInfo(File file) {\r
-               fullpath = file;\r
-       }\r
-       \r
-       public DocFileInfo(DocFileInfo info) {\r
-               this.fullpath = info.fullpath;\r
-               this.title = info.title;\r
-               this.shortdesc = info.shortdesc;\r
-       }\r
-       \r
-       public void setTitle (String title){\r
-               this.title = title;\r
-       }\r
-\r
-       public void setShortdesc (String shortDesc){\r
-               this.shortdesc = shortDesc;\r
-       }\r
-\r
-       /**\r
-        * @return the shortdesc\r
-        */\r
-       public String getShortdesc() {\r
-               return shortdesc;\r
-       }\r
-\r
-       /**\r
-        * @return the title\r
-        */\r
-       public String getTitle() {\r
-               return title;\r
-       }\r
-\r
-       public File getFullpath() {\r
-               return fullpath;\r
-       }\r
-\r
-       public void setFullpath(File fullpath) {\r
-               this.fullpath = fullpath;\r
-       }\r
+    File fullpath = null;\r
+    String title = null;\r
+    String shortdesc = null;\r
+    String relpathToDocRep = null; //relative path to doc repository (ex: tasks/nexbuilder)\r
+    String deltaPathToDocRep = null; // distance from the doc repository (ex: ../..)\r
+\r
+    // default constructor\r
+\r
+    public DocFileInfo() {\r
+    }\r
+\r
+    public DocFileInfo(File file) {\r
+        fullpath = file;\r
+    }\r
+\r
+    public DocFileInfo(DocFileInfo info) {\r
+        this.fullpath = info.fullpath;\r
+        this.title = info.title;\r
+        this.shortdesc = info.shortdesc;\r
+    }\r
+\r
+    public void setTitle(String title) {\r
+        this.title = title;\r
+    }\r
+\r
+    public void setShortdesc(String shortDesc) {\r
+        this.shortdesc = shortDesc;\r
+    }\r
+\r
+    /**\r
+     * @return the shortdesc\r
+     */\r
+    public String getShortdesc() {\r
+        return shortdesc;\r
+    }\r
+\r
+    /**\r
+     * @return the title\r
+     */\r
+    public String getTitle() {\r
+        return title;\r
+    }\r
+\r
+    public File getFullpath() {\r
+        return fullpath;\r
+    }\r
+\r
+    public void setFullpath(File fullpath) {\r
+        this.fullpath = fullpath;\r
+    }\r
 \r
 }\r
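
A minimal usage sketch of the DocFileInfo holder above (illustrative only, not part of this commit):

import java.io.File;
import com.nexwave.nsidita.DocFileInfo;

public class DocFileInfoDemo {
    public static void main(String[] args) {
        DocFileInfo info = new DocFileInfo(new File("docs/tasks/install.html")); // hypothetical path
        info.setTitle("Installing");
        info.setShortdesc("How to install the product");
        System.out.println(info.getTitle() + " - " + info.getShortdesc());
    }
}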