From d0dd98c4c702af9b29372c303844c4c6c14b4109 Mon Sep 17 00:00:00 2001
From: Kasun Gajasinghe <kasunbg@gmail.com>
Date: Fri, 9 Sep 2011 17:52:26 +0000
Subject: [PATCH] Clean up indexer code: remove dead IndexerTask, simplify boolean logic, use for-each loops

---
 .../com/nexwave/nquindexer/IndexerMain.java   |   3 +-
 .../com/nexwave/nquindexer/IndexerTask.java   | 372 ------------------
 .../nexwave/nquindexer/SaxDocFileParser.java  | 223 +++++------
 .../com/nexwave/nquindexer/SaxHTMLIndex.java  | 107 ++---
 .../src/com/nexwave/nsidita/BlankRemover.java |   4 +-
 .../src/com/nexwave/nsidita/DirList.java      |  14 +-
 6 files changed, 172 insertions(+), 551 deletions(-)
 delete mode 100755 xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerTask.java
diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java
index 8b94207ba..63cbe9c93 100644
--- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java
+++ b/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java
@@ -66,7 +66,7 @@ public class IndexerMain {
 
     /**
      * The content language defaults to English "en" 
-     * @param htmlDir The directory where html files resides.
+     * @param htmlDir The directory where html files reside.
      */
     public IndexerMain(String htmlDir) {
         super();
@@ -338,7 +338,6 @@ public class IndexerMain {
                 System.out.println("Delay = " + diff / 1000 + " seconds");
         } else {
             System.out.println(txt_wrong_dita_basedir);
-            return;
         }
     }
 
diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerTask.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerTask.java
deleted file mode 100755
index 373e89d01..000000000
--- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerTask.java
+++ /dev/null
@@ -1,372 +0,0 @@
-/*
-package com.nexwave.nquindexer;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.Properties;
-
-import org.apache.tools.ant.BuildException;
-import org.apache.tools.ant.Task;
-
-
-import com.nexwave.nsidita.DirList;
-import com.nexwave.nsidita.DocFileInfo;
-
-*/
-/**
- * Indexer ant task.
- * 
- * @version 1.0 2008-02-26
- * 
- * @author N. Quaine
- * @author Kasun Gajasinghe <http://kasunbg.blogspot.com>
- *//*
-
-public class IndexerTask extends Task {
-
-	// messages
-	private String txt_no_inputdir = "Input directory not found:";
-	private String txt_cannot_create_outputdir = "Cannot create output search directory.";
-	private String txt_no_files_found = "No html files found.";
-	private String txt_wrong_dita_basedir = "ERROR: Parser initialization failed. Wrong dita base dir";
-	private String txt_no_relative_files_found= "No relative html files calculated.";
-	private String txt_no_words_gathered= "No words have been indexed in";
-	private String txt_no_html_files="No HTML Files found in";
-	private String txt_no_args="No argument given: you must provide an htmlDir to the IndexerTask";
-	
-	//working directories
-	private String searchdir = "search";
-	private File inputDir = null;
-	private String outputDir = null;
-	private String projectDir = null;
-
-	// ANT parameters
-	private String htmlDir=null;
-    public static String indexerLanguage="en";
-
-    //supported languages: add new additions to this. don't include country codes to the end such as en_US or en_UK,
-    // as stemmers doesn't find a difference between them.
-    private String[] supportedLanguages= {"en", "de", "fr", "zh", "ja", "ko"}; //currently extended support available for
-                // English, German, French and CJK (Chinese [zh], Japanese [ja], Korean [ko]) languages only.
-
-	// Indexing features: words to remove
-	private ArrayList<String> cleanUpStrings = null;	
-	private ArrayList<String> cleanUpChars = null;
-
-	//Html extension
-	private String htmlExtension = "html";
-	
-	// Constructor
-	public IndexerTask() {
-		super();
-	}
-	*/
-/** The setter for the "htmlDir" attribute (parameter of the task)
-	 * @param htmldir
-	 * @throws InterruptedException 
-	 *//*
-
-    public void setHtmlDir(String htmlDir) {
-        this.htmlDir = htmlDir;
-    }
-
-     */
-/**
-     * Set the extension in which html files are generated
-     * @param htmlExtension The extension in wich html files are generated
-     *//*
-
-    public void setHtmlextension(String htmlExtension) {
-		this.htmlExtension = htmlExtension;
-		//Trim the starting "."
-		if(this.htmlExtension.startsWith(".")) {
-			this.htmlExtension = this.htmlExtension.substring(1);
-		}
-	}
-
-    */
-/**
-     * setter for "indexerLanguage" attribute from ANT
-     * @param indexerLanguage language for the search indexer. Used to differerentiate which stemmer to be used.
-     * @throws InterruptedException for ant
-     *//*
-
-    public void setIndexerLanguage(String indexerLanguage){
-        if(indexerLanguage !=null && !"".equals(indexerLanguage)) {
-            int temp = indexerLanguage.indexOf('_');
-            if( temp != -1){
-                indexerLanguage = indexerLanguage.substring(0,temp);
-            }
-            int i=0;
-            for (;i<supportedLanguages.length;i++) {
-                if(indexerLanguage.equals(supportedLanguages[i])){
-                    IndexerTask.indexerLanguage = supportedLanguages[i];
-                    break;
-                }
-            }
-            
-            //if not in supported language list,
-            if(i>=supportedLanguages.length){
-//                System.out.println("The given language, \""+indexerLanguage+"\", does not have extensive support for " +
-//                        "searching. Check documentation for details. ");
-                IndexerTask.indexerLanguage = indexerLanguage;
-            } 
-        } else {
-            IndexerTask.indexerLanguage = "@@"; //fail-safe mechanism, This vm should not reach this point.
-        } 
-    }
-	
-	*/
-/**
-	 * Implementation of the execute function (Task interface)
-	 *//*
-
-	public void execute() throws BuildException {
-        try{
-            //Use Xerces as the parser. Does not support Saxon6.5.5 parser 
-           System.setProperty("org.xml.sax.driver", "org.apache.xerces.parsers.SAXParser");
-           System.setProperty("javax.xml.parsers.SAXParserFactory", "org.apache.xerces.jaxp.SAXParserFactoryImpl");
-//           System.setProperty("org.xml.sax.driver", "com.icl.saxon.aelfred.SAXDriver");
-//           System.setProperty("javax.xml.parsers.SAXParserFactory", "com.icl.saxon.aelfred.SAXParserFactoryImpl");
-        } catch (SecurityException se){
-            System.out.println("[WARNING] Default parser is not set to Xerces. Make sure Saxon6.5.5 " +
-                    "is not in your CLASSPATH.");
-        } catch (Exception e){
-            System.out.println("[WARNING] Default parser is not set to Xerces. Make sure Saxon6.5.5 " +
-                    "is not in your CLASSPATH");
-        }
-
-		ArrayList<DocFileInfo> filesDescription = null; // list of information about the topic files
-		ArrayList<File> htmlFiles = null; // topic files listed in the given directory
-		ArrayList<String> htmlFilesPathRel = null;
-		Map<String, String> tempDico = new HashMap<String, String>(); 
-		Iterator it;
-		
-		//File name initialization
-		String htmlList = "htmlFileList.js";
-		String htmlInfoList = "htmlFileInfoList.js";
-		String indexName = ".js";
-		
-		//timing
-		Date dateStart = new Date();
-		
-		if (htmlDir == null) {
-			System.out.println(txt_no_args + ".");
-			return;
-		}
-		// Init input directory
-		inputDir = new File(htmlDir);
-
-		// Begin of init
-		// check if inputdir initialized
-		if (inputDir == null) {
-			DisplayHelp();
-			return;
-		}
-		
-		// check if inputdir exists		
-		if (!inputDir.exists()) {
-			System.out.println(txt_no_inputdir + " "+ inputDir + ".");
-			return;
-		}
-		
-		// check if outputdir defined
-		if (outputDir == null) {
-            //set the output directory: path= {inputDir}/search 
-			outputDir = inputDir.getPath().concat(File.separator).concat(searchdir);
-		}
-
-		// check if outputdir exists
-		File tempfile = new File(outputDir); 
-		if (!tempfile.exists()) {
-			boolean b = (new File(outputDir)).mkdir();
-			if (!b) {
-				System.out.println(txt_cannot_create_outputdir + " "+ outputDir + ".");
-				return;
-			}
-		}
-		
-		// check if projdir is defined
-		if (projectDir == null) {
-			projectDir = inputDir.getPath();
-		}
-		//end of init
-		
-
-		// Get the list of all html files but the tocs, covers and indexes
-        DirList nsiDoc = new DirList(inputDir, "^.*\\." + htmlExtension + "?$", 1);
-		htmlFiles = nsiDoc.getListFiles();
-		// Check if found html files
-		if (htmlFiles.isEmpty()) {
-			System.out.println(txt_no_html_files + " "+ inputDir + ".");
-			return;
-		}
-		// Get the list of all html files with relative paths 
-		htmlFilesPathRel = nsiDoc.getListFilesRelTo(projectDir);
-		
-		if (htmlFiles == null) {
-			System.out.println(txt_no_files_found);
-			return;
-		} else if (htmlFilesPathRel == null) {
-			System.out.println(txt_no_relative_files_found);
-			return;			
-		}
-		
-		// Create the list of the existing html files (index starts at 0)
-		WriteJSFiles.WriteHTMLList(outputDir.concat(File.separator).concat(htmlList), htmlFilesPathRel);
-		
-		// Parse each html file to retrieve the words:
-		// ------------------------------------------
-		
-		// Retrieve the clean-up properties for indexing
-		RetrieveCleanUpProps();
-	   	// System.out.print("clean"+" " +cleanUpStrings);
-	    
-		//create a default handler
-		//SaxHTMLIndex spe = new SaxHTMLIndex (); // do not use clean-up props files
-		//SaxHTMLIndex spe = new SaxHTMLIndex (cleanUpStrings); // use clean-up props files
-		SaxHTMLIndex spe = new SaxHTMLIndex (cleanUpStrings, cleanUpChars); // use clean-up props files
-
-		if ( spe.init(tempDico) == 0 ) {
-
-			//create a html file description list
-			filesDescription = new ArrayList <DocFileInfo> ();
-			
-			it = htmlFiles.iterator ( ) ;
-			
-			// parse each html files
-			while ( it.hasNext ( ) ) {
-				File ftemp = (File) it.next();
-				//tempMap.put(key, value);
-				//The HTML file information are added in the list of FileInfoObject
-				DocFileInfo docFileInfoTemp = new DocFileInfo(spe.runExtractData(ftemp,indexerLanguage));
-				
-				ftemp = docFileInfoTemp.getFullpath();
-				String stemp = ftemp.toString();              
-				int i = stemp.indexOf(projectDir);
-				if ( i != 0 ) {
-					System.out.println("the documentation root does not match with the documentation input!");
-					return;
-				}
-				int ad = 1;
-				if (stemp.equals(projectDir)) ad = 0; 
-				stemp = stemp.substring(i+projectDir.length()+ad);  //i is redundant (i==0 always)
-				ftemp = new File (stemp);
-				docFileInfoTemp.setFullpath(ftemp);
-				
-				filesDescription.add(docFileInfoTemp);
-			}
-			*/
-/*remove empty strings from the map*//*
-
-			if (tempDico.containsKey("")) {
-				tempDico.remove("");
-			}
-			// write the index files
-			if (tempDico.isEmpty()) {
-				System.out.println(txt_no_words_gathered + " "+ inputDir + ".");
-				return;
-			}
-			
-			WriteJSFiles.WriteIndex(outputDir.concat(File.separator).concat(indexName), tempDico);
-			
-			// write the html list file with title and shortdesc
-			//create the list of the existing html files (index starts at 0)
-			WriteJSFiles.WriteHTMLInfoList(outputDir.concat(File.separator).concat(htmlInfoList), filesDescription);
-			
-			//perf measurement
-			Date dateEnd = new Date();
-			long diff = dateEnd.getTime() - dateStart.getTime();
-            if(diff<1000)
-			    System.out.println("Delay = " + diff + " milliseconds");
-            else
-                System.out.println("Delay = " + diff/1000 + " seconds");
-		}else {
-			System.out.println(txt_wrong_dita_basedir);
-			return;
-		}
-	}
-	
-	*/
-/**
-     * Prints the usage information for this class to <code>System.out</code>.
-     *//*
-
-    private static void DisplayHelp() {
-    	String lSep = System.getProperty("line.separator");
-        StringBuffer msg = new StringBuffer();
-        msg.append("USAGE:" + lSep);        
-        msg.append("   java -classpath TesterIndexer inputDir outputDir projectDir" + lSep);
-        msg.append("with:" + lSep);
-        msg.append("   inputDir (mandatory) :  specify the html files ' directory to index" + lSep);
-        msg.append("   outputDir (optional) : specify where to output the index files" + lSep);
-        msg.append("   projectDir (optional) : specify the root of the documentation directory" + lSep);
-        msg.append("Example:" + lSep);
-        msg.append("   java -classpath TesterIndexer /home/$USER/DITA/doc" + lSep);
-        msg.append("Example 2:" + lSep);
-        msg.append("   java -classpath TesterIndexer /home/$USER/DITA/doc/customer/concepts /home/$USER/temp/search /home/$USER/DITA/doc/" + lSep);
-        System.out.println(msg.toString());
-    }
-    private int RetrieveCleanUpProps (){
-
-    	// Files for punctuation (only one for now)
-        String[] punctuationFiles = new String[] {"punctuation.props"};
-        FileInputStream input;
-        String tempStr;
-        File ftemp;
-        Collection c = new ArrayList<String>();
-
-        // Get the list of the props file containing the words to remove (not the punctuation)
-        DirList props = new DirList(inputDir, "^(?!(punctuation)).*\\.props$", 1);
-		ArrayList<File> wordsList = props.getListFiles();
-//		System.out.println("props files:"+wordsList);
-        //TODO all properties are taken to a single arraylist. does it ok?.
-		Properties enProps =new Properties ();
-		String propsDir = inputDir.getPath().concat(File.separator).concat(searchdir);
-		
-		// Init the lists which will contain the words and chars to remove 
-		cleanUpStrings = new ArrayList<String>();
-		cleanUpChars = new ArrayList<String>();
-		
-	    try {
-	    	// Retrieve words to remove
-            for (File aWordsList : wordsList) {
-                ftemp = aWordsList;
-                if (ftemp.exists()) {
-                    enProps.load(input = new FileInputStream(ftemp.getAbsolutePath()));
-                    input.close();
-                    c = enProps.values();
-                    cleanUpStrings.addAll(c);
-                    enProps.clear();
-                }
-            }
-
-	    	// Retrieve char to remove (punctuation for ex.)
-            for (String punctuationFile : punctuationFiles) {
-                tempStr = propsDir.concat(File.separator).concat(punctuationFile);
-                ftemp = new File(tempStr);
-                if (ftemp.exists()) {
-                    enProps.load(input = new FileInputStream(tempStr));
-                    input.close();
-                    c = enProps.values();
-                    cleanUpChars.addAll(c);
-                    enProps.clear();
-                }
-            }
-	    }
-	    catch (IOException e) {
-	        e.printStackTrace();
-	        return 1;
-	    }
-    	return 0;
-    }
-
-}
-*/
diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java
index ca808d529..a24cc1855 100755
--- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java
+++ b/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java
@@ -13,14 +13,14 @@ import org.xml.sax.SAXParseException;
 
 /**
  * Generic parser for populating a DocFileInfo object.
- * 
+ *
  * @version 2.0 2010-08-14
- * 
+ *
  * @author N. Quaine
  * @author Kasun Gajasinghe
  */
 public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
-	
+
 	//members
 	protected DocFileInfo fileDesc = null;
 	protected String projectDir = null;
@@ -39,7 +39,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 	public SaxDocFileParser () {
 
 	}
-	
+
 	/**
 	 * Initializer
 	 */
@@ -48,16 +48,16 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 	}
 
 	/**
-	 * Parses the file to extract all the words for indexing and 
-	 * some data characterizing the file. 
-	 * @param file contains the fullpath of the document to parse  
+	 * Parses the file to extract all the words for indexing and
+	 * some data characterizing the file.
+	 * @param file contains the fullpath of the document to parse
 	 * @return a DitaFileInfo object filled with data describing the file
 	 */
 	public DocFileInfo runExtractData(File file) {
 		//initialization
 		fileDesc = new DocFileInfo(file);
 		strbf = new StringBuffer("");
-		
+
 		// Fill strbf by parsing the file
 		parseDocument(file);
 
@@ -67,7 +67,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 	public void parseDocument (File file) {
 //        System.out.println(System.getProperty("org.xml.sax.driver"));
 //        System.out.println(System.getProperty("javax.xml.parsers.SAXParserFactory"));
-        
+
 		//get a factory
 		javax.xml.parsers.SAXParserFactory spf = javax.xml.parsers.SAXParserFactory.newInstance();
 
@@ -83,7 +83,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 
             //parse the file and also register this class for call backs
 			//System.out.println("Parsing: " + file);
-			
+
 			long start = System.currentTimeMillis();
 			//System.out.println("about to parse " + file.getName() + " >>> " + start);
 
@@ -93,25 +93,25 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 				is.setSystemId(file.toURI().toURL().toString());
 			    sp.parse(is, this);
 			}
-			
+
 			long finish = System.currentTimeMillis();
 			//System.out.println("done parsing " + file.getName() + " >>> " + finish);
 			//System.out.println("time = " + (finish - start) + " milliseconds");
-			
+
 		}catch(SAXParseException spe){
             System.out.println("SaxParseException: The indexing file contains incorrect xml syntax.");
             spe.printStackTrace();
         }catch(org.xml.sax.SAXException se) {
 			System.out.println("SaxException. You may need to include Xerces in your classpath. " +
                     "See documentation for details");
-			se.printStackTrace(); 
+			se.printStackTrace();
 		}catch(javax.xml.parsers.ParserConfigurationException pce) {
 			pce.printStackTrace();
 		}catch (IOException ie) {
 			ie.printStackTrace();
 		}
 	}
-    
+
     private boolean addContent = false;
     private boolean addHeaderInfo = false;
     private boolean doNotIndex=false;
@@ -129,26 +129,26 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 		if((qName.equalsIgnoreCase("meta")) ) {
             addHeaderInfo = true;
 			String attrName = attributes.getValue("name");
-			// OXYGEN PATCH START EXM-20576 - add scoring for keywords
-			if(attrName != null && (attrName.equalsIgnoreCase("keywords") 
-				|| attrName.equalsIgnoreCase("description")
-				|| attrName.equalsIgnoreCase("indexterms")
-				)){
-			    if (attrName.equalsIgnoreCase("keywords")) {
-			        String[] keywords = attributes.getValue("content").split(", ");
-				for (int i = 0; i < keywords.length; i++) {
-				    strbf.append(" " + keywords[i] + "@@@elem_meta_keywords@@@ ");
-				}
-			    } else if (attrName.equalsIgnoreCase("indexterms")) {
-			        String[] indexterms = attributes.getValue("content").split(", ");
-				for (int i = 0; i < indexterms.length; i++) {
-				    strbf.append(" " + indexterms[i] + "@@@elem_meta_indexterms@@@ ");
-				}
-			    } else {
-				strbf.append(" " + attributes.getValue("content") + " ");
-			    }
-			} 
-			// OXYGEN PATCH END EXM-20576 - add scoring for indexterms
+			// OXYGEN PATCH START EXM-20576 - add scoring for keywords
+			if(attrName != null && (attrName.equalsIgnoreCase("keywords")
+				|| attrName.equalsIgnoreCase("description")
+				|| attrName.equalsIgnoreCase("indexterms")
+				)){
+			    if (attrName.equalsIgnoreCase("keywords")) {
+			        String[] keywords = attributes.getValue("content").split(", ");
+                    for (String keyword : keywords) {
+                        strbf.append(" ").append(keyword).append("@@@elem_meta_keywords@@@ ");
+                    }
+			    } else if (attrName.equalsIgnoreCase("indexterms")) {
+			        String[] indexterms = attributes.getValue("content").split(", ");
+                    for (String indexterm : indexterms) {
+                        strbf.append(" ").append(indexterm).append("@@@elem_meta_indexterms@@@ ");
+                    }
+			    } else {
+				strbf.append(" ").append(attributes.getValue("content") ).append(" ");
+			    }
+			}
+			// OXYGEN PATCH END EXM-20576 - add scoring for indexterms
 			// dwc: adding this to make the docbook <abstract> element
 			// (which becomes <meta name="description".../> in html)
 			// into the brief description that shows up in search
@@ -163,13 +163,9 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 			tempVal = new StringBuffer();
 		}
 
-        if(qName.equalsIgnoreCase("meta") || qName.equalsIgnoreCase("title") || qName.equalsIgnoreCase("shortdesc")){
-            addHeaderInfo = true;
-        } else {
-            addHeaderInfo = false;
-        }
+        addHeaderInfo = qName.equalsIgnoreCase("meta") || qName.equalsIgnoreCase("title") || qName.equalsIgnoreCase("shortdesc");
 
-        String elementId = attributes.getValue("id"); 
+        String elementId = attributes.getValue("id");
         if("content".equals(elementId)) addContent = true;
 
         if(addContent) {
@@ -193,11 +189,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
             }
 
             String accessKey = attributes.getValue("accesskey");
-            if(accessKey!=null && ("n".equals(accessKey) || "p".equals(accessKey) || "h".equals(accessKey))){
-                doNotIndex = true;
-            } else {
-                doNotIndex = false;
-            }
+            doNotIndex = accessKey != null && ("n".equals(accessKey) || "p".equals(accessKey) || "h".equals(accessKey));
         }
 		strbf.append(" ");
 	}
@@ -207,7 +199,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 
 		// index certain elements. E.g. Use this to implement a
 		// "titles only" index,
-        
+
         //OXYGEN PATCH, gather more keywords.
 		if(
 //				(addContent || addHeaderInfo) && 
@@ -221,64 +213,64 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 			// Do a minimal clean
 			text = minimalClean(text, null, null);
 			text = text.replaceAll("\\s+"," ");
-			String marker = "@@@elem_" + stack.peek() + "@@@ ";
-			Matcher m = Pattern.compile("(\\w|-|:)+").matcher(text);
+			String marker = "@@@elem_" + stack.peek() + "@@@ ";
+			Matcher m = Pattern.compile("(\\w|-|:)+").matcher(text);
 			if (text.trim().length() > 0 && m.find()) {
-			    String copyText = new String(originalText);
-			    text = duplicateWords(copyText, text, "-");
-			    copyText = new String(originalText);
-			    text = duplicateWords(copyText, text, ":");
-			    copyText = new String(originalText);
-			    text = duplicateWords(copyText, text, ".");
+			    String copyText = new String(originalText);
+			    text = duplicateWords(copyText, text, "-");
+			    copyText = new String(originalText);
+			    text = duplicateWords(copyText, text, ":");
+			    copyText = new String(originalText);
+			    text = duplicateWords(copyText, text, ".");
 				// Replace whitespace with the marker
 				text = text.replace(" ", marker);
 				text = text + marker;
 			}
 			// END OXYGEN PATCH
 			strbf.append(text);
-//			System.out.println("=== marked text: " + text);
+//			System.out.println("=== marked text: " + text);
 			// START OXYGEN PATCH, append the original text
 			if (tempVal != null) { tempVal.append(originalText);}
 			// END OXYGEN PATCH
 		}
 	}
-	
-	// START OXYGEN PATCH EXM-20414
-	private String duplicateWords(String sourceText, String acumulator, String separator) {
-//	    System.out.println("sourceText: " + sourceText + "   separator: " + separator);
-	    int index = sourceText.indexOf(separator);
-	    while (index >= 0) {
-		int indexSpaceAfter = sourceText.indexOf(" ", index);
-		String substring = null;
-		if (indexSpaceAfter >= 0) {
-		    substring = sourceText.substring(0, indexSpaceAfter);
-		    sourceText = sourceText.substring(indexSpaceAfter);
-		} else {
-		    substring = sourceText;
-		    sourceText = "";
-		}
-		
-		int indexSpaceBefore = substring.lastIndexOf(" ");
-		if (indexSpaceBefore >= 0) {
-		    substring = substring.substring(indexSpaceBefore + 1);
-		}
-		if (separator.indexOf(".") >= 0) {
-		    separator = separator.replaceAll("\\.", "\\\\.");
-//		    System.out.println("++++++++++ separator: " + separator);
-		}
-		String[] tokens = substring.split(separator);
-
-		for (int i = 0; i < tokens.length; i++) {
-		    acumulator = acumulator + " " + tokens[i];
-//		    System.out.println("added token: " + tokens[i] + "  new text: " + acumulator);
-		}
-		
-		index = sourceText.indexOf(separator);
-	    }
-	    
-	    return acumulator;
-	}
-	// END OXYGEN PATCH EXM-20414
+
+	// START OXYGEN PATCH EXM-20414
+	private String duplicateWords(String sourceText, String acumulator, String separator) {
+//	    System.out.println("sourceText: " + sourceText + "   separator: " + separator);
+	    int index = sourceText.indexOf(separator);
+	    while (index >= 0) {
+		int indexSpaceAfter = sourceText.indexOf(" ", index);
+		String substring = null;
+		if (indexSpaceAfter >= 0) {
+		    substring = sourceText.substring(0, indexSpaceAfter);
+		    sourceText = sourceText.substring(indexSpaceAfter);
+		} else {
+		    substring = sourceText;
+		    sourceText = "";
+		}
+
+		int indexSpaceBefore = substring.lastIndexOf(" ");
+		if (indexSpaceBefore >= 0) {
+		    substring = substring.substring(indexSpaceBefore + 1);
+		}
+		if (separator.indexOf(".") >= 0) {
+		    separator = separator.replaceAll("\\.", "\\\\.");
+//		    System.out.println("++++++++++ separator: " + separator);
+		}
+		String[] tokens = substring.split(separator);
+
+            for (String token : tokens) {
+                acumulator = acumulator + " " + token;
+//		    System.out.println("added token: " + token + "  new text: " + acumulator);
+            }
+
+            index = sourceText.indexOf(separator);
+	    }
+
+	    return acumulator;
+	}
+	// END OXYGEN PATCH EXM-20414
 	public void endElement(String uri, String localName, String qName) throws org.xml.sax.SAXException {
 		// START OXYGEN PATCH, remove element from stack
 		stack.pop();
@@ -292,31 +284,31 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 		else if (shortdescBool) {
 			shortTagCpt --;
 			if (shortTagCpt == 0) {
-				String shortdesc = tempVal.toString().replace('\n', ' ');
-				if(shortdesc.trim().length() > 0) {
-					fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(shortdesc));
-				}
+				String shortdesc = tempVal.toString().replace('\n', ' ');
+				if(shortdesc.trim().length() > 0) {
+					fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(shortdesc));
+				}
 			tempVal = null;
 			shortdescBool = false;
 			}
 		}
-        
+
         if(qName.equalsIgnoreCase("div") && addContent){
             divCount--;
             if (divCount == 0) {
                 addContent = false;
             }
-        } 
+        }
 	}
-	
+
 	public void processingInstruction(String target, String data) throws org.xml.sax.SAXException {
 		//do nothing
-		
+
 	}
-	
-	/*public InputSource resolveEntity(String publicId, String systemId) 
+
+	/*public InputSource resolveEntity(String publicId, String systemId)
 		throws IOException, SAXException {
-		
+
 		// use the catalog to solve the doctype
 		System.out.println("entities " + publicId + systemId);
 		return null;
@@ -325,13 +317,13 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 	throws org.xml.sax.SAXException, IOException {
 		//System.out.println("Entities " + publicId + "and" + systemId);
 		// use dita ot (dost.jar) for resolving dtd paths using the calatog
-		
+
 	return null;
 	}
 
     /**
-     * Removes the validation in html files, such as xml version and DTDs  
-     * @param file
+     * Removes the validation in html files, such as xml version and DTDs
+     * @param file the html file
      * @return int: returns 0 if no IOException occurs, else 1.
      */
 	public String RemoveValidationPI (File file) {
@@ -348,36 +340,35 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 				int i1, i2;
 				boolean ok = true;
 				try {
-	
+
 					String line = br.readLine();
-	
-			        
+
 					if (line == null) {
 						break;
 					}
 					//ok = line.matches("(.)*\\x26nbsp\\x3B(.)*");
-					
+
 					line = line.replaceAll("\\x26nbsp\\x3B", "&#160;");
-	
+
 					if (!line.contains("<!DOCTYPE html PUBLIC")) {
 						//dwc: This doesn't really apply to me. I already omit the xml pi for other reasons.
 						if (line.contains("<?xml version")) {
 							line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");
 						}
 
-                        sb.append(line + "\n");
-					} else  
+                        sb.append(line).append("\n");
+					} else
 					{
 						//dwc: What is this trying to do? Nuke the DOCTYPE? Why?
 						i1 = line.indexOf("<!DOCTYPE");
 						i2 = line.indexOf(">", i1);
 						while (i2 < 0) {
-							
+
 							line = line.concat(br.readLine());
 							i2 = line.indexOf(">", i1);
 						}
 						String temp = line.substring(i1, i2);
-						
+
 						//ok = line.matches("(.)*\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E(.)*");
 						if (line.contains("<?xml version")) {
 							line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");
@@ -399,7 +390,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {
 		{
 			return null;
 		}
-		
+
 		return sb.toString(); // return status
 
 	}
diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java
index abac4aeff..c57bbf7cd 100755
--- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java
+++ b/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java
@@ -67,8 +67,8 @@ public class SaxHTMLIndex extends SaxDocFileParser{
 	private int SCORING_FOR_BOLD = 5;
 	private int SCORING_FOR_ITALIC = 3;
 	private int SCORING_FOR_NORMAL_TEXT = 1;
-	private int SCORING_FOR_KEYWORD = 100;
-	private int SCORING_FOR_INDEXTERM = 75;
+	private int SCORING_FOR_KEYWORD = 100;
+	private int SCORING_FOR_INDEXTERM = 75;
 	
 	/**
 	 * The list with the word and scoring object
@@ -92,14 +92,17 @@ public class SaxHTMLIndex extends SaxDocFileParser{
 	}
 	/**
 	 * Constructor
-	 */
+     * @param cleanUpStrings the words to remove from the indexed content
+     */
 	public SaxHTMLIndex (ArrayList <String> cleanUpStrings) {
 		super();
 		cleanUpList = cleanUpStrings;
 	}
 	/**
 	 * Constructor
-	 */
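+     * @param cleanUpStrings the words to remove from the indexed content
+     * @param cleanUpChars the characters (e.g. punctuation) to remove from the indexed content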
+     */
 	public SaxHTMLIndex (ArrayList <String> cleanUpStrings, ArrayList <String> cleanUpChars) {
 		super();
 		cleanUpList = cleanUpStrings;
@@ -108,7 +111,8 @@ public class SaxHTMLIndex extends SaxDocFileParser{
 
 	/**
 	 * Initializer
-	 */
+     * @param tempMap the map kept as the temporary dictionary (tempDico)
+     */
 	public int init(Map<String,String> tempMap){
 		tempDico = tempMap;
 		return 0;
@@ -142,8 +146,8 @@ public class SaxHTMLIndex extends SaxDocFileParser{
         //Do Stemming for words in items
         //TODO currently, stemming support is for english and german only. Add support for other languages as well.
 
-        // START OXYGEN PATCH
-        wsList = new ArrayList<WordAndScoring>();
+        // START OXYGEN PATCH
+        wsList = new ArrayList<WordAndScoring>();
         // START OXYGEN PATCH, create the words and scoring list
 //        String[] tokenizedItems;
         // END OXYGEN PATCH
@@ -151,8 +155,8 @@ public class SaxHTMLIndex extends SaxDocFileParser{
                 || indexerLanguage.equalsIgnoreCase("ko")){
                 LinkedList<String> tokens = new LinkedList<String>();
             try{
-            	//EXM-21501 Oxygen patch, replace the extra "@@@"s.
-            	str = str.replaceAll("@@@([^\\s]*)@@@", "");
+            	//EXM-21501 Oxygen patch, replace the extra "@@@"s.
+            	str = str.replaceAll("@@@([^\\s]*)@@@", "");
                 CJKAnalyzer analyzer = new CJKAnalyzer(org.apache.lucene.util.Version.LUCENE_30);
                 Reader reader = new StringReader(str);
                 TokenStream stream = analyzer.tokenStream("", reader);
@@ -162,23 +166,23 @@ public class SaxHTMLIndex extends SaxDocFileParser{
                 while (stream.incrementToken()) {
                     String term = termAtt.term();
                     tokens.add(term);
-                    WordAndScoring ws = new WordAndScoring(term, term, 1);
-                    boolean found = false;
-    				for (int i = 0; i < wsList.size(); i++) { 
-    					// If the stem of the current word is already in list, 
-    					// do not add the word in the list, just recompute scoring
-    					if (wsList.get(i).getStem().equals(ws.getStem())) {
-    						found = true;
-    						int scoring = wsList.get(i).getScoring();
-    						wsList.get(i).setScoring(scoring + ws.getScoring());
-    						break;
+                    WordAndScoring ws = new WordAndScoring(term, term, 1);
+                    boolean found = false;
+                    for (WordAndScoring aWsList : wsList) {
+                        // If the stem of the current word is already in list,
+                        // do not add the word in the list, just recompute scoring
+                        if (aWsList.getStem().equals(ws.getStem())) {
+                            found = true;
+                            int scoring = aWsList.getScoring();
+                            aWsList.setScoring(scoring + ws.getScoring());
+                            break;
+                        }
+
+                    }
+    				if (!found) {
+    					wsList.add(ws);
+    				}
                 }
-
-					}
-    				if (!found) {
-    					wsList.add(ws);
-    				}
-                }
                 // START OXYGEN PATCH
                 //tokenizedItems = tokens.toArray(new String[tokens.size()]);
                 // END OXYGEN PATCH
@@ -199,7 +203,7 @@ public class SaxHTMLIndex extends SaxDocFileParser{
             } else if (indexerLanguage.equalsIgnoreCase("fr")){
                 stemmer= new FrenchStemmer();
             } else {
-                stemmer = null;//Languages which stemming is not yet supproted.So, No stemmers will be used.
+                stemmer = null;//Languages for which stemming is not yet supported. So, no stemmer will be used.
             }
             // START OXYGEN PATCH
             wsList = new ArrayList<WordAndScoring>();
@@ -210,16 +214,16 @@ public class SaxHTMLIndex extends SaxDocFileParser{
     			WordAndScoring ws = getWordAndScoring(token, stemmer, stem);
     			if (ws != null) {
     				boolean found = false;
-    				for (int i = 0; i < wsList.size(); i++) { 
-    					// If the stem of the current word is already in list, 
-    					// do not add the word in the list, just recompute scoring
-    					if (wsList.get(i).getStem().equals(ws.getStem())) {
-    						found = true;
-    						int scoring = wsList.get(i).getScoring();
-    						wsList.get(i).setScoring(scoring + ws.getScoring());
-    						break;
-    					}
-					}
+                    for (WordAndScoring aWsList : wsList) {
+                        // If the stem of the current word is already in list,
+                        // do not add the word in the list, just recompute scoring
+                        if (aWsList.getStem().equals(ws.getStem())) {
+                            found = true;
+                            int scoring = aWsList.getScoring();
+                            aWsList.setScoring(scoring + ws.getScoring());
+                            break;
+                        }
+                    }
     				if (!found) {
     					wsList.add(ws);
     				}
@@ -256,10 +260,11 @@ public class SaxHTMLIndex extends SaxDocFileParser{
         		;
         		//System.out.println("temp="+s+"="+temp);
         		tempDico.put(s.getStem(), temp);
-        	}else {
-        		String temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring()));
-        		tempDico.put(s.getStem(), temp);
-        	}
+        	}else if (s != null) {
+                    String temp = null;
+                    temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring()));
+                    tempDico.put(s.getStem(), temp);
+            }
         	// END OXYGEN PATCH
         }
 
@@ -301,10 +306,10 @@ public class SaxHTMLIndex extends SaxDocFileParser{
 					scoring = SCORING_FOR_ITALIC;
 				} else if ("strong".equalsIgnoreCase(elementName)) {
 					scoring = SCORING_FOR_BOLD;
-				} else if ("meta_keywords".equalsIgnoreCase(elementName)) {
-					scoring = SCORING_FOR_KEYWORD;
-				} else if ("meta_indexterms".equalsIgnoreCase(elementName)) {
-					scoring = SCORING_FOR_INDEXTERM;
+				} else if ("meta_keywords".equalsIgnoreCase(elementName)) {
+					scoring = SCORING_FOR_KEYWORD;
+				} else if ("meta_indexterms".equalsIgnoreCase(elementName)) {
+					scoring = SCORING_FOR_INDEXTERM;
 				}
 				// Get the stemmed word
 				String stemWord = word;
@@ -363,17 +368,15 @@ public class SaxHTMLIndex extends SaxDocFileParser{
 		}else {
 			// Clean-up using the props files
 			tempStrBuf.append("\\ba\\b");
-			Iterator it = cleanUpList.iterator();
-			while (it.hasNext()){
-				tempStrBuf.append("|\\b").append(it.next()).append("\\b");
-			}
+            for (String aCleanUp : cleanUpList) {
+                tempStrBuf.append("|\\b").append(aCleanUp ).append("\\b");
+            }
 		}
 		if ((cleanUpPunctuation != null) && (!cleanUpPunctuation.isEmpty())){
 			tempCharBuf.append("\\u3002");
-			Iterator it = cleanUpPunctuation.iterator();
-			while (it.hasNext()){
-				tempCharBuf.append("|"+it.next());
-			}
+            for (String aCleanUpPunctuation : cleanUpPunctuation) {
+                tempCharBuf.append("|").append(aCleanUpPunctuation);
+            }
 		}
 
 		str = minimalClean(str, tempStrBuf, tempCharBuf);
diff --git a/xsl-webhelpindexer/src/com/nexwave/nsidita/BlankRemover.java b/xsl-webhelpindexer/src/com/nexwave/nsidita/BlankRemover.java
index 44f67041e..5c487e9f3 100755
--- a/xsl-webhelpindexer/src/com/nexwave/nsidita/BlankRemover.java
+++ b/xsl-webhelpindexer/src/com/nexwave/nsidita/BlankRemover.java
@@ -16,12 +16,12 @@ public class BlankRemover
         return (source==null)? null : source.replaceAll("[\\s\u00A0]+$", "");
     }
 
-    /* replace multiple whitespaces between words with single blank */
+    /* replace runs of two or more whitespace characters between words with a single blank */
     public static String itrim(String source) {
         return (source==null)? null : source.replaceAll("\\b[\\s\u00A0]{2,}\\b", " ");
     }
 
-    /* remove all superfluous whitespaces in source string */
+    /* remove all superfluous whitespace in source string */
     public static String rmWhiteSpace(String source) {
 		//System.out.println("Trimmed: '" + itrim(ltrim(rtrim(source))) + "'");
         return (source==null)? null : itrim(ltrim(rtrim(source)));
diff --git a/xsl-webhelpindexer/src/com/nexwave/nsidita/DirList.java b/xsl-webhelpindexer/src/com/nexwave/nsidita/DirList.java
index 21538404c..e24cda8e0 100755
--- a/xsl-webhelpindexer/src/com/nexwave/nsidita/DirList.java
+++ b/xsl-webhelpindexer/src/com/nexwave/nsidita/DirList.java
@@ -13,34 +13,34 @@ public class DirList {
 	String [] topicFiles = null;
    	public static final int MAX_DEPTH = 10;
     
-  public DirList(File inputdir, String regex, int depth) {
+  public DirList(File inputDir, String regexp, int depth) {
     try {
       
       listFiles = new ArrayList<File> ();
     	
     // not yet implemented	
-      if(regex == null) {
-          for (File f: inputdir.listFiles()) {
+      if(regexp == null) {
+          for (File f: inputDir.listFiles()) {
         	  if (!f.isDirectory()){
         		  listFiles.add(f);
         	  }else {
         		  if (depth < MAX_DEPTH ) {
-           			DirList nsiDoc = new DirList(f,regex,depth+1);
+           			DirList nsiDoc = new DirList(f,regexp,depth+1);
          			listFiles.addAll(new ArrayList<File>(nsiDoc.getListFiles()));
         		  }
         	  }
           }
       }
       else {
-          for (File f: inputdir.listFiles(new DirFilter(regex))) {
+          for (File f: inputDir.listFiles(new DirFilter(regexp))) {
         	  listFiles.add(f);
           }
 // Patch from Oxygen to address problem where directories
 // containing . were not traversed.
-          for (File f: inputdir.listFiles(new DirFilter(".*"))) {
+          for (File f: inputDir.listFiles(new DirFilter(".*"))) {
         	  if (f.isDirectory()){
         		  if (depth < MAX_DEPTH ) {
-        			DirList nsiDoc = new DirList(f,regex, depth+1);
+        			DirList nsiDoc = new DirList(f,regexp, depth+1);
          			listFiles.addAll(new ArrayList<File>(nsiDoc.getListFiles()));
         		  }
         	  }
-- 
2.40.0