From d0dd98c4c702af9b29372c303844c4c6c14b4109 Mon Sep 17 00:00:00 2001 From: Kasun Gajasinghe Date: Fri, 9 Sep 2011 17:52:26 +0000 Subject: [PATCH] optimizing the code --- .../com/nexwave/nquindexer/IndexerMain.java | 3 +- .../com/nexwave/nquindexer/IndexerTask.java | 372 ------------------ .../nexwave/nquindexer/SaxDocFileParser.java | 223 +++++------ .../com/nexwave/nquindexer/SaxHTMLIndex.java | 107 ++--- .../src/com/nexwave/nsidita/BlankRemover.java | 4 +- .../src/com/nexwave/nsidita/DirList.java | 14 +- 6 files changed, 172 insertions(+), 551 deletions(-) delete mode 100755 xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerTask.java diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java index 8b94207ba..63cbe9c93 100644 --- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java +++ b/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java @@ -66,7 +66,7 @@ public class IndexerMain { /** * The content language defaults to English "en" - * @param htmlDir The directory where html files resides. + * @param htmlDir The directory where html files reside. 
*/ public IndexerMain(String htmlDir) { super(); @@ -338,7 +338,6 @@ public class IndexerMain { System.out.println("Delay = " + diff / 1000 + " seconds"); } else { System.out.println(txt_wrong_dita_basedir); - return; } } diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerTask.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerTask.java deleted file mode 100755 index 373e89d01..000000000 --- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerTask.java +++ /dev/null @@ -1,372 +0,0 @@ -/* -package com.nexwave.nquindexer; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.Date; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; -import java.util.Properties; - -import org.apache.tools.ant.BuildException; -import org.apache.tools.ant.Task; - - -import com.nexwave.nsidita.DirList; -import com.nexwave.nsidita.DocFileInfo; - -*/ -/** - * Indexer ant task. - * - * @version 1.0 2008-02-26 - * - * @author N. Quaine - * @author Kasun Gajasinghe - *//* - -public class IndexerTask extends Task { - - // messages - private String txt_no_inputdir = "Input directory not found:"; - private String txt_cannot_create_outputdir = "Cannot create output search directory."; - private String txt_no_files_found = "No html files found."; - private String txt_wrong_dita_basedir = "ERROR: Parser initialization failed. 
Wrong dita base dir"; - private String txt_no_relative_files_found= "No relative html files calculated."; - private String txt_no_words_gathered= "No words have been indexed in"; - private String txt_no_html_files="No HTML Files found in"; - private String txt_no_args="No argument given: you must provide an htmlDir to the IndexerTask"; - - //working directories - private String searchdir = "search"; - private File inputDir = null; - private String outputDir = null; - private String projectDir = null; - - // ANT parameters - private String htmlDir=null; - public static String indexerLanguage="en"; - - //supported languages: add new additions to this. don't include country codes to the end such as en_US or en_UK, - // as stemmers doesn't find a difference between them. - private String[] supportedLanguages= {"en", "de", "fr", "zh", "ja", "ko"}; //currently extended support available for - // English, German, French and CJK (Chinese [zh], Japanese [ja], Korean [ko]) languages only. - - // Indexing features: words to remove - private ArrayList cleanUpStrings = null; - private ArrayList cleanUpChars = null; - - //Html extension - private String htmlExtension = "html"; - - // Constructor - public IndexerTask() { - super(); - } - */ -/** The setter for the "htmlDir" attribute (parameter of the task) - * @param htmldir - * @throws InterruptedException - *//* - - public void setHtmlDir(String htmlDir) { - this.htmlDir = htmlDir; - } - - */ -/** - * Set the extension in which html files are generated - * @param htmlExtension The extension in wich html files are generated - *//* - - public void setHtmlextension(String htmlExtension) { - this.htmlExtension = htmlExtension; - //Trim the starting "." - if(this.htmlExtension.startsWith(".")) { - this.htmlExtension = this.htmlExtension.substring(1); - } - } - - */ -/** - * setter for "indexerLanguage" attribute from ANT - * @param indexerLanguage language for the search indexer. Used to differerentiate which stemmer to be used. 
- * @throws InterruptedException for ant - *//* - - public void setIndexerLanguage(String indexerLanguage){ - if(indexerLanguage !=null && !"".equals(indexerLanguage)) { - int temp = indexerLanguage.indexOf('_'); - if( temp != -1){ - indexerLanguage = indexerLanguage.substring(0,temp); - } - int i=0; - for (;i=supportedLanguages.length){ -// System.out.println("The given language, \""+indexerLanguage+"\", does not have extensive support for " + -// "searching. Check documentation for details. "); - IndexerTask.indexerLanguage = indexerLanguage; - } - } else { - IndexerTask.indexerLanguage = "@@"; //fail-safe mechanism, This vm should not reach this point. - } - } - - */ -/** - * Implementation of the execute function (Task interface) - *//* - - public void execute() throws BuildException { - try{ - //Use Xerces as the parser. Does not support Saxon6.5.5 parser - System.setProperty("org.xml.sax.driver", "org.apache.xerces.parsers.SAXParser"); - System.setProperty("javax.xml.parsers.SAXParserFactory", "org.apache.xerces.jaxp.SAXParserFactoryImpl"); -// System.setProperty("org.xml.sax.driver", "com.icl.saxon.aelfred.SAXDriver"); -// System.setProperty("javax.xml.parsers.SAXParserFactory", "com.icl.saxon.aelfred.SAXParserFactoryImpl"); - } catch (SecurityException se){ - System.out.println("[WARNING] Default parser is not set to Xerces. Make sure Saxon6.5.5 " + - "is not in your CLASSPATH."); - } catch (Exception e){ - System.out.println("[WARNING] Default parser is not set to Xerces. 
Make sure Saxon6.5.5 " + - "is not in your CLASSPATH"); - } - - ArrayList filesDescription = null; // list of information about the topic files - ArrayList htmlFiles = null; // topic files listed in the given directory - ArrayList htmlFilesPathRel = null; - Map tempDico = new HashMap(); - Iterator it; - - //File name initialization - String htmlList = "htmlFileList.js"; - String htmlInfoList = "htmlFileInfoList.js"; - String indexName = ".js"; - - //timing - Date dateStart = new Date(); - - if (htmlDir == null) { - System.out.println(txt_no_args + "."); - return; - } - // Init input directory - inputDir = new File(htmlDir); - - // Begin of init - // check if inputdir initialized - if (inputDir == null) { - DisplayHelp(); - return; - } - - // check if inputdir exists - if (!inputDir.exists()) { - System.out.println(txt_no_inputdir + " "+ inputDir + "."); - return; - } - - // check if outputdir defined - if (outputDir == null) { - //set the output directory: path= {inputDir}/search - outputDir = inputDir.getPath().concat(File.separator).concat(searchdir); - } - - // check if outputdir exists - File tempfile = new File(outputDir); - if (!tempfile.exists()) { - boolean b = (new File(outputDir)).mkdir(); - if (!b) { - System.out.println(txt_cannot_create_outputdir + " "+ outputDir + "."); - return; - } - } - - // check if projdir is defined - if (projectDir == null) { - projectDir = inputDir.getPath(); - } - //end of init - - - // Get the list of all html files but the tocs, covers and indexes - DirList nsiDoc = new DirList(inputDir, "^.*\\." 
+ htmlExtension + "?$", 1); - htmlFiles = nsiDoc.getListFiles(); - // Check if found html files - if (htmlFiles.isEmpty()) { - System.out.println(txt_no_html_files + " "+ inputDir + "."); - return; - } - // Get the list of all html files with relative paths - htmlFilesPathRel = nsiDoc.getListFilesRelTo(projectDir); - - if (htmlFiles == null) { - System.out.println(txt_no_files_found); - return; - } else if (htmlFilesPathRel == null) { - System.out.println(txt_no_relative_files_found); - return; - } - - // Create the list of the existing html files (index starts at 0) - WriteJSFiles.WriteHTMLList(outputDir.concat(File.separator).concat(htmlList), htmlFilesPathRel); - - // Parse each html file to retrieve the words: - // ------------------------------------------ - - // Retrieve the clean-up properties for indexing - RetrieveCleanUpProps(); - // System.out.print("clean"+" " +cleanUpStrings); - - //create a default handler - //SaxHTMLIndex spe = new SaxHTMLIndex (); // do not use clean-up props files - //SaxHTMLIndex spe = new SaxHTMLIndex (cleanUpStrings); // use clean-up props files - SaxHTMLIndex spe = new SaxHTMLIndex (cleanUpStrings, cleanUpChars); // use clean-up props files - - if ( spe.init(tempDico) == 0 ) { - - //create a html file description list - filesDescription = new ArrayList (); - - it = htmlFiles.iterator ( ) ; - - // parse each html files - while ( it.hasNext ( ) ) { - File ftemp = (File) it.next(); - //tempMap.put(key, value); - //The HTML file information are added in the list of FileInfoObject - DocFileInfo docFileInfoTemp = new DocFileInfo(spe.runExtractData(ftemp,indexerLanguage)); - - ftemp = docFileInfoTemp.getFullpath(); - String stemp = ftemp.toString(); - int i = stemp.indexOf(projectDir); - if ( i != 0 ) { - System.out.println("the documentation root does not match with the documentation input!"); - return; - } - int ad = 1; - if (stemp.equals(projectDir)) ad = 0; - stemp = stemp.substring(i+projectDir.length()+ad); //i is redundant 
(i==0 always) - ftemp = new File (stemp); - docFileInfoTemp.setFullpath(ftemp); - - filesDescription.add(docFileInfoTemp); - } - */ -/*remove empty strings from the map*//* - - if (tempDico.containsKey("")) { - tempDico.remove(""); - } - // write the index files - if (tempDico.isEmpty()) { - System.out.println(txt_no_words_gathered + " "+ inputDir + "."); - return; - } - - WriteJSFiles.WriteIndex(outputDir.concat(File.separator).concat(indexName), tempDico); - - // write the html list file with title and shortdesc - //create the list of the existing html files (index starts at 0) - WriteJSFiles.WriteHTMLInfoList(outputDir.concat(File.separator).concat(htmlInfoList), filesDescription); - - //perf measurement - Date dateEnd = new Date(); - long diff = dateEnd.getTime() - dateStart.getTime(); - if(diff<1000) - System.out.println("Delay = " + diff + " milliseconds"); - else - System.out.println("Delay = " + diff/1000 + " seconds"); - }else { - System.out.println(txt_wrong_dita_basedir); - return; - } - } - - */ -/** - * Prints the usage information for this class to System.out. 
- *//* - - private static void DisplayHelp() { - String lSep = System.getProperty("line.separator"); - StringBuffer msg = new StringBuffer(); - msg.append("USAGE:" + lSep); - msg.append(" java -classpath TesterIndexer inputDir outputDir projectDir" + lSep); - msg.append("with:" + lSep); - msg.append(" inputDir (mandatory) : specify the html files ' directory to index" + lSep); - msg.append(" outputDir (optional) : specify where to output the index files" + lSep); - msg.append(" projectDir (optional) : specify the root of the documentation directory" + lSep); - msg.append("Example:" + lSep); - msg.append(" java -classpath TesterIndexer /home/$USER/DITA/doc" + lSep); - msg.append("Example 2:" + lSep); - msg.append(" java -classpath TesterIndexer /home/$USER/DITA/doc/customer/concepts /home/$USER/temp/search /home/$USER/DITA/doc/" + lSep); - System.out.println(msg.toString()); - } - private int RetrieveCleanUpProps (){ - - // Files for punctuation (only one for now) - String[] punctuationFiles = new String[] {"punctuation.props"}; - FileInputStream input; - String tempStr; - File ftemp; - Collection c = new ArrayList(); - - // Get the list of the props file containing the words to remove (not the punctuation) - DirList props = new DirList(inputDir, "^(?!(punctuation)).*\\.props$", 1); - ArrayList wordsList = props.getListFiles(); -// System.out.println("props files:"+wordsList); - //TODO all properties are taken to a single arraylist. does it ok?. 
- Properties enProps =new Properties (); - String propsDir = inputDir.getPath().concat(File.separator).concat(searchdir); - - // Init the lists which will contain the words and chars to remove - cleanUpStrings = new ArrayList(); - cleanUpChars = new ArrayList(); - - try { - // Retrieve words to remove - for (File aWordsList : wordsList) { - ftemp = aWordsList; - if (ftemp.exists()) { - enProps.load(input = new FileInputStream(ftemp.getAbsolutePath())); - input.close(); - c = enProps.values(); - cleanUpStrings.addAll(c); - enProps.clear(); - } - } - - // Retrieve char to remove (punctuation for ex.) - for (String punctuationFile : punctuationFiles) { - tempStr = propsDir.concat(File.separator).concat(punctuationFile); - ftemp = new File(tempStr); - if (ftemp.exists()) { - enProps.load(input = new FileInputStream(tempStr)); - input.close(); - c = enProps.values(); - cleanUpChars.addAll(c); - enProps.clear(); - } - } - } - catch (IOException e) { - e.printStackTrace(); - return 1; - } - return 0; - } - -} -*/ diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java index ca808d529..a24cc1855 100755 --- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java +++ b/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java @@ -13,14 +13,14 @@ import org.xml.sax.SAXParseException; /** * Generic parser for populating a DocFileInfo object. - * + * * @version 2.0 2010-08-14 - * + * * @author N. 
Quaine * @author Kasun Gajasinghe */ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { - + //members protected DocFileInfo fileDesc = null; protected String projectDir = null; @@ -39,7 +39,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { public SaxDocFileParser () { } - + /** * Initializer */ @@ -48,16 +48,16 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { } /** - * Parses the file to extract all the words for indexing and - * some data characterizing the file. - * @param file contains the fullpath of the document to parse + * Parses the file to extract all the words for indexing and + * some data characterizing the file. + * @param file contains the fullpath of the document to parse * @return a DitaFileInfo object filled with data describing the file */ public DocFileInfo runExtractData(File file) { //initialization fileDesc = new DocFileInfo(file); strbf = new StringBuffer(""); - + // Fill strbf by parsing the file parseDocument(file); @@ -67,7 +67,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { public void parseDocument (File file) { // System.out.println(System.getProperty("org.xml.sax.driver")); // System.out.println(System.getProperty("javax.xml.parsers.SAXParserFactory")); - + //get a factory javax.xml.parsers.SAXParserFactory spf = javax.xml.parsers.SAXParserFactory.newInstance(); @@ -83,7 +83,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { //parse the file and also register this class for call backs //System.out.println("Parsing: " + file); - + long start = System.currentTimeMillis(); //System.out.println("about to parse " + file.getName() + " >>> " + start); @@ -93,25 +93,25 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { is.setSystemId(file.toURI().toURL().toString()); sp.parse(is, this); } - + long finish = System.currentTimeMillis(); //System.out.println("done parsing " + 
file.getName() + " >>> " + finish); //System.out.println("time = " + (finish - start) + " milliseconds"); - + }catch(SAXParseException spe){ System.out.println("SaxParseException: The indexing file contains incorrect xml syntax."); spe.printStackTrace(); }catch(org.xml.sax.SAXException se) { System.out.println("SaxException. You may need to include Xerces in your classpath. " + "See documentation for details"); - se.printStackTrace(); + se.printStackTrace(); }catch(javax.xml.parsers.ParserConfigurationException pce) { pce.printStackTrace(); }catch (IOException ie) { ie.printStackTrace(); } } - + private boolean addContent = false; private boolean addHeaderInfo = false; private boolean doNotIndex=false; @@ -129,26 +129,26 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { if((qName.equalsIgnoreCase("meta")) ) { addHeaderInfo = true; String attrName = attributes.getValue("name"); - // OXYGEN PATCH START EXM-20576 - add scoring for keywords - if(attrName != null && (attrName.equalsIgnoreCase("keywords") - || attrName.equalsIgnoreCase("description") - || attrName.equalsIgnoreCase("indexterms") - )){ - if (attrName.equalsIgnoreCase("keywords")) { - String[] keywords = attributes.getValue("content").split(", "); - for (int i = 0; i < keywords.length; i++) { - strbf.append(" " + keywords[i] + "@@@elem_meta_keywords@@@ "); - } - } else if (attrName.equalsIgnoreCase("indexterms")) { - String[] indexterms = attributes.getValue("content").split(", "); - for (int i = 0; i < indexterms.length; i++) { - strbf.append(" " + indexterms[i] + "@@@elem_meta_indexterms@@@ "); - } - } else { - strbf.append(" " + attributes.getValue("content") + " "); - } - } - // OXYGEN PATCH END EXM-20576 - add scoring for indexterms + // OXYGEN PATCH START EXM-20576 - add scoring for keywords + if(attrName != null && (attrName.equalsIgnoreCase("keywords") + || attrName.equalsIgnoreCase("description") + || attrName.equalsIgnoreCase("indexterms") + )){ + if 
(attrName.equalsIgnoreCase("keywords")) { + String[] keywords = attributes.getValue("content").split(", "); + for (String keyword : keywords) { + strbf.append(" ").append(keyword).append("@@@elem_meta_keywords@@@ "); + } + } else if (attrName.equalsIgnoreCase("indexterms")) { + String[] indexterms = attributes.getValue("content").split(", "); + for (String indexterm : indexterms) { + strbf.append(" ").append(indexterm).append("@@@elem_meta_indexterms@@@ "); + } + } else { + strbf.append(" ").append(attributes.getValue("content") ).append(" "); + } + } + // OXYGEN PATCH END EXM-20576 - add scoring for indexterms // dwc: adding this to make the docbook element // (which becomes in html) // into the brief description that shows up in search @@ -163,13 +163,9 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { tempVal = new StringBuffer(); } - if(qName.equalsIgnoreCase("meta") || qName.equalsIgnoreCase("title") || qName.equalsIgnoreCase("shortdesc")){ - addHeaderInfo = true; - } else { - addHeaderInfo = false; - } + addHeaderInfo = qName.equalsIgnoreCase("meta") || qName.equalsIgnoreCase("title") || qName.equalsIgnoreCase("shortdesc"); - String elementId = attributes.getValue("id"); + String elementId = attributes.getValue("id"); if("content".equals(elementId)) addContent = true; if(addContent) { @@ -193,11 +189,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { } String accessKey = attributes.getValue("accesskey"); - if(accessKey!=null && ("n".equals(accessKey) || "p".equals(accessKey) || "h".equals(accessKey))){ - doNotIndex = true; - } else { - doNotIndex = false; - } + doNotIndex = accessKey != null && ("n".equals(accessKey) || "p".equals(accessKey) || "h".equals(accessKey)); } strbf.append(" "); } @@ -207,7 +199,7 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { // index certain elements. E.g. Use this to implement a // "titles only" index, - + //OXYGEN PATCH, gather more keywords. 
if( // (addContent || addHeaderInfo) && @@ -221,64 +213,64 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { // Do a minimal clean text = minimalClean(text, null, null); text = text.replaceAll("\\s+"," "); - String marker = "@@@elem_" + stack.peek() + "@@@ "; - Matcher m = Pattern.compile("(\\w|-|:)+").matcher(text); + String marker = "@@@elem_" + stack.peek() + "@@@ "; + Matcher m = Pattern.compile("(\\w|-|:)+").matcher(text); if (text.trim().length() > 0 && m.find()) { - String copyText = new String(originalText); - text = duplicateWords(copyText, text, "-"); - copyText = new String(originalText); - text = duplicateWords(copyText, text, ":"); - copyText = new String(originalText); - text = duplicateWords(copyText, text, "."); + String copyText = new String(originalText); + text = duplicateWords(copyText, text, "-"); + copyText = new String(originalText); + text = duplicateWords(copyText, text, ":"); + copyText = new String(originalText); + text = duplicateWords(copyText, text, "."); // Replace whitespace with the marker text = text.replace(" ", marker); text = text + marker; } // END OXYGEN PATCH strbf.append(text); -// System.out.println("=== marked text: " + text); +// System.out.println("=== marked text: " + text); // START OXYGEN PATCH, append the original text if (tempVal != null) { tempVal.append(originalText);} // END OXYGEN PATCH } } - - // START OXYGEN PATCH EXM-20414 - private String duplicateWords(String sourceText, String acumulator, String separator) { -// System.out.println("sourceText: " + sourceText + " separator: " + separator); - int index = sourceText.indexOf(separator); - while (index >= 0) { - int indexSpaceAfter = sourceText.indexOf(" ", index); - String substring = null; - if (indexSpaceAfter >= 0) { - substring = sourceText.substring(0, indexSpaceAfter); - sourceText = sourceText.substring(indexSpaceAfter); - } else { - substring = sourceText; - sourceText = ""; - } - - int indexSpaceBefore = 
substring.lastIndexOf(" "); - if (indexSpaceBefore >= 0) { - substring = substring.substring(indexSpaceBefore + 1); - } - if (separator.indexOf(".") >= 0) { - separator = separator.replaceAll("\\.", "\\\\."); -// System.out.println("++++++++++ separator: " + separator); - } - String[] tokens = substring.split(separator); - - for (int i = 0; i < tokens.length; i++) { - acumulator = acumulator + " " + tokens[i]; -// System.out.println("added token: " + tokens[i] + " new text: " + acumulator); - } - - index = sourceText.indexOf(separator); - } - - return acumulator; - } - // END OXYGEN PATCH EXM-20414 + + // START OXYGEN PATCH EXM-20414 + private String duplicateWords(String sourceText, String acumulator, String separator) { +// System.out.println("sourceText: " + sourceText + " separator: " + separator); + int index = sourceText.indexOf(separator); + while (index >= 0) { + int indexSpaceAfter = sourceText.indexOf(" ", index); + String substring = null; + if (indexSpaceAfter >= 0) { + substring = sourceText.substring(0, indexSpaceAfter); + sourceText = sourceText.substring(indexSpaceAfter); + } else { + substring = sourceText; + sourceText = ""; + } + + int indexSpaceBefore = substring.lastIndexOf(" "); + if (indexSpaceBefore >= 0) { + substring = substring.substring(indexSpaceBefore + 1); + } + if (separator.indexOf(".") >= 0) { + separator = separator.replaceAll("\\.", "\\\\."); +// System.out.println("++++++++++ separator: " + separator); + } + String[] tokens = substring.split(separator); + + for (String token : tokens) { + acumulator = acumulator + " " + token; +// System.out.println("added token: " + tokens[i] + " new text: " + acumulator); + } + + index = sourceText.indexOf(separator); + } + + return acumulator; + } + // END OXYGEN PATCH EXM-20414 public void endElement(String uri, String localName, String qName) throws org.xml.sax.SAXException { // START OXYGEN PATCH, remove element from stack stack.pop(); @@ -292,31 +284,31 @@ public class SaxDocFileParser 
extends org.xml.sax.helpers.DefaultHandler { else if (shortdescBool) { shortTagCpt --; if (shortTagCpt == 0) { - String shortdesc = tempVal.toString().replace('\n', ' '); - if(shortdesc.trim().length() > 0) { - fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(shortdesc)); - } + String shortdesc = tempVal.toString().replace('\n', ' '); + if(shortdesc.trim().length() > 0) { + fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(shortdesc)); + } tempVal = null; shortdescBool = false; } } - + if(qName.equalsIgnoreCase("div") && addContent){ divCount--; if (divCount == 0) { addContent = false; } - } + } } - + public void processingInstruction(String target, String data) throws org.xml.sax.SAXException { //do nothing - + } - - /*public InputSource resolveEntity(String publicId, String systemId) + + /*public InputSource resolveEntity(String publicId, String systemId) throws IOException, SAXException { - + // use the catalog to solve the doctype System.out.println("entities " + publicId + systemId); return null; @@ -325,13 +317,13 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { throws org.xml.sax.SAXException, IOException { //System.out.println("Entities " + publicId + "and" + systemId); // use dita ot (dost.jar) for resolving dtd paths using the calatog - + return null; } /** - * Removes the validation in html files, such as xml version and DTDs - * @param file + * Removes the validation in html files, such as xml version and DTDs + * @param file the html file * @return int: returns 0 if no IOException occurs, else 1. 
*/ public String RemoveValidationPI (File file) { @@ -348,36 +340,35 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { int i1, i2; boolean ok = true; try { - + String line = br.readLine(); - - + if (line == null) { break; } //ok = line.matches("(.)*\\x26nbsp\\x3B(.)*"); - + line = line.replaceAll("\\x26nbsp\\x3B", " "); - + if (!line.contains("", i1); while (i2 < 0) { - + line = line.concat(br.readLine()); i2 = line.indexOf(">", i1); } String temp = line.substring(i1, i2); - + //ok = line.matches("(.)*\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E(.)*"); if (line.contains(" cleanUpStrings) { super(); cleanUpList = cleanUpStrings; } /** * Constructor - */ + * @param cleanUpStrings + * @param cleanUpChars + */ public SaxHTMLIndex (ArrayList cleanUpStrings, ArrayList cleanUpChars) { super(); cleanUpList = cleanUpStrings; @@ -108,7 +111,8 @@ public class SaxHTMLIndex extends SaxDocFileParser{ /** * Initializer - */ + * @param tempMap + */ public int init(Map tempMap){ tempDico = tempMap; return 0; @@ -142,8 +146,8 @@ public class SaxHTMLIndex extends SaxDocFileParser{ //Do Stemming for words in items //TODO currently, stemming support is for english and german only. Add support for other languages as well. - // START OXYGEN PATCH - wsList = new ArrayList(); + // START OXYGEN PATCH + wsList = new ArrayList(); // START OXYGEN PATCH, create the words and scoring list // String[] tokenizedItems; // END OXYGEN PATCH @@ -151,8 +155,8 @@ public class SaxHTMLIndex extends SaxDocFileParser{ || indexerLanguage.equalsIgnoreCase("ko")){ LinkedList tokens = new LinkedList(); try{ - //EXM-21501 Oxygen patch, replace the extra "@@@"s. - str = str.replaceAll("@@@([^\\s]*)@@@", ""); + //EXM-21501 Oxygen patch, replace the extra "@@@"s. 
+ str = str.replaceAll("@@@([^\\s]*)@@@", ""); CJKAnalyzer analyzer = new CJKAnalyzer(org.apache.lucene.util.Version.LUCENE_30); Reader reader = new StringReader(str); TokenStream stream = analyzer.tokenStream("", reader); @@ -162,23 +166,23 @@ public class SaxHTMLIndex extends SaxDocFileParser{ while (stream.incrementToken()) { String term = termAtt.term(); tokens.add(term); - WordAndScoring ws = new WordAndScoring(term, term, 1); - boolean found = false; - for (int i = 0; i < wsList.size(); i++) { - // If the stem of the current word is already in list, - // do not add the word in the list, just recompute scoring - if (wsList.get(i).getStem().equals(ws.getStem())) { - found = true; - int scoring = wsList.get(i).getScoring(); - wsList.get(i).setScoring(scoring + ws.getScoring()); - break; + WordAndScoring ws = new WordAndScoring(term, term, 1); + boolean found = false; + for (WordAndScoring aWsList : wsList) { + // If the stem of the current word is already in list, + // do not add the word in the list, just recompute scoring + if (aWsList.getStem().equals(ws.getStem())) { + found = true; + int scoring = aWsList.getScoring(); + aWsList.setScoring(scoring + ws.getScoring()); + break; + } + + } + if (!found) { + wsList.add(ws); + } } - - } - if (!found) { - wsList.add(ws); - } - } // START OXYGEN PATCH //tokenizedItems = tokens.toArray(new String[tokens.size()]); // END OXYGEN PATCH @@ -199,7 +203,7 @@ public class SaxHTMLIndex extends SaxDocFileParser{ } else if (indexerLanguage.equalsIgnoreCase("fr")){ stemmer= new FrenchStemmer(); } else { - stemmer = null;//Languages which stemming is not yet supproted.So, No stemmers will be used. + stemmer = null;//Languages for which stemming is not yet supported. So, no stemmers will be used. 
} // START OXYGEN PATCH wsList = new ArrayList(); @@ -210,16 +214,16 @@ public class SaxHTMLIndex extends SaxDocFileParser{ WordAndScoring ws = getWordAndScoring(token, stemmer, stem); if (ws != null) { boolean found = false; - for (int i = 0; i < wsList.size(); i++) { - // If the stem of the current word is already in list, - // do not add the word in the list, just recompute scoring - if (wsList.get(i).getStem().equals(ws.getStem())) { - found = true; - int scoring = wsList.get(i).getScoring(); - wsList.get(i).setScoring(scoring + ws.getScoring()); - break; - } - } + for (WordAndScoring aWsList : wsList) { + // If the stem of the current word is already in list, + // do not add the word in the list, just recompute scoring + if (aWsList.getStem().equals(ws.getStem())) { + found = true; + int scoring = aWsList.getScoring(); + aWsList.setScoring(scoring + ws.getScoring()); + break; + } + } if (!found) { wsList.add(ws); } @@ -256,10 +260,11 @@ public class SaxHTMLIndex extends SaxDocFileParser{ ; //System.out.println("temp="+s+"="+temp); tempDico.put(s.getStem(), temp); - }else { - String temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring())); - tempDico.put(s.getStem(), temp); - } + }else if (s != null) { + String temp = null; + temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring())); + tempDico.put(s.getStem(), temp); + } // END OXYGEN PATCH } @@ -301,10 +306,10 @@ public class SaxHTMLIndex extends SaxDocFileParser{ scoring = SCORING_FOR_ITALIC; } else if ("strong".equalsIgnoreCase(elementName)) { scoring = SCORING_FOR_BOLD; - } else if ("meta_keywords".equalsIgnoreCase(elementName)) { - scoring = SCORING_FOR_KEYWORD; - } else if ("meta_indexterms".equalsIgnoreCase(elementName)) { - scoring = SCORING_FOR_INDEXTERM; + } else if ("meta_keywords".equalsIgnoreCase(elementName)) { + scoring = SCORING_FOR_KEYWORD; + } else if ("meta_indexterms".equalsIgnoreCase(elementName)) { + scoring = SCORING_FOR_INDEXTERM; } // Get 
the stemmed word String stemWord = word; @@ -363,17 +368,15 @@ public class SaxHTMLIndex extends SaxDocFileParser{ }else { // Clean-up using the props files tempStrBuf.append("\\ba\\b"); - Iterator it = cleanUpList.iterator(); - while (it.hasNext()){ - tempStrBuf.append("|\\b").append(it.next()).append("\\b"); - } + for (String aCleanUp : cleanUpList) { + tempStrBuf.append("|\\b").append(aCleanUp ).append("\\b"); + } } if ((cleanUpPunctuation != null) && (!cleanUpPunctuation.isEmpty())){ tempCharBuf.append("\\u3002"); - Iterator it = cleanUpPunctuation.iterator(); - while (it.hasNext()){ - tempCharBuf.append("|"+it.next()); - } + for (String aCleanUpPunctuation : cleanUpPunctuation) { + tempCharBuf.append("|").append(aCleanUpPunctuation); + } } str = minimalClean(str, tempStrBuf, tempCharBuf); diff --git a/xsl-webhelpindexer/src/com/nexwave/nsidita/BlankRemover.java b/xsl-webhelpindexer/src/com/nexwave/nsidita/BlankRemover.java index 44f67041e..5c487e9f3 100755 --- a/xsl-webhelpindexer/src/com/nexwave/nsidita/BlankRemover.java +++ b/xsl-webhelpindexer/src/com/nexwave/nsidita/BlankRemover.java @@ -16,12 +16,12 @@ public class BlankRemover return (source==null)? null : source.replaceAll("[\\s\u00A0]+$", ""); } - /* replace multiple whitespaces between words with single blank */ + /* replace multiple whitespace between words with single blank */ public static String itrim(String source) { return (source==null)? null : source.replaceAll("\\b[\\s\u00A0]{2,}\\b", " "); } - /* remove all superfluous whitespaces in source string */ + /* remove all superfluous whitespace in source string */ public static String rmWhiteSpace(String source) { //System.out.println("Trimmed: '" + itrim(ltrim(rtrim(source))) + "'"); return (source==null)? 
null : itrim(ltrim(rtrim(source))); diff --git a/xsl-webhelpindexer/src/com/nexwave/nsidita/DirList.java b/xsl-webhelpindexer/src/com/nexwave/nsidita/DirList.java index 21538404c..e24cda8e0 100755 --- a/xsl-webhelpindexer/src/com/nexwave/nsidita/DirList.java +++ b/xsl-webhelpindexer/src/com/nexwave/nsidita/DirList.java @@ -13,34 +13,34 @@ public class DirList { String [] topicFiles = null; public static final int MAX_DEPTH = 10; - public DirList(File inputdir, String regex, int depth) { + public DirList(File inputDir, String regexp, int depth) { try { listFiles = new ArrayList (); // not yet implemented - if(regex == null) { - for (File f: inputdir.listFiles()) { + if(regexp == null) { + for (File f: inputDir.listFiles()) { if (!f.isDirectory()){ listFiles.add(f); }else { if (depth < MAX_DEPTH ) { - DirList nsiDoc = new DirList(f,regex,depth+1); + DirList nsiDoc = new DirList(f,regexp,depth+1); listFiles.addAll(new ArrayList(nsiDoc.getListFiles())); } } } } else { - for (File f: inputdir.listFiles(new DirFilter(regex))) { + for (File f: inputDir.listFiles(new DirFilter(regexp))) { listFiles.add(f); } // Patch from Oxygen to address problem where directories // containing . were not traversed. - for (File f: inputdir.listFiles(new DirFilter(".*"))) { + for (File f: inputDir.listFiles(new DirFilter(".*"))) { if (f.isDirectory()){ if (depth < MAX_DEPTH ) { - DirList nsiDoc = new DirList(f,regex, depth+1); + DirList nsiDoc = new DirList(f,regexp, depth+1); listFiles.addAll(new ArrayList(nsiDoc.getListFiles())); } } -- 2.40.0