From 41e5d320aa6ee39711bca3983c59a58da0e76112 Mon Sep 17 00:00:00 2001 From: Kasun Gajasinghe Date: Fri, 9 Sep 2011 18:48:56 +0000 Subject: [PATCH] Proper formats for java files before start implementing the new features --- .../nexwave/nquindexer/IndexerConstants.java | 9 +- .../com/nexwave/nquindexer/IndexerMain.java | 114 ++-- .../nexwave/nquindexer/SaxDocFileParser.java | 646 +++++++++--------- .../com/nexwave/nquindexer/SaxHTMLIndex.java | 464 +++++++------ .../com/nexwave/nquindexer/TesterIndexer.java | 10 +- .../com/nexwave/nquindexer/WriteJSFiles.java | 103 +-- .../src/com/nexwave/nsidita/BlankRemover.java | 19 +- .../src/com/nexwave/nsidita/DirList.java | 177 ++--- .../src/com/nexwave/nsidita/DocFileInfo.java | 105 +-- 9 files changed, 838 insertions(+), 809 deletions(-) diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerConstants.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerConstants.java index 407c5b585..aeb736bd0 100755 --- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerConstants.java +++ b/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerConstants.java @@ -2,15 +2,14 @@ package com.nexwave.nquindexer; /** * Constants used for the indexer. - * - * @version 2.0 2008-02-26 - * + * * @author N. Quaine + * @version 2.0 2008-02-26 */ public abstract class IndexerConstants { // European punctuation - public static final String EUPUNCTUATION1 = "[$|%,;'()\\/*\"{}=!&+<>#\\?]|\\[|\\]|[-][-]+"; - public static final String EUPUNCTUATION2 = "[$,;'()\\/*\"{}=!&+<>\\\\]"; + public static final String EUPUNCTUATION1 = "[$|%,;'()\\/*\"{}=!&+<>#\\?]|\\[|\\]|[-][-]+"; + public static final String EUPUNCTUATION2 = "[$,;'()\\/*\"{}=!&+<>\\\\]"; // Japanese punctuation public static final String JPPUNCTUATION1 = "\\u3000|\\u3001|\\u3002|\\u3003|\\u3008|\\u3009|\\u300C|\\u300D"; public static final String JPPUNCTUATION2 = "\\u3013|\\u3014|\\u3015|\\u301C|\\u301D|\\u301E|\\u301F"; diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java index 63cbe9c93..0f16a865d 100644 --- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java +++ b/xsl-webhelpindexer/src/com/nexwave/nquindexer/IndexerMain.java @@ -10,8 +10,10 @@ import java.util.*; /** * Main class of Stand-alone version of WebHelpIndexer + * * User: Kasun Gajasinghe, University of Moratuwa, http://kasunbg.blogspot.com * Date: Feb 10, 2011 + * * @author Kasun Gajasinghe */ @@ -26,8 +28,8 @@ public class IndexerMain { private String txt_no_words_gathered = "No words have been indexed in"; private String txt_no_html_files = "No HTML Files found in"; private String txt_no_args = "No argument given: you must provide an htmlDir to the IndexerMain"; - - private static String txt_no_lang_specified ="Language of the content is not specified. Defaults to English."; + + private static String txt_no_lang_specified = "Language of the content is not specified. Defaults to English."; //working directories private String searchdir = "search"; @@ -51,13 +53,14 @@ public class IndexerMain { //Html extension private String htmlExtension = "html"; - // OXYGEN PATCH START - //Table of contents file name - private String tocfile; - private boolean stem; - // OXYGEN PATCH END + // OXYGEN PATCH START + //Table of contents file name + private String tocfile; + private boolean stem; + // OXYGEN PATCH END // Constructors + public IndexerMain(String htmlDir, String indexerLanguage) { super(); setHtmlDir(htmlDir); @@ -65,7 +68,8 @@ public class IndexerMain { } /** - * The content language defaults to English "en" + * The content language defaults to English "en" + * * @param htmlDir The directory where html files reside. */ public IndexerMain(String htmlDir) { @@ -142,13 +146,13 @@ public class IndexerMain { indexer = new IndexerMain(args[0]); } else if (args.length >= 2) { - indexer = new IndexerMain(args[0], args[1]); - } else { - throw new RuntimeException("Please specify the parameters htmlDirectory and " + - "indexerLanguage (optional). \n "+ + indexer = new IndexerMain(args[0], args[1]); + } else { + throw new RuntimeException("Please specify the parameters htmlDirectory and " + + "indexerLanguage (optional). \n " + "ex: java -jar webhelpindexer.jar docs/content en \n" + "The program will exit now." - ); + ); } indexer.execute(); @@ -242,15 +246,15 @@ public class IndexerMain { // Get the list of all html files with relative paths htmlFilesPathRel = nsiDoc.getListFilesRelTo(projectDir); - // OXYGEN PATCH START. - // Remove the table of content file - Iterator iterator = htmlFilesPathRel.iterator(); - while (iterator.hasNext()) { - if (iterator.next().endsWith(tocfile + "." + htmlExtension)) { - iterator.remove(); - } - } - // OXYGEN PATCH END + // OXYGEN PATCH START. + // Remove the table of content file + Iterator iterator = htmlFilesPathRel.iterator(); + while (iterator.hasNext()) { + if (iterator.next().endsWith(tocfile + "." + htmlExtension)) { + iterator.remove(); + } + } + // OXYGEN PATCH END if (htmlFiles == null) { System.out.println(txt_no_files_found); return; @@ -260,7 +264,7 @@ public class IndexerMain { } // Create the list of the existing html files (index starts at 0) - WriteJSFiles.WriteHTMLList(outputDir.concat(File.separator).concat(htmlList), htmlFilesPathRel, stem); + WriteJSFiles.WriteHTMLList(outputDir.concat(File.separator).concat(htmlList), htmlFilesPathRel, stem); // Parse each html file to retrieve the words: // ------------------------------------------ @@ -284,34 +288,34 @@ public class IndexerMain { // parse each html files while (it.hasNext()) { File ftemp = (File) it.next(); - // OXYGEN PATCH START. Remove table of content file - if (!ftemp.getAbsolutePath().endsWith(tocfile + "." + htmlExtension)) { - // OXYGEN PATCH END - //tempMap.put(key, value); - //The HTML file information are added in the list of FileInfoObject - DocFileInfo docFileInfoTemp = new DocFileInfo(spe.runExtractData(ftemp,indexerLanguage, stem)); - - ftemp = docFileInfoTemp.getFullpath(); - String stemp = ftemp.toString(); - int i = stemp.indexOf(projectDir); - if (i != 0) { - System.out.println("the documentation root does not match with the documentation input!"); - return; + // OXYGEN PATCH START. Remove table of content file + if (!ftemp.getAbsolutePath().endsWith(tocfile + "." + htmlExtension)) { + // OXYGEN PATCH END + //tempMap.put(key, value); + //The HTML file information are added in the list of FileInfoObject + DocFileInfo docFileInfoTemp = new DocFileInfo(spe.runExtractData(ftemp, indexerLanguage, stem)); + + ftemp = docFileInfoTemp.getFullpath(); + String stemp = ftemp.toString(); + int i = stemp.indexOf(projectDir); + if (i != 0) { + System.out.println("the documentation root does not match with the documentation input!"); + return; + } + int ad = 1; + if (stemp.equals(projectDir)) ad = 0; + stemp = stemp.substring(i + projectDir.length() + ad); //i is redundant (i==0 always) + ftemp = new File(stemp); + docFileInfoTemp.setFullpath(ftemp); + + filesDescription.add(docFileInfoTemp); + // OXYGEN PATCH START + // Remove the table of content file + } else { + it.remove(); } - int ad = 1; - if (stemp.equals(projectDir)) ad = 0; - stemp = stemp.substring(i + projectDir.length() + ad); //i is redundant (i==0 always) - ftemp = new File(stemp); - docFileInfoTemp.setFullpath(ftemp); - - filesDescription.add(docFileInfoTemp); - // OXYGEN PATCH START - // Remove the table of content file - } else { - it.remove(); + // OXYGEN PATCH END } - // OXYGEN PATCH END - } /*remove empty strings from the map*/ if (tempDico.containsKey("")) { tempDico.remove(""); @@ -414,14 +418,16 @@ public class IndexerMain { return 0; } - // OXYGEN PATCH START - // Set table of content file + // OXYGEN PATCH START + // Set table of content file + public void setTocfile(String tocfile) { - this.tocfile = tocfile; + this.tocfile = tocfile; } // If true then generate js files with stemming words + public void setStem(boolean stem) { - this.stem = stem; + this.stem = stem; } - // OXYGEN PATCH END + // OXYGEN PATCH END } diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java index a24cc1855..a415d268e 100755 --- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java +++ b/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxDocFileParser.java @@ -14,164 +14,166 @@ import org.xml.sax.SAXParseException; /** * Generic parser for populating a DocFileInfo object. * - * @version 2.0 2010-08-14 - * * @author N. Quaine * @author Kasun Gajasinghe + * @version 2.0 2010-08-14 */ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { - //members - protected DocFileInfo fileDesc = null; - protected String projectDir = null; - protected StringBuffer strbf = null; - private String currentElName = ""; - private StringBuffer tempVal = null; - private boolean shortdescBool = false; - private int shortTagCpt = 0; - - // OXYGEN PATCH. Keep the stack of elements - Stack stack = new Stack(); - //methods - /** - * Constructor - */ - public SaxDocFileParser () { - - } - - /** - * Initializer - */ - public int init(String inputDir){ - return 0; - } - - /** - * Parses the file to extract all the words for indexing and - * some data characterizing the file. - * @param file contains the fullpath of the document to parse - * @return a DitaFileInfo object filled with data describing the file - */ - public DocFileInfo runExtractData(File file) { - //initialization - fileDesc = new DocFileInfo(file); - strbf = new StringBuffer(""); - - // Fill strbf by parsing the file - parseDocument(file); - - return fileDesc; - } - - public void parseDocument (File file) { + //members + protected DocFileInfo fileDesc = null; + protected String projectDir = null; + protected StringBuffer strbf = null; + private String currentElName = ""; + private StringBuffer tempVal = null; + private boolean shortdescBool = false; + private int shortTagCpt = 0; + + // OXYGEN PATCH. Keep the stack of elements + Stack stack = new Stack(); + //methods + + /** + * Constructor + */ + public SaxDocFileParser() { + + } + + /** + * Initializer + */ + public int init(String inputDir) { + return 0; + } + + /** + * Parses the file to extract all the words for indexing and + * some data characterizing the file. + * + * @param file contains the fullpath of the document to parse + * @return a DitaFileInfo object filled with data describing the file + */ + public DocFileInfo runExtractData(File file) { + //initialization + fileDesc = new DocFileInfo(file); + strbf = new StringBuffer(""); + + // Fill strbf by parsing the file + parseDocument(file); + + return fileDesc; + } + + public void parseDocument(File file) { // System.out.println(System.getProperty("org.xml.sax.driver")); // System.out.println(System.getProperty("javax.xml.parsers.SAXParserFactory")); - //get a factory - javax.xml.parsers.SAXParserFactory spf = javax.xml.parsers.SAXParserFactory.newInstance(); + //get a factory + javax.xml.parsers.SAXParserFactory spf = javax.xml.parsers.SAXParserFactory.newInstance(); - spf.setValidating(false); + spf.setValidating(false); addContent = false; - divCount = 0; - try { - //get a new instance of parser - javax.xml.parsers.SAXParser sp = spf.newSAXParser(); - // deactivate the validation - sp.getXMLReader().setFeature("http://xml.org/sax/features/external-general-entities", false); - sp.getXMLReader().setFeature( "http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + divCount = 0; + try { + //get a new instance of parser + javax.xml.parsers.SAXParser sp = spf.newSAXParser(); + // deactivate the validation + sp.getXMLReader().setFeature("http://xml.org/sax/features/external-general-entities", false); + sp.getXMLReader().setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); //parse the file and also register this class for call backs - //System.out.println("Parsing: " + file); + //System.out.println("Parsing: " + file); - long start = System.currentTimeMillis(); - //System.out.println("about to parse " + file.getName() + " >>> " + start); + long start = System.currentTimeMillis(); + //System.out.println("about to parse " + file.getName() + " >>> " + start); - String content = RemoveValidationPI (file); - if (content != null){ - InputSource is = new InputSource(new StringReader(content)); - is.setSystemId(file.toURI().toURL().toString()); - sp.parse(is, this); - } + String content = RemoveValidationPI(file); + if (content != null) { + InputSource is = new InputSource(new StringReader(content)); + is.setSystemId(file.toURI().toURL().toString()); + sp.parse(is, this); + } - long finish = System.currentTimeMillis(); - //System.out.println("done parsing " + file.getName() + " >>> " + finish); - //System.out.println("time = " + (finish - start) + " milliseconds"); + long finish = System.currentTimeMillis(); + //System.out.println("done parsing " + file.getName() + " >>> " + finish); + //System.out.println("time = " + (finish - start) + " milliseconds"); - }catch(SAXParseException spe){ + } catch (SAXParseException spe) { System.out.println("SaxParseException: The indexing file contains incorrect xml syntax."); spe.printStackTrace(); - }catch(org.xml.sax.SAXException se) { - System.out.println("SaxException. You may need to include Xerces in your classpath. " + + } catch (org.xml.sax.SAXException se) { + System.out.println("SaxException. You may need to include Xerces in your classpath. " + "See documentation for details"); - se.printStackTrace(); - }catch(javax.xml.parsers.ParserConfigurationException pce) { - pce.printStackTrace(); - }catch (IOException ie) { - ie.printStackTrace(); - } - } + se.printStackTrace(); + } catch (javax.xml.parsers.ParserConfigurationException pce) { + pce.printStackTrace(); + } catch (IOException ie) { + ie.printStackTrace(); + } + } private boolean addContent = false; private boolean addHeaderInfo = false; - private boolean doNotIndex=false; + private boolean doNotIndex = false; private int divCount = 0; - //SAX parser Event Handlers: - public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) throws org.xml.sax.SAXException { + //SAX parser Event Handlers: + + public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) throws org.xml.sax.SAXException { - //dwc: capture current element name - // START OXYGEN PATCH, add current element in stack - stack.add(qName); - // END OXYGEN PATCH - currentElName = qName; + //dwc: capture current element name + // START OXYGEN PATCH, add current element in stack + stack.add(qName); + // END OXYGEN PATCH + currentElName = qName; - // dwc: Adding contents of some meta tags to the index - if((qName.equalsIgnoreCase("meta")) ) { + // dwc: Adding contents of some meta tags to the index + if ((qName.equalsIgnoreCase("meta"))) { addHeaderInfo = true; - String attrName = attributes.getValue("name"); - // OXYGEN PATCH START EXM-20576 - add scoring for keywords - if(attrName != null && (attrName.equalsIgnoreCase("keywords") - || attrName.equalsIgnoreCase("description") - || attrName.equalsIgnoreCase("indexterms") - )){ - if (attrName.equalsIgnoreCase("keywords")) { - String[] keywords = attributes.getValue("content").split(", "); + String attrName = attributes.getValue("name"); + // OXYGEN PATCH START EXM-20576 - add scoring for keywords + if (attrName != null && (attrName.equalsIgnoreCase("keywords") + || attrName.equalsIgnoreCase("description") + || attrName.equalsIgnoreCase("indexterms") + )) { + if (attrName.equalsIgnoreCase("keywords")) { + String[] keywords = attributes.getValue("content").split(", "); for (String keyword : keywords) { strbf.append(" ").append(keyword).append("@@@elem_meta_keywords@@@ "); } - } else if (attrName.equalsIgnoreCase("indexterms")) { - String[] indexterms = attributes.getValue("content").split(", "); + } else if (attrName.equalsIgnoreCase("indexterms")) { + String[] indexterms = attributes.getValue("content").split(", "); for (String indexterm : indexterms) { strbf.append(" ").append(indexterm).append("@@@elem_meta_indexterms@@@ "); } - } else { - strbf.append(" ").append(attributes.getValue("content") ).append(" "); - } - } - // OXYGEN PATCH END EXM-20576 - add scoring for indexterms - // dwc: adding this to make the docbook element - // (which becomes in html) - // into the brief description that shows up in search - // results. - if(attrName != null && (attrName.equalsIgnoreCase("description"))){ - fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(attributes.getValue("content").replace('\n', ' '))); - } - } // dwc: End addition + } else { + strbf.append(" ").append(attributes.getValue("content")).append(" "); + } + } + // OXYGEN PATCH END EXM-20576 - add scoring for indexterms + // dwc: adding this to make the docbook element + // (which becomes in html) + // into the brief description that shows up in search + // results. + if (attrName != null && (attrName.equalsIgnoreCase("description"))) { + fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(attributes.getValue("content").replace('\n', ' '))); + } + } // dwc: End addition // dwc: commenting out DITA specific lines - if((qName.equalsIgnoreCase("title")) || (qName.equalsIgnoreCase("shortdesc"))) { - tempVal = new StringBuffer(); - } + if ((qName.equalsIgnoreCase("title")) || (qName.equalsIgnoreCase("shortdesc"))) { + tempVal = new StringBuffer(); + } addHeaderInfo = qName.equalsIgnoreCase("meta") || qName.equalsIgnoreCase("title") || qName.equalsIgnoreCase("shortdesc"); String elementId = attributes.getValue("id"); - if("content".equals(elementId)) addContent = true; + if ("content".equals(elementId)) addContent = true; - if(addContent) { + if (addContent) { //counts div tags starting from "content" div(inclusive). This will be used to track the end of content "div" tag. //see #endElement() - if(qName.equalsIgnoreCase("div")){ + if (qName.equalsIgnoreCase("div")) { divCount++; } @@ -191,74 +193,78 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { String accessKey = attributes.getValue("accesskey"); doNotIndex = accessKey != null && ("n".equals(accessKey) || "p".equals(accessKey) || "h".equals(accessKey)); } - strbf.append(" "); - } + strbf.append(" "); + } - //triggers when there's character data inside an element. - public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException { + //triggers when there's character data inside an element. - // index certain elements. E.g. Use this to implement a - // "titles only" index, + public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException { + + // index certain elements. E.g. Use this to implement a + // "titles only" index, //OXYGEN PATCH, gather more keywords. - if( + if ( // (addContent || addHeaderInfo) && - !doNotIndex && !currentElName.equalsIgnoreCase("script")){ - String text = new String(ch,start,length); - // START OXYGEN PATCH, append a marker after each word - // The marker is used to compute the scoring - // Create the marker - String originalText = text.replaceAll("\\s+"," "); - text = text.trim(); - // Do a minimal clean - text = minimalClean(text, null, null); - text = text.replaceAll("\\s+"," "); - String marker = "@@@elem_" + stack.peek() + "@@@ "; - Matcher m = Pattern.compile("(\\w|-|:)+").matcher(text); - if (text.trim().length() > 0 && m.find()) { - String copyText = new String(originalText); - text = duplicateWords(copyText, text, "-"); - copyText = new String(originalText); - text = duplicateWords(copyText, text, ":"); - copyText = new String(originalText); - text = duplicateWords(copyText, text, "."); - // Replace whitespace with the marker - text = text.replace(" ", marker); - text = text + marker; - } - // END OXYGEN PATCH - strbf.append(text); + !doNotIndex && !currentElName.equalsIgnoreCase("script")) { + String text = new String(ch, start, length); + // START OXYGEN PATCH, append a marker after each word + // The marker is used to compute the scoring + // Create the marker + String originalText = text.replaceAll("\\s+", " "); + text = text.trim(); + // Do a minimal clean + text = minimalClean(text, null, null); + text = text.replaceAll("\\s+", " "); + String marker = "@@@elem_" + stack.peek() + "@@@ "; + Matcher m = Pattern.compile("(\\w|-|:)+").matcher(text); + if (text.trim().length() > 0 && m.find()) { + String copyText = new String(originalText); + text = duplicateWords(copyText, text, "-"); + copyText = new String(originalText); + text = duplicateWords(copyText, text, ":"); + copyText = new String(originalText); + text = duplicateWords(copyText, text, "."); + // Replace whitespace with the marker + text = text.replace(" ", marker); + text = text + marker; + } + // END OXYGEN PATCH + strbf.append(text); // System.out.println("=== marked text: " + text); - // START OXYGEN PATCH, append the original text - if (tempVal != null) { tempVal.append(originalText);} - // END OXYGEN PATCH - } - } - - // START OXYGEN PATCH EXM-20414 - private String duplicateWords(String sourceText, String acumulator, String separator) { + // START OXYGEN PATCH, append the original text + if (tempVal != null) { + tempVal.append(originalText); + } + // END OXYGEN PATCH + } + } + + // START OXYGEN PATCH EXM-20414 + + private String duplicateWords(String sourceText, String acumulator, String separator) { // System.out.println("sourceText: " + sourceText + " separator: " + separator); - int index = sourceText.indexOf(separator); - while (index >= 0) { - int indexSpaceAfter = sourceText.indexOf(" ", index); - String substring = null; - if (indexSpaceAfter >= 0) { - substring = sourceText.substring(0, indexSpaceAfter); - sourceText = sourceText.substring(indexSpaceAfter); - } else { - substring = sourceText; - sourceText = ""; - } - - int indexSpaceBefore = substring.lastIndexOf(" "); - if (indexSpaceBefore >= 0) { - substring = substring.substring(indexSpaceBefore + 1); - } - if (separator.indexOf(".") >= 0) { - separator = separator.replaceAll("\\.", "\\\\."); + int index = sourceText.indexOf(separator); + while (index >= 0) { + int indexSpaceAfter = sourceText.indexOf(" ", index); + String substring = null; + if (indexSpaceAfter >= 0) { + substring = sourceText.substring(0, indexSpaceAfter); + sourceText = sourceText.substring(indexSpaceAfter); + } else { + substring = sourceText; + sourceText = ""; + } + + int indexSpaceBefore = substring.lastIndexOf(" "); + if (indexSpaceBefore >= 0) { + substring = substring.substring(indexSpaceBefore + 1); + } + if (separator.indexOf(".") >= 0) { + separator = separator.replaceAll("\\.", "\\\\."); // System.out.println("++++++++++ separator: " + separator); - } - String[] tokens = substring.split(separator); + } + String[] tokens = substring.split(separator); for (String token : tokens) { acumulator = acumulator + " " + token; @@ -266,169 +272,167 @@ public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler { } index = sourceText.indexOf(separator); - } - - return acumulator; - } - // END OXYGEN PATCH EXM-20414 - public void endElement(String uri, String localName, String qName) throws org.xml.sax.SAXException { - // START OXYGEN PATCH, remove element from stack - stack.pop(); - // END OXYGEN PATCH - if(qName.equalsIgnoreCase("title")) { - //add it to the list - //myEmpls.add(tempEmp); - fileDesc.setTitle(BlankRemover.rmWhiteSpace(tempVal.toString())); - tempVal = null; - } - else if (shortdescBool) { - shortTagCpt --; - if (shortTagCpt == 0) { - String shortdesc = tempVal.toString().replace('\n', ' '); - if(shortdesc.trim().length() > 0) { - fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(shortdesc)); - } - tempVal = null; - shortdescBool = false; - } - } - - if(qName.equalsIgnoreCase("div") && addContent){ + } + + return acumulator; + } + // END OXYGEN PATCH EXM-20414 + + public void endElement(String uri, String localName, String qName) throws org.xml.sax.SAXException { + // START OXYGEN PATCH, remove element from stack + stack.pop(); + // END OXYGEN PATCH + if (qName.equalsIgnoreCase("title")) { + //add it to the list + //myEmpls.add(tempEmp); + fileDesc.setTitle(BlankRemover.rmWhiteSpace(tempVal.toString())); + tempVal = null; + } else if (shortdescBool) { + shortTagCpt--; + if (shortTagCpt == 0) { + String shortdesc = tempVal.toString().replace('\n', ' '); + if (shortdesc.trim().length() > 0) { + fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(shortdesc)); + } + tempVal = null; + shortdescBool = false; + } + } + + if (qName.equalsIgnoreCase("div") && addContent) { divCount--; if (divCount == 0) { addContent = false; } } - } + } + + public void processingInstruction(String target, String data) throws org.xml.sax.SAXException { + //do nothing - public void processingInstruction(String target, String data) throws org.xml.sax.SAXException { - //do nothing + } - } + /*public InputSource resolveEntity(String publicId, String systemId) + throws IOException, SAXException { - /*public InputSource resolveEntity(String publicId, String systemId) - throws IOException, SAXException { + // use the catalog to solve the doctype + System.out.println("entities " + publicId + systemId); + return null; + }*/ - // use the catalog to solve the doctype - System.out.println("entities " + publicId + systemId); - return null; - }*/ - public org.xml.sax.InputSource resolveEntity(String publicId, String systemId) - throws org.xml.sax.SAXException, IOException { - //System.out.println("Entities " + publicId + "and" + systemId); - // use dita ot (dost.jar) for resolving dtd paths using the calatog + public org.xml.sax.InputSource resolveEntity(String publicId, String systemId) + throws org.xml.sax.SAXException, IOException { + //System.out.println("Entities " + publicId + "and" + systemId); + // use dita ot (dost.jar) for resolving dtd paths using the calatog - return null; - } + return null; + } /** * Removes the validation in html files, such as xml version and DTDs + * * @param file the html file * @return int: returns 0 if no IOException occurs, else 1. */ - public String RemoveValidationPI (File file) { + public String RemoveValidationPI(File file) { StringBuilder sb = new StringBuilder(); - //The content that needs to be indexed after removing validation will be written to sb. This StringBuilder will - // be the source to index the content of the particular html page. - try { - BufferedReader br = new BufferedReader( - new InputStreamReader( - new FileInputStream(file),"UTF-8")); - - while(true) - { - int i1, i2; - boolean ok = true; - try { - - String line = br.readLine(); - - if (line == null) { - break; - } - //ok = line.matches("(.)*\\x26nbsp\\x3B(.)*"); - - line = line.replaceAll("\\x26nbsp\\x3B", " "); - - if (!line.contains("", i1); - while (i2 < 0) { - - line = line.concat(br.readLine()); - i2 = line.indexOf(">", i1); - } - String temp = line.substring(i1, i2); - - //ok = line.matches("(.)*\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E(.)*"); - if (line.contains("", i1); + while (i2 < 0) { + + line = line.concat(br.readLine()); + i2 = line.indexOf(">", i1); + } + String temp = line.substring(i1, i2); + + //ok = line.matches("(.)*\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E(.)*"); + if (line.contains("", " "); - str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " "); - str = str.replaceAll(IndexerConstants.EUPUNCTUATION2, " "); - str = str.replaceAll(IndexerConstants.JPPUNCTUATION1, " "); - str = str.replaceAll(IndexerConstants.JPPUNCTUATION2, " "); - str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " "); - if (tempPunctuation != null && tempPunctuation.length() > 0) - { - str = str.replaceAll(tempPunctuation, " "); - } - - if (tempStrBuf != null) { - //remove useless words - str = str.replaceAll(tempStrBuf.toString(), " "); - } - - // Redo punctuation after removing some words: (TODO: useful?) - str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " "); - str = str.replaceAll(IndexerConstants.EUPUNCTUATION2, " "); - str = str.replaceAll(IndexerConstants.JPPUNCTUATION1, " "); - str = str.replaceAll(IndexerConstants.JPPUNCTUATION2, " "); - str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " "); - if (tempPunctuation != null && tempPunctuation.length() > 0) - { - str = str.replaceAll(tempPunctuation, " "); - } return str; - } - // END OXYGEN PATCH + } + } + catch (IOException e) { + break; + } + } + + br.close(); + } + catch (IOException e) { + return null; + } + + return sb.toString(); // return status + + } + + // START OXYGEN PATCH, moved from subclass + + protected String minimalClean(String str, StringBuffer tempStrBuf, StringBuffer tempCharBuf) { + String tempPunctuation = null; + if (tempCharBuf != null) { + tempPunctuation = new String(tempCharBuf); + } + + str = str.replaceAll("\\s+", " "); + str = str.replaceAll("->", " "); + str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " "); + str = str.replaceAll(IndexerConstants.EUPUNCTUATION2, " "); + str = str.replaceAll(IndexerConstants.JPPUNCTUATION1, " "); + str = str.replaceAll(IndexerConstants.JPPUNCTUATION2, " "); + str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " "); + if (tempPunctuation != null && tempPunctuation.length() > 0) { + str = str.replaceAll(tempPunctuation, " "); + } + + if (tempStrBuf != null) { + //remove useless words + str = str.replaceAll(tempStrBuf.toString(), " "); + } + + // Redo punctuation after removing some words: (TODO: useful?) + str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " "); + str = str.replaceAll(IndexerConstants.EUPUNCTUATION2, " "); + str = str.replaceAll(IndexerConstants.JPPUNCTUATION1, " "); + str = str.replaceAll(IndexerConstants.JPPUNCTUATION2, " "); + str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " "); + if (tempPunctuation != null && tempPunctuation.length() > 0) { + str = str.replaceAll(tempPunctuation, " "); + } + return str; + } + // END OXYGEN PATCH } diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java index c57bbf7cd..ba436e6d5 100755 --- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java +++ b/xsl-webhelpindexer/src/com/nexwave/nquindexer/SaxHTMLIndex.java @@ -15,6 +15,7 @@ import com.nexwave.stemmer.snowball.ext.EnglishStemmer; import com.nexwave.stemmer.snowball.ext.FrenchStemmer; import com.nexwave.stemmer.snowball.ext.GermanStemmer; +//client-side support is yet to come for these stemmers import com.nexwave.stemmer.snowball.ext.danishStemmer; import com.nexwave.stemmer.snowball.ext.dutchStemmer; import com.nexwave.stemmer.snowball.ext.finnishStemmer; @@ -43,99 +44,106 @@ import org.apache.lucene.analysis.tokenattributes.TermAttribute; * NOTE: This indexes only the content under a tag with ID "content". * Wrap html content with a div tag with id "content" to index relevant parts of your page. * - * @version 2.0 2010 - * * @author N. Quaine * @author Kasun Gajasinghe + * @version 2.0 2010 */ -public class SaxHTMLIndex extends SaxDocFileParser{ +public class SaxHTMLIndex extends SaxDocFileParser { //KasunBG: apparently tempDico stores all the keywords and a pointer to the files containing the index in a Map //example: ("keyword1", "0,2,4"), ("docbook", "1,2,5") - private Map tempDico; - private int i = 0; - private ArrayList cleanUpList = null; - private ArrayList cleanUpPunctuation = null; - - // START OXYGEN PATCH, scoring for HTML elements - private int SCORING_FOR_H1 = 50; - private int SCORING_FOR_H2 = 45; - private int SCORING_FOR_H3 = 40; - private int SCORING_FOR_H4 = 35; - private int SCORING_FOR_H5 = 30; - private int SCORING_FOR_H6 = 25; - private int SCORING_FOR_BOLD = 5; - private int SCORING_FOR_ITALIC = 3; - private int SCORING_FOR_NORMAL_TEXT = 1; - private int SCORING_FOR_KEYWORD = 100; - private int SCORING_FOR_INDEXTERM = 75; - - /** - * The list with the word and scoring object - */ - private List wsList = null; - - /** - * Used for Oxygen TestCases - * @return the wsList - */ - public List getWsList() { - return wsList; - } - // END OXYGEN PATCH - //methods - /** - * Constructor - */ - public SaxHTMLIndex () { - super(); - } - /** - * Constructor + private Map tempDico; + private int i = 0; + private ArrayList cleanUpList = null; + private ArrayList cleanUpPunctuation = null; + + // START OXYGEN PATCH, scoring for HTML elements + private int SCORING_FOR_H1 = 50; + private int SCORING_FOR_H2 = 45; + private int SCORING_FOR_H3 = 40; + private int SCORING_FOR_H4 = 35; + private int SCORING_FOR_H5 = 30; + private int SCORING_FOR_H6 = 25; + private int SCORING_FOR_BOLD = 5; + private int SCORING_FOR_ITALIC = 3; + private int SCORING_FOR_NORMAL_TEXT = 1; + private int SCORING_FOR_KEYWORD = 100; + private int SCORING_FOR_INDEXTERM = 75; + + /** + * The list with the word and scoring object + */ + private List wsList = null; + + /** + * Used for Oxygen TestCases + * + * @return the wsList + */ + public List getWsList() { + return wsList; + } + // END OXYGEN PATCH + //methods + + /** + * Constructor + */ + public SaxHTMLIndex() { + super(); + } + + /** + * Constructor + * * @param cleanUpStrings */ - public SaxHTMLIndex (ArrayList cleanUpStrings) { - super(); - cleanUpList = cleanUpStrings; - } - /** - * Constructor + public SaxHTMLIndex(ArrayList cleanUpStrings) { + super(); + cleanUpList = cleanUpStrings; + } + + /** + * Constructor + * * @param cleanUpStrings * @param cleanUpChars */ - public SaxHTMLIndex (ArrayList cleanUpStrings, ArrayList cleanUpChars) { - super(); - cleanUpList = cleanUpStrings; - cleanUpPunctuation = cleanUpChars; - } - - /** - * Initializer + public SaxHTMLIndex(ArrayList cleanUpStrings, ArrayList cleanUpChars) { + super(); + cleanUpList = cleanUpStrings; + cleanUpPunctuation = cleanUpChars; + } + + /** + * Initializer + * * @param tempMap */ - public int init(Map tempMap){ - tempDico = tempMap; - return 0; - } - - /** - * Parses the file to extract all the words for indexing and - * some data characterizing the file. - * @param file contains the fullpath of the document to parse + public int init(Map tempMap) { + tempDico = tempMap; + return 0; + } + + /** + * Parses the file to extract all the words for indexing and + * some data characterizing the file. + * + * @param file contains the fullpath of the document to parse * @param indexerLanguage this will be used to tell the program which stemmer to be used. - * @param stem if true then generate js files with words stemmed - * @return a DitaFileInfo object filled with data describing the file - */ - public DocFileInfo runExtractData(File file, String indexerLanguage, boolean stem) { - //initialization - fileDesc = new DocFileInfo(file); - strbf = new StringBuffer(""); - - // Fill strbf by parsing the file - parseDocument(file); - - String str = cleanBuffer(strbf); - str = str.replaceAll("\\s+"," "); //there's still redundant spaces in the middle + * @param stem if true then generate js files with words stemmed + * @return a DitaFileInfo object filled with data describing the file + */ + public DocFileInfo runExtractData(File file, String indexerLanguage, boolean stem) { + //initialization + fileDesc = new DocFileInfo(file); + strbf = new StringBuffer(""); + + // Fill strbf by parsing the file + parseDocument(file); + + String str = cleanBuffer(strbf); + str = str.replaceAll("\\s+", " "); //there's still redundant spaces in the middle // System.out.println(file.toString()+" "+ str +"\n"); // START OXYGEN PATCH // String[] items = str.split("\\s"); //contains all the words in the array @@ -151,12 +159,12 @@ public class SaxHTMLIndex extends SaxDocFileParser{ // START OXYGEN PATCH, create the words and scoring list // String[] tokenizedItems; // END OXYGEN PATCH - if(indexerLanguage.equalsIgnoreCase("ja") || indexerLanguage.equalsIgnoreCase("zh") - || indexerLanguage.equalsIgnoreCase("ko")){ - LinkedList tokens = new LinkedList(); - try{ - //EXM-21501 Oxygen patch, replace the extra "@@@"s. - str = str.replaceAll("@@@([^\\s]*)@@@", ""); + if (indexerLanguage.equalsIgnoreCase("ja") || indexerLanguage.equalsIgnoreCase("zh") + || indexerLanguage.equalsIgnoreCase("ko")) { + LinkedList tokens = new LinkedList(); + try { + //EXM-21501 Oxygen patch, replace the extra "@@@"s. + str = str.replaceAll("@@@([^\\s]*)@@@", ""); CJKAnalyzer analyzer = new CJKAnalyzer(org.apache.lucene.util.Version.LUCENE_30); Reader reader = new StringReader(str); TokenStream stream = analyzer.tokenStream("", reader); @@ -179,29 +187,29 @@ public class SaxHTMLIndex extends SaxDocFileParser{ } } - if (!found) { - wsList.add(ws); - } + if (!found) { + wsList.add(ws); + } } // START OXYGEN PATCH //tokenizedItems = tokens.toArray(new String[tokens.size()]); // END OXYGEN PATCH - }catch (IOException ex){ - // START OXYGEN PATCH + } catch (IOException ex) { + // START OXYGEN PATCH // tokenizedItems = items; - // END OXYGEN PATCH + // END OXYGEN PATCH System.out.println("Error tokenizing content using CJK Analyzer. IOException"); ex.printStackTrace(); } } else { SnowballStemmer stemmer; - if(indexerLanguage.equalsIgnoreCase("en")){ - stemmer = new EnglishStemmer(); - } else if (indexerLanguage.equalsIgnoreCase("de")){ - stemmer= new GermanStemmer(); - } else if (indexerLanguage.equalsIgnoreCase("fr")){ - stemmer= new FrenchStemmer(); + if (indexerLanguage.equalsIgnoreCase("en")) { + stemmer = new EnglishStemmer(); + } else if (indexerLanguage.equalsIgnoreCase("de")) { + stemmer = new GermanStemmer(); + } else if (indexerLanguage.equalsIgnoreCase("fr")) { + stemmer = new FrenchStemmer(); } else { stemmer = null;//Languages which stemming is not yet supported.So, No stemmers will be used. } @@ -210,10 +218,10 @@ public class SaxHTMLIndex extends SaxDocFileParser{ StringTokenizer st = new StringTokenizer(str, " "); // Tokenize the string and populate the words and scoring list while (st.hasMoreTokens()) { - String token = st.nextToken(); - WordAndScoring ws = getWordAndScoring(token, stemmer, stem); - if (ws != null) { - boolean found = false; + String token = st.nextToken(); + WordAndScoring ws = getWordAndScoring(token, stemmer, stem); + if (ws != null) { + boolean found = false; for (WordAndScoring aWsList : wsList) { // If the stem of the current word is already in list, // do not add the word in the list, just recompute scoring @@ -224,11 +232,11 @@ public class SaxHTMLIndex extends SaxDocFileParser{ break; } } - if (!found) { - wsList.add(ws); - } - } - } + if (!found) { + wsList.add(ws); + } + } + } // if(stemmer != null) //If a stemmer available // tokenizedItems = stemmer.doStem(items.toArray(new String[0])); // else //if no stemmer available for the particular language @@ -237,7 +245,7 @@ public class SaxHTMLIndex extends SaxDocFileParser{ } - /* for(String stemmedItem: tokenizedItems){ + /* for(String stemmedItem: tokenizedItems){ System.out.print(stemmedItem+"| "); }*/ @@ -250,140 +258,142 @@ public class SaxHTMLIndex extends SaxDocFileParser{ Iterator it = wsList.iterator(); WordAndScoring s; while (it.hasNext()) { - s = it.next(); - // Do not add results from 'toc.html' - if (s != null && tempDico.containsKey(s.getStem())) { - String temp = tempDico.get(s.getStem()); - temp = temp.concat(",").concat(Integer.toString(i)) - // Concat also the scoring for the stem - .concat("*").concat(Integer.toString(s.getScoring())) - ; - //System.out.println("temp="+s+"="+temp); - tempDico.put(s.getStem(), temp); - }else if (s != null) { - String temp = null; - temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring())); - tempDico.put(s.getStem(), temp); + s = it.next(); + // Do not add results from 'toc.html' + if (s != null && tempDico.containsKey(s.getStem())) { + String temp = tempDico.get(s.getStem()); + temp = temp.concat(",").concat(Integer.toString(i)) + // Concat also the scoring for the stem + .concat("*").concat(Integer.toString(s.getScoring())) + ; + //System.out.println("temp="+s+"="+temp); + tempDico.put(s.getStem(), temp); + } else if (s != null) { + String temp = null; + temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring())); + tempDico.put(s.getStem(), temp); } - // END OXYGEN PATCH + // END OXYGEN PATCH } i++; - return fileDesc; - } - - // START OXYGEN PATCH - /** - * Get the word, stem and scoring for the given token. - * @param token The token to parse. - * @param stemmer The stemmer. - * @param doStemming If true then generate js files with words stemmed. - * @return the word, stem and scoring for the given token. - */ - private WordAndScoring getWordAndScoring(String token, SnowballStemmer stemmer, boolean doStemming) { - WordAndScoring wordScoring = null; - if (token.indexOf("@@@") != -1 && token.indexOf("@@@") != token.lastIndexOf("@@@")) { - // Extract the word from token - String word = token.substring(0, token.indexOf("@@@")); - if (word.length() > 0) { - // Extract the element name from token - String elementName = token.substring(token.indexOf("@@@elem_") + "@@@elem_".length(), token.lastIndexOf("@@@")); - // Compute scoring - int scoring = SCORING_FOR_NORMAL_TEXT; - if ("h1".equalsIgnoreCase(elementName)) { - scoring = SCORING_FOR_H1; - } else if ("h2".equalsIgnoreCase(elementName)) { - scoring = SCORING_FOR_H2; - } else if ("h3".equalsIgnoreCase(elementName)) { - scoring = SCORING_FOR_H3; - } else if ("h4".equalsIgnoreCase(elementName)) { - scoring = SCORING_FOR_H4; - } else if ("h5".equalsIgnoreCase(elementName)) { - scoring = SCORING_FOR_H5; - } else if ("h6".equalsIgnoreCase(elementName)) { - scoring = SCORING_FOR_H6; - } else if ("em".equalsIgnoreCase(elementName)) { - scoring = SCORING_FOR_ITALIC; - } else if ("strong".equalsIgnoreCase(elementName)) { - scoring = SCORING_FOR_BOLD; - } else if ("meta_keywords".equalsIgnoreCase(elementName)) { - scoring = SCORING_FOR_KEYWORD; - } else if ("meta_indexterms".equalsIgnoreCase(elementName)) { - scoring = SCORING_FOR_INDEXTERM; - } - // Get the stemmed word - String stemWord = word; - if (stemmer != null && doStemming) { - stemWord = stemmer.doStem(word); - } - wordScoring = new WordAndScoring(word, stemWord, scoring); - } - } else { - // The token contains only the word - String stemWord = token; - // Stem the word - if (stemmer != null && doStemming) { - stemWord = stemmer.doStem(token); - } - wordScoring = new WordAndScoring(token, stemWord, SCORING_FOR_NORMAL_TEXT); - } - return wordScoring; - } - // END OXYGEN PATCH - - /** - * Cleans the string buffer containing all the text retrieved from - * the html file: remove punctuation, clean white spaces, remove the words - * which you do not want to index. - * NOTE: You may customize this function: - * This version takes into account english and japanese. Depending on your - * needs, - * you may have to add/remove some characters/words through props files - * or by modifying tte default code, - * you may want to separate the language processing (doc only in japanese, - * doc only in english, check the language metadata ...). - */ - private String cleanBuffer (StringBuffer strbf) { - String str = strbf.toString().toLowerCase(); - StringBuffer tempStrBuf = new StringBuffer(""); - StringBuffer tempCharBuf = new StringBuffer(""); - if ((cleanUpList == null) || (cleanUpList.isEmpty())){ - // Default clean-up - - // Should perhaps eliminate the words at the end of the table? - tempStrBuf.append("(?i)\\bthe\\b|\\ba\\b|\\ban\\b|\\bto\\b|\\band\\b|\\bor\\b");//(?i) ignores the case - tempStrBuf.append("|\\bis\\b|\\bare\\b|\\bin\\b|\\bwith\\b|\\bbe\\b|\\bcan\\b"); - tempStrBuf.append("|\\beach\\b|\\bhas\\b|\\bhave\\b|\\bof\\b|\\b\\xA9\\b|\\bnot\\b"); - tempStrBuf.append("|\\bfor\\b|\\bthis\\b|\\bas\\b|\\bit\\b|\\bhe\\b|\\bshe\\b"); - tempStrBuf.append("|\\byou\\b|\\bby\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b"); - tempStrBuf.append("|\\b-or-\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b"); + return fileDesc; + } + + // START OXYGEN PATCH + + /** + * Get the word, stem and scoring for the given token. + * + * @param token The token to parse. + * @param stemmer The stemmer. + * @param doStemming If true then generate js files with words stemmed. + * @return the word, stem and scoring for the given token. + */ + private WordAndScoring getWordAndScoring(String token, SnowballStemmer stemmer, boolean doStemming) { + WordAndScoring wordScoring = null; + if (token.indexOf("@@@") != -1 && token.indexOf("@@@") != token.lastIndexOf("@@@")) { + // Extract the word from token + String word = token.substring(0, token.indexOf("@@@")); + if (word.length() > 0) { + // Extract the element name from token + String elementName = token.substring(token.indexOf("@@@elem_") + "@@@elem_".length(), token.lastIndexOf("@@@")); + // Compute scoring + int scoring = SCORING_FOR_NORMAL_TEXT; + if ("h1".equalsIgnoreCase(elementName)) { + scoring = SCORING_FOR_H1; + } else if ("h2".equalsIgnoreCase(elementName)) { + scoring = SCORING_FOR_H2; + } else if ("h3".equalsIgnoreCase(elementName)) { + scoring = SCORING_FOR_H3; + } else if ("h4".equalsIgnoreCase(elementName)) { + scoring = SCORING_FOR_H4; + } else if ("h5".equalsIgnoreCase(elementName)) { + scoring = SCORING_FOR_H5; + } else if ("h6".equalsIgnoreCase(elementName)) { + scoring = SCORING_FOR_H6; + } else if ("em".equalsIgnoreCase(elementName)) { + scoring = SCORING_FOR_ITALIC; + } else if ("strong".equalsIgnoreCase(elementName)) { + scoring = SCORING_FOR_BOLD; + } else if ("meta_keywords".equalsIgnoreCase(elementName)) { + scoring = SCORING_FOR_KEYWORD; + } else if ("meta_indexterms".equalsIgnoreCase(elementName)) { + scoring = SCORING_FOR_INDEXTERM; + } + // Get the stemmed word + String stemWord = word; + if (stemmer != null && doStemming) { + stemWord = stemmer.doStem(word); + } + wordScoring = new WordAndScoring(word, stemWord, scoring); + } + } else { + // The token contains only the word + String stemWord = token; + // Stem the word + if (stemmer != null && doStemming) { + stemWord = stemmer.doStem(token); + } + wordScoring = new WordAndScoring(token, stemWord, SCORING_FOR_NORMAL_TEXT); + } + return wordScoring; + } + // END OXYGEN PATCH + + /** + * Cleans the string buffer containing all the text retrieved from + * the html file: remove punctuation, clean white spaces, remove the words + * which you do not want to index. + * NOTE: You may customize this function: + * This version takes into account english and japanese. Depending on your + * needs, + * you may have to add/remove some characters/words through props files + * or by modifying tte default code, + * you may want to separate the language processing (doc only in japanese, + * doc only in english, check the language metadata ...). + */ + private String cleanBuffer(StringBuffer strbf) { + String str = strbf.toString().toLowerCase(); + StringBuffer tempStrBuf = new StringBuffer(""); + StringBuffer tempCharBuf = new StringBuffer(""); + if ((cleanUpList == null) || (cleanUpList.isEmpty())) { + // Default clean-up + + // Should perhaps eliminate the words at the end of the table? + tempStrBuf.append("(?i)\\bthe\\b|\\ba\\b|\\ban\\b|\\bto\\b|\\band\\b|\\bor\\b");//(?i) ignores the case + tempStrBuf.append("|\\bis\\b|\\bare\\b|\\bin\\b|\\bwith\\b|\\bbe\\b|\\bcan\\b"); + tempStrBuf.append("|\\beach\\b|\\bhas\\b|\\bhave\\b|\\bof\\b|\\b\\xA9\\b|\\bnot\\b"); + tempStrBuf.append("|\\bfor\\b|\\bthis\\b|\\bas\\b|\\bit\\b|\\bhe\\b|\\bshe\\b"); + tempStrBuf.append("|\\byou\\b|\\bby\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b"); + tempStrBuf.append("|\\b-or-\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b"); tempStrBuf.append("|\\bI\\b|\\bme\\b|\\bmy\\b"); - str = str.replaceFirst("Copyright � 1998-2007 NexWave Solutions.", " "); + str = str.replaceFirst("Copyright � 1998-2007 NexWave Solutions.", " "); - //nqu 25.01.2008 str = str.replaceAll("\\b.\\b|\\\\", " "); - // remove contiguous white charaters - //nqu 25.01.2008 str = str.replaceAll("\\s+", " "); - }else { - // Clean-up using the props files - tempStrBuf.append("\\ba\\b"); + //nqu 25.01.2008 str = str.replaceAll("\\b.\\b|\\\\", " "); + // remove contiguous white charaters + //nqu 25.01.2008 str = str.replaceAll("\\s+", " "); + } else { + // Clean-up using the props files + tempStrBuf.append("\\ba\\b"); for (String aCleanUp : cleanUpList) { - tempStrBuf.append("|\\b").append(aCleanUp ).append("\\b"); + tempStrBuf.append("|\\b").append(aCleanUp).append("\\b"); } - } - if ((cleanUpPunctuation != null) && (!cleanUpPunctuation.isEmpty())){ - tempCharBuf.append("\\u3002"); + } + if ((cleanUpPunctuation != null) && (!cleanUpPunctuation.isEmpty())) { + tempCharBuf.append("\\u3002"); for (String aCleanUpPunctuation : cleanUpPunctuation) { tempCharBuf.append("|").append(aCleanUpPunctuation); } - } + } - str = minimalClean(str, tempStrBuf, tempCharBuf); - return str; - } + str = minimalClean(str, tempStrBuf, tempCharBuf); + return str; + } - // OXYGEN PATCH, moved method in superclass + // OXYGEN PATCH, moved method in superclass // private String minimalClean(String str, StringBuffer tempStrBuf, StringBuffer tempCharBuf) { // String tempPunctuation = new String(tempCharBuf); // @@ -413,6 +423,6 @@ public class SaxHTMLIndex extends SaxDocFileParser{ // str = str.replaceAll(tempPunctuation, " "); // } return str; // } - // END OXYGEN PATCH + // END OXYGEN PATCH } diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/TesterIndexer.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/TesterIndexer.java index 1aff3e933..ed3ef269f 100755 --- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/TesterIndexer.java +++ b/xsl-webhelpindexer/src/com/nexwave/nquindexer/TesterIndexer.java @@ -4,9 +4,9 @@ package com.nexwave.nquindexer; */ /** * For running tests with the indexertask. - * + * * @version 2.0 2010-08-14 - * + * * @author N. Quaine * @author Kasun Gajasinghe *//* @@ -15,9 +15,9 @@ package com.nexwave.nquindexer; public static IndexerTask IT = null; */ /** - * @param args - * @throws InterruptedException - *//* + * @param args + * @throws InterruptedException + *//* public static void main(String[] args) throws InterruptedException { if (args.length != 0) { diff --git a/xsl-webhelpindexer/src/com/nexwave/nquindexer/WriteJSFiles.java b/xsl-webhelpindexer/src/com/nexwave/nquindexer/WriteJSFiles.java index 329a21d34..c34d4b819 100755 --- a/xsl-webhelpindexer/src/com/nexwave/nquindexer/WriteJSFiles.java +++ b/xsl-webhelpindexer/src/com/nexwave/nquindexer/WriteJSFiles.java @@ -18,9 +18,9 @@ import com.nexwave.nsidita.DocFileInfo; * Outputs the js files with: * - the list of html files and their description * - the words retrieved from the html files and their location - * + *

* 20110803: Adding improvements from Radu/Oxygen. - * + * * @author N. Quaine * @author Kasun Gajasinghe * @version 2.0 2010-08-13 @@ -30,12 +30,14 @@ public class WriteJSFiles { private static String txt_VM_encoding_not_supported = "This VM does not support the specified encoding."; private static String txt_indices_location = "The created index files are located in "; - /** Create a javascript array listing the html files with their paths relative to the project root - * @param fileO path and name of the file in which to output the list of html files - * @param list of the html files, relative to the doc root directory - * @param doStem If true then js files will generate words stemmed + /** + * Create a javascript array listing the html files with their paths relative to the project root + * + * @param fileO path and name of the file in which to output the list of html files + * @param list of the html files, relative to the doc root directory + * @param doStem If true then js files will generate words stemmed */ - public static void WriteHTMLList (String fileO,ArrayList list, boolean doStem) { + public static void WriteHTMLList(String fileO, ArrayList list, boolean doStem) { int i = 0; Iterator it; @@ -64,7 +66,7 @@ public class WriteJSFiles { i++; } - out.write("var doStem = " + doStem + ""); + out.write("var doStem = " + doStem + ""); out.flush(); // Don't forget to flush! out.close(); // System.out.println("the array of html is in " + fileO); @@ -79,8 +81,10 @@ public class WriteJSFiles { } - /** Create a javascript array listing the html files with + /** + * Create a javascript array listing the html files with * their paths relative to project root, their titles and shortdescs + * * @param fileO path and name of the file in which to output the list of html files * @param list of the html files, relative to the doc root directory */ @@ -120,27 +124,27 @@ public class WriteJSFiles { if (tempTitle != null) { tempTitle = tempTitle.replaceAll("\\s+", " "); tempTitle = tempTitle.replaceAll("['�\"]", " "); - //EXM-21239 Escape "\" - tempTitle = tempTitle.replaceAll("\\\\", "\\\\\\\\"); + //EXM-21239 Escape "\" + tempTitle = tempTitle.replaceAll("\\\\", "\\\\\\\\"); } if (tempShortdesc != null) { tempShortdesc = tempShortdesc.replaceAll("\\s+", " "); tempShortdesc = tempShortdesc.replaceAll("['�\"]", " "); - //EXM-21239 Escape "\" - tempShortdesc = tempShortdesc.replaceAll("\\\\", "\\\\\\\\"); + //EXM-21239 Escape "\" + tempShortdesc = tempShortdesc.replaceAll("\\\\", "\\\\\\\\"); } - if (tempShortdesc != null) { - String stripNonAlphabeticalChars = stripNonAlphabeticalChars(tempShortdesc); - //stripNonAlphabeticalChars = stripWords(stripNonAlphabeticalChars); - stripNonAlphabeticalChars = stripNonAlphabeticalChars + "..."; - out.write("fil[\""+i+"\"]"+"= \""+tempPath+"@@@"+tempTitle+"@@@"+stripNonAlphabeticalChars+"\";\n"); - i++; - }else{ - out.write("fil[\""+i+"\"]"+"= \""+tempPath+"@@@"+tempTitle+"@@@null"+"\";\n"); - i++; + if (tempShortdesc != null) { + String stripNonAlphabeticalChars = stripNonAlphabeticalChars(tempShortdesc); + //stripNonAlphabeticalChars = stripWords(stripNonAlphabeticalChars); + stripNonAlphabeticalChars = stripNonAlphabeticalChars + "..."; + out.write("fil[\"" + i + "\"]" + "= \"" + tempPath + "@@@" + tempTitle + "@@@" + stripNonAlphabeticalChars + "\";\n"); + i++; + } else { + out.write("fil[\"" + i + "\"]" + "= \"" + tempPath + "@@@" + tempTitle + "@@@null" + "\";\n"); + i++; - } + } } out.flush(); // Don't forget to flush! @@ -156,7 +160,9 @@ public class WriteJSFiles { } - /** Create javascript index files alphabetically. + /** + * Create javascript index files alphabetically. + * * @param fileOutStr contains the path and the suffix of the index files to create. * The first letter of the key is added to the given suffix. For example: e.g. a.js, b.js etc... * @param indexMap its keys are the indexed words and @@ -228,30 +234,31 @@ public class WriteJSFiles { } - /** - * Remove all non alphabetical chars from the end of a text. - * @param input The text who will be striped. - * @return The striped text. - */ - private static String stripNonAlphabeticalChars(String input) { - String output = input; - for (int i = input.length() - 1; i > 0 ; i--) { - char charAt = input.charAt(i); - int k = (int)charAt; - if ((k > 65 && k < 91) || (k > 97 && k < 123) || (k > 48 && k < 58)) { - return output; - } else { - output = output.substring(0, output.length() - 1); - } - } - return output; - } - - private static String stripWords(String input) { - int idx = input.lastIndexOf(" "); - if (idx != -1) { - return input.substring(0, idx); - } else { + /** + * Remove all non alphabetical chars from the end of a text. + * + * @param input The text who will be striped. + * @return The striped text. + */ + private static String stripNonAlphabeticalChars(String input) { + String output = input; + for (int i = input.length() - 1; i > 0; i--) { + char charAt = input.charAt(i); + int k = (int) charAt; + if ((k > 65 && k < 91) || (k > 97 && k < 123) || (k > 48 && k < 58)) { + return output; + } else { + output = output.substring(0, output.length() - 1); + } + } + return output; + } + + private static String stripWords(String input) { + int idx = input.lastIndexOf(" "); + if (idx != -1) { + return input.substring(0, idx); + } else { return input; } } diff --git a/xsl-webhelpindexer/src/com/nexwave/nsidita/BlankRemover.java b/xsl-webhelpindexer/src/com/nexwave/nsidita/BlankRemover.java index 5c487e9f3..3eb9a0cc4 100755 --- a/xsl-webhelpindexer/src/com/nexwave/nsidita/BlankRemover.java +++ b/xsl-webhelpindexer/src/com/nexwave/nsidita/BlankRemover.java @@ -2,32 +2,31 @@ package com.nexwave.nsidita; //import java.util.regex.; -public class BlankRemover -{ +public class BlankRemover { /* remove leading whitespace */ public static String ltrim(String source) { - return (source==null)? null : source.replaceAll("^[\\s\u00A0]+", ""); + return (source == null) ? null : source.replaceAll("^[\\s\u00A0]+", ""); } /* remove trailing whitespace */ public static String rtrim(String source) { - - return (source==null)? null : source.replaceAll("[\\s\u00A0]+$", ""); + + return (source == null) ? null : source.replaceAll("[\\s\u00A0]+$", ""); } /* replace multiple whitespace between words with single blank */ public static String itrim(String source) { - return (source==null)? null : source.replaceAll("\\b[\\s\u00A0]{2,}\\b", " "); + return (source == null) ? null : source.replaceAll("\\b[\\s\u00A0]{2,}\\b", " "); } /* remove all superfluous whitespace in source string */ public static String rmWhiteSpace(String source) { - //System.out.println("Trimmed: '" + itrim(ltrim(rtrim(source))) + "'"); - return (source==null)? null : itrim(ltrim(rtrim(source))); + //System.out.println("Trimmed: '" + itrim(ltrim(rtrim(source))) + "'"); + return (source == null) ? null : itrim(ltrim(rtrim(source))); } - public static String lrtrim(String source){ - return (source==null)? null : ltrim(rtrim(source)); + public static String lrtrim(String source) { + return (source == null) ? null : ltrim(rtrim(source)); } } diff --git a/xsl-webhelpindexer/src/com/nexwave/nsidita/DirList.java b/xsl-webhelpindexer/src/com/nexwave/nsidita/DirList.java index e24cda8e0..1ab29d887 100755 --- a/xsl-webhelpindexer/src/com/nexwave/nsidita/DirList.java +++ b/xsl-webhelpindexer/src/com/nexwave/nsidita/DirList.java @@ -7,98 +7,101 @@ import java.util.Iterator; import java.util.regex.*; public class DirList { - - ArrayList listFiles = null; - ArrayList listFilesRelTo = null; - String [] topicFiles = null; - public static final int MAX_DEPTH = 10; - - public DirList(File inputDir, String regexp, int depth) { - try { - - listFiles = new ArrayList (); - - // not yet implemented - if(regexp == null) { - for (File f: inputDir.listFiles()) { - if (!f.isDirectory()){ - listFiles.add(f); - }else { - if (depth < MAX_DEPTH ) { - DirList nsiDoc = new DirList(f,regexp,depth+1); - listFiles.addAll(new ArrayList(nsiDoc.getListFiles())); - } - } - } - } - else { - for (File f: inputDir.listFiles(new DirFilter(regexp))) { - listFiles.add(f); - } + + ArrayList listFiles = null; + ArrayList listFilesRelTo = null; + String[] topicFiles = null; + public static final int MAX_DEPTH = 10; + + public DirList(File inputDir, String regexp, int depth) { + try { + + listFiles = new ArrayList(); + + // not yet implemented + if (regexp == null) { + for (File f : inputDir.listFiles()) { + if (!f.isDirectory()) { + listFiles.add(f); + } else { + if (depth < MAX_DEPTH) { + DirList nsiDoc = new DirList(f, regexp, depth + 1); + listFiles.addAll(new ArrayList(nsiDoc.getListFiles())); + } + } + } + } else { + for (File f : inputDir.listFiles(new DirFilter(regexp))) { + listFiles.add(f); + } // Patch from Oxygen to address problem where directories // containing . were not traversed. - for (File f: inputDir.listFiles(new DirFilter(".*"))) { - if (f.isDirectory()){ - if (depth < MAX_DEPTH ) { - DirList nsiDoc = new DirList(f,regexp, depth+1); - listFiles.addAll(new ArrayList(nsiDoc.getListFiles())); - } - } - } - } - } - catch(Exception e) { - // TODO gerer exception - e.printStackTrace(); + for (File f : inputDir.listFiles(new DirFilter(".*"))) { + if (f.isDirectory()) { + if (depth < MAX_DEPTH) { + DirList nsiDoc = new DirList(f, regexp, depth + 1); + listFiles.addAll(new ArrayList(nsiDoc.getListFiles())); + } + } + } + } + } + catch (Exception e) { + // TODO gerer exception + e.printStackTrace(); + } + } + + public ArrayList getListFiles() { + return this.listFiles; + } + + /** + * Calculate the path of the files already listed relative to projectDir + * + * @param projectDir Root from where to calculate the relative path + * @return The list of files with their relative path + */ + public ArrayList getListFilesRelTo(String projectDir) { + Iterator it; + + if (this.listFiles == null) return null; + + listFilesRelTo = new ArrayList(); + it = this.listFiles.iterator(); + while (it.hasNext()) { + File ftemp = (File) it.next(); + String stemp = ftemp.getPath(); + int i = stemp.indexOf(projectDir); + if (i != 0) { + System.out.println("the documentation root does not match with the documentation input!"); + return null; + } + int ad = 1; + if (stemp.equals(projectDir)) ad = 0; + stemp = stemp.substring(i + projectDir.length() + ad); + listFilesRelTo.add(stemp); + } + return this.listFilesRelTo; } - } - - public ArrayList getListFiles() { - return this.listFiles; - } - /** - * Calculate the path of the files already listed relative to projectDir - * @param projectDir Root from where to calculate the relative path - * @return The list of files with their relative path - */ - public ArrayList getListFilesRelTo(String projectDir) { - Iterator it; - - if (this.listFiles == null) return null; - - listFilesRelTo = new ArrayList(); - it = this.listFiles.iterator ( ) ; - while ( it.hasNext ( ) ) { - File ftemp = (File) it.next(); - String stemp = ftemp.getPath(); - int i = stemp.indexOf(projectDir); - if ( i != 0 ) { - System.out.println("the documentation root does not match with the documentation input!"); - return null; - } - int ad = 1; - if (stemp.equals(projectDir)) ad = 0; - stemp = stemp.substring(i+projectDir.length()+ad); - listFilesRelTo.add(stemp); - } - return this.listFilesRelTo; - } } class DirFilter implements FilenameFilter { - private Pattern pattern; - public DirFilter(String regex) { - pattern = Pattern.compile(regex); - } - public boolean accept(File dir, String name) { - String thisname = new File(name).getName(); - //System.out.println("Testing: "+ thisname); - if(thisname.equals("index.html") || thisname.equals("ix01.html")){ - return false; - }else{ - // Strip path information, search for regex: - return pattern.matcher(new File(name).getName()).matches(); - } - } + private Pattern pattern; + + public DirFilter(String regex) { + pattern = Pattern.compile(regex); + } + + public boolean accept(File dir, String name) { + String thisname = new File(name).getName(); + //System.out.println("Testing: "+ thisname); + if (thisname.equals("index.html") || thisname.equals("ix01.html")) { + return false; + } else { + // Strip path information, search for regex: + return pattern.matcher(new File(name).getName()).matches(); + } + } } diff --git a/xsl-webhelpindexer/src/com/nexwave/nsidita/DocFileInfo.java b/xsl-webhelpindexer/src/com/nexwave/nsidita/DocFileInfo.java index a34fe0d6e..446a31678 100755 --- a/xsl-webhelpindexer/src/com/nexwave/nsidita/DocFileInfo.java +++ b/xsl-webhelpindexer/src/com/nexwave/nsidita/DocFileInfo.java @@ -1,62 +1,63 @@ package com.nexwave.nsidita; import java.io.File; + /** * Object for describing a dita or html file. - * - * @version 2.0 2010-08-14 - * + * * @author N. Quaine + * @version 2.0 2010-08-14 */ public class DocFileInfo { - File fullpath = null; - String title = null; - String shortdesc = null; - String relpathToDocRep = null; //relative path to doc repository (ex: tasks/nexbuilder) - String deltaPathToDocRep = null; // distance from the doc repository (ex: ../..) - - // default constructor - public DocFileInfo() { - } - - public DocFileInfo(File file) { - fullpath = file; - } - - public DocFileInfo(DocFileInfo info) { - this.fullpath = info.fullpath; - this.title = info.title; - this.shortdesc = info.shortdesc; - } - - public void setTitle (String title){ - this.title = title; - } - - public void setShortdesc (String shortDesc){ - this.shortdesc = shortDesc; - } - - /** - * @return the shortdesc - */ - public String getShortdesc() { - return shortdesc; - } - - /** - * @return the title - */ - public String getTitle() { - return title; - } - - public File getFullpath() { - return fullpath; - } - - public void setFullpath(File fullpath) { - this.fullpath = fullpath; - } + File fullpath = null; + String title = null; + String shortdesc = null; + String relpathToDocRep = null; //relative path to doc repository (ex: tasks/nexbuilder) + String deltaPathToDocRep = null; // distance from the doc repository (ex: ../..) + + // default constructor + + public DocFileInfo() { + } + + public DocFileInfo(File file) { + fullpath = file; + } + + public DocFileInfo(DocFileInfo info) { + this.fullpath = info.fullpath; + this.title = info.title; + this.shortdesc = info.shortdesc; + } + + public void setTitle(String title) { + this.title = title; + } + + public void setShortdesc(String shortDesc) { + this.shortdesc = shortDesc; + } + + /** + * @return the shortdesc + */ + public String getShortdesc() { + return shortdesc; + } + + /** + * @return the title + */ + public String getTitle() { + return title; + } + + public File getFullpath() { + return fullpath; + } + + public void setFullpath(File fullpath) { + this.fullpath = fullpath; + } } -- 2.40.0