\r
/**\r
* Constants used for the indexer.\r
- * \r
- * @version 2.0 2008-02-26\r
- * \r
+ *\r
* @author N. Quaine\r
+ * @version 2.0 2008-02-26\r
*/\r
public abstract class IndexerConstants {\r
// European punctuation\r
- public static final String EUPUNCTUATION1 = "[$|%,;'()\\/*\"{}=!&+<>#\\?]|\\[|\\]|[-][-]+";
- public static final String EUPUNCTUATION2 = "[$,;'()\\/*\"{}=!&+<>\\\\]";
+ public static final String EUPUNCTUATION1 = "[$|%,;'()\\/*\"{}=!&+<>#\\?]|\\[|\\]|[-][-]+";\r
+ public static final String EUPUNCTUATION2 = "[$,;'()\\/*\"{}=!&+<>\\\\]";\r
// Japanese punctuation\r
public static final String JPPUNCTUATION1 = "\\u3000|\\u3001|\\u3002|\\u3003|\\u3008|\\u3009|\\u300C|\\u300D";\r
public static final String JPPUNCTUATION2 = "\\u3013|\\u3014|\\u3015|\\u301C|\\u301D|\\u301E|\\u301F";\r
/**
* Main class of Stand-alone version of WebHelpIndexer
+ *
* User: Kasun Gajasinghe, University of Moratuwa, http://kasunbg.blogspot.com
* Date: Feb 10, 2011
+ *
* @author Kasun Gajasinghe
*/
private String txt_no_words_gathered = "No words have been indexed in";
private String txt_no_html_files = "No HTML Files found in";
private String txt_no_args = "No argument given: you must provide an htmlDir to the IndexerMain";
-
- private static String txt_no_lang_specified ="Language of the content is not specified. Defaults to English.";
+
+ private static String txt_no_lang_specified = "Language of the content is not specified. Defaults to English.";
//working directories
private String searchdir = "search";
//Html extension
private String htmlExtension = "html";
- // OXYGEN PATCH START
- //Table of contents file name
- private String tocfile;
- private boolean stem;
- // OXYGEN PATCH END
+ // OXYGEN PATCH START
+ //Table of contents file name
+ private String tocfile;
+ private boolean stem;
+ // OXYGEN PATCH END
// Constructors
+
public IndexerMain(String htmlDir, String indexerLanguage) {
super();
setHtmlDir(htmlDir);
}
/**
- * The content language defaults to English "en"
+ * The content language defaults to English "en"
+ *
* @param htmlDir The directory where html files reside.
*/
public IndexerMain(String htmlDir) {
indexer = new IndexerMain(args[0]);
} else if (args.length >= 2) {
- indexer = new IndexerMain(args[0], args[1]);
- } else {
- throw new RuntimeException("Please specify the parameters htmlDirectory and " +
- "indexerLanguage (optional). \n "+
+ indexer = new IndexerMain(args[0], args[1]);
+ } else {
+ throw new RuntimeException("Please specify the parameters htmlDirectory and " +
+ "indexerLanguage (optional). \n " +
"ex: java -jar webhelpindexer.jar docs/content en \n" +
"The program will exit now."
- );
+ );
}
indexer.execute();
// Get the list of all html files with relative paths
htmlFilesPathRel = nsiDoc.getListFilesRelTo(projectDir);
- // OXYGEN PATCH START.
- // Remove the table of content file
- Iterator<String> iterator = htmlFilesPathRel.iterator();
- while (iterator.hasNext()) {
- if (iterator.next().endsWith(tocfile + "." + htmlExtension)) {
- iterator.remove();
- }
- }
- // OXYGEN PATCH END
+ // OXYGEN PATCH START.
+        // Remove the table of contents file
+ Iterator<String> iterator = htmlFilesPathRel.iterator();
+ while (iterator.hasNext()) {
+ if (iterator.next().endsWith(tocfile + "." + htmlExtension)) {
+ iterator.remove();
+ }
+ }
+ // OXYGEN PATCH END
if (htmlFiles == null) {
System.out.println(txt_no_files_found);
return;
}
// Create the list of the existing html files (index starts at 0)
- WriteJSFiles.WriteHTMLList(outputDir.concat(File.separator).concat(htmlList), htmlFilesPathRel, stem);
+ WriteJSFiles.WriteHTMLList(outputDir.concat(File.separator).concat(htmlList), htmlFilesPathRel, stem);
// Parse each html file to retrieve the words:
// ------------------------------------------
// parse each html files
while (it.hasNext()) {
File ftemp = (File) it.next();
- // OXYGEN PATCH START. Remove table of content file
- if (!ftemp.getAbsolutePath().endsWith(tocfile + "." + htmlExtension)) {
- // OXYGEN PATCH END
- //tempMap.put(key, value);
- //The HTML file information are added in the list of FileInfoObject
- DocFileInfo docFileInfoTemp = new DocFileInfo(spe.runExtractData(ftemp,indexerLanguage, stem));
-
- ftemp = docFileInfoTemp.getFullpath();
- String stemp = ftemp.toString();
- int i = stemp.indexOf(projectDir);
- if (i != 0) {
- System.out.println("the documentation root does not match with the documentation input!");
- return;
+ // OXYGEN PATCH START. Remove table of content file
+ if (!ftemp.getAbsolutePath().endsWith(tocfile + "." + htmlExtension)) {
+ // OXYGEN PATCH END
+ //tempMap.put(key, value);
+ //The HTML file information are added in the list of FileInfoObject
+ DocFileInfo docFileInfoTemp = new DocFileInfo(spe.runExtractData(ftemp, indexerLanguage, stem));
+
+ ftemp = docFileInfoTemp.getFullpath();
+ String stemp = ftemp.toString();
+ int i = stemp.indexOf(projectDir);
+ if (i != 0) {
+ System.out.println("the documentation root does not match with the documentation input!");
+ return;
+ }
+ int ad = 1;
+ if (stemp.equals(projectDir)) ad = 0;
+ stemp = stemp.substring(i + projectDir.length() + ad); //i is redundant (i==0 always)
+ ftemp = new File(stemp);
+ docFileInfoTemp.setFullpath(ftemp);
+
+ filesDescription.add(docFileInfoTemp);
+ // OXYGEN PATCH START
+                // Remove the table of contents file
+ } else {
+ it.remove();
}
- int ad = 1;
- if (stemp.equals(projectDir)) ad = 0;
- stemp = stemp.substring(i + projectDir.length() + ad); //i is redundant (i==0 always)
- ftemp = new File(stemp);
- docFileInfoTemp.setFullpath(ftemp);
-
- filesDescription.add(docFileInfoTemp);
- // OXYGEN PATCH START
- // Remove the table of content file
- } else {
- it.remove();
+ // OXYGEN PATCH END
}
- // OXYGEN PATCH END
- }
/*remove empty strings from the map*/
if (tempDico.containsKey("")) {
tempDico.remove("");
return 0;
}
- // OXYGEN PATCH START
- // Set table of content file
+ // OXYGEN PATCH START
+    // Set table of contents file
+
public void setTocfile(String tocfile) {
- this.tocfile = tocfile;
+ this.tocfile = tocfile;
}
// If true then generate js files with stemming words
+
public void setStem(boolean stem) {
- this.stem = stem;
+ this.stem = stem;
}
- // OXYGEN PATCH END
+ // OXYGEN PATCH END
}
/**\r
* Generic parser for populating a DocFileInfo object.\r
*\r
- * @version 2.0 2010-08-14\r
- *\r
* @author N. Quaine\r
* @author Kasun Gajasinghe\r
+ * @version 2.0 2010-08-14\r
*/\r
public class SaxDocFileParser extends org.xml.sax.helpers.DefaultHandler {\r
\r
- //members\r
- protected DocFileInfo fileDesc = null;\r
- protected String projectDir = null;\r
- protected StringBuffer strbf = null;\r
- private String currentElName = "";\r
- private StringBuffer tempVal = null;\r
- private boolean shortdescBool = false;\r
- private int shortTagCpt = 0;\r
-\r
- // OXYGEN PATCH. Keep the stack of elements\r
- Stack<String> stack = new Stack<String>();\r
- //methods\r
- /**\r
- * Constructor\r
- */\r
- public SaxDocFileParser () {\r
-\r
- }\r
-\r
- /**\r
- * Initializer\r
- */\r
- public int init(String inputDir){\r
- return 0;\r
- }\r
-\r
- /**\r
- * Parses the file to extract all the words for indexing and\r
- * some data characterizing the file.\r
- * @param file contains the fullpath of the document to parse\r
- * @return a DitaFileInfo object filled with data describing the file\r
- */\r
- public DocFileInfo runExtractData(File file) {\r
- //initialization\r
- fileDesc = new DocFileInfo(file);\r
- strbf = new StringBuffer("");\r
-\r
- // Fill strbf by parsing the file\r
- parseDocument(file);\r
-\r
- return fileDesc;\r
- }\r
-\r
- public void parseDocument (File file) {\r
+ //members\r
+ protected DocFileInfo fileDesc = null;\r
+ protected String projectDir = null;\r
+ protected StringBuffer strbf = null;\r
+ private String currentElName = "";\r
+ private StringBuffer tempVal = null;\r
+ private boolean shortdescBool = false;\r
+ private int shortTagCpt = 0;\r
+\r
+ // OXYGEN PATCH. Keep the stack of elements\r
+ Stack<String> stack = new Stack<String>();\r
+ //methods\r
+\r
+ /**\r
+ * Constructor\r
+ */\r
+ public SaxDocFileParser() {\r
+\r
+ }\r
+\r
+ /**\r
+ * Initializer\r
+ */\r
+ public int init(String inputDir) {\r
+ return 0;\r
+ }\r
+\r
+ /**\r
+ * Parses the file to extract all the words for indexing and\r
+ * some data characterizing the file.\r
+ *\r
+ * @param file contains the fullpath of the document to parse\r
+ * @return a DitaFileInfo object filled with data describing the file\r
+ */\r
+ public DocFileInfo runExtractData(File file) {\r
+ //initialization\r
+ fileDesc = new DocFileInfo(file);\r
+ strbf = new StringBuffer("");\r
+\r
+ // Fill strbf by parsing the file\r
+ parseDocument(file);\r
+\r
+ return fileDesc;\r
+ }\r
+\r
+ public void parseDocument(File file) {\r
// System.out.println(System.getProperty("org.xml.sax.driver"));\r
// System.out.println(System.getProperty("javax.xml.parsers.SAXParserFactory"));\r
\r
- //get a factory\r
- javax.xml.parsers.SAXParserFactory spf = javax.xml.parsers.SAXParserFactory.newInstance();\r
+ //get a factory\r
+ javax.xml.parsers.SAXParserFactory spf = javax.xml.parsers.SAXParserFactory.newInstance();\r
\r
- spf.setValidating(false);\r
+ spf.setValidating(false);\r
addContent = false;\r
- divCount = 0;\r
- try {\r
- //get a new instance of parser\r
- javax.xml.parsers.SAXParser sp = spf.newSAXParser();\r
- // deactivate the validation\r
- sp.getXMLReader().setFeature("http://xml.org/sax/features/external-general-entities", false);\r
- sp.getXMLReader().setFeature( "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);\r
+ divCount = 0;\r
+ try {\r
+ //get a new instance of parser\r
+ javax.xml.parsers.SAXParser sp = spf.newSAXParser();\r
+ // deactivate the validation\r
+ sp.getXMLReader().setFeature("http://xml.org/sax/features/external-general-entities", false);\r
+ sp.getXMLReader().setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);\r
\r
//parse the file and also register this class for call backs\r
- //System.out.println("Parsing: " + file);\r
+ //System.out.println("Parsing: " + file);\r
\r
- long start = System.currentTimeMillis();\r
- //System.out.println("about to parse " + file.getName() + " >>> " + start);\r
+ long start = System.currentTimeMillis();\r
+ //System.out.println("about to parse " + file.getName() + " >>> " + start);\r
\r
- String content = RemoveValidationPI (file);\r
- if (content != null){\r
- InputSource is = new InputSource(new StringReader(content));\r
- is.setSystemId(file.toURI().toURL().toString());\r
- sp.parse(is, this);\r
- }\r
+ String content = RemoveValidationPI(file);\r
+ if (content != null) {\r
+ InputSource is = new InputSource(new StringReader(content));\r
+ is.setSystemId(file.toURI().toURL().toString());\r
+ sp.parse(is, this);\r
+ }\r
\r
- long finish = System.currentTimeMillis();\r
- //System.out.println("done parsing " + file.getName() + " >>> " + finish);\r
- //System.out.println("time = " + (finish - start) + " milliseconds");\r
+ long finish = System.currentTimeMillis();\r
+ //System.out.println("done parsing " + file.getName() + " >>> " + finish);\r
+ //System.out.println("time = " + (finish - start) + " milliseconds");\r
\r
- }catch(SAXParseException spe){\r
+ } catch (SAXParseException spe) {\r
System.out.println("SaxParseException: The indexing file contains incorrect xml syntax.");\r
spe.printStackTrace();\r
- }catch(org.xml.sax.SAXException se) {\r
- System.out.println("SaxException. You may need to include Xerces in your classpath. " +\r
+ } catch (org.xml.sax.SAXException se) {\r
+ System.out.println("SaxException. You may need to include Xerces in your classpath. " +\r
"See documentation for details");\r
- se.printStackTrace();\r
- }catch(javax.xml.parsers.ParserConfigurationException pce) {\r
- pce.printStackTrace();\r
- }catch (IOException ie) {\r
- ie.printStackTrace();\r
- }\r
- }\r
+ se.printStackTrace();\r
+ } catch (javax.xml.parsers.ParserConfigurationException pce) {\r
+ pce.printStackTrace();\r
+ } catch (IOException ie) {\r
+ ie.printStackTrace();\r
+ }\r
+ }\r
\r
private boolean addContent = false;\r
private boolean addHeaderInfo = false;\r
- private boolean doNotIndex=false;\r
+ private boolean doNotIndex = false;\r
private int divCount = 0;\r
- //SAX parser Event Handlers:\r
- public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) throws org.xml.sax.SAXException {\r
+ //SAX parser Event Handlers:\r
+\r
+ public void startElement(String uri, String localName, String qName, org.xml.sax.Attributes attributes) throws org.xml.sax.SAXException {\r
\r
- //dwc: capture current element name\r
- // START OXYGEN PATCH, add current element in stack\r
- stack.add(qName);\r
- // END OXYGEN PATCH\r
- currentElName = qName;\r
+ //dwc: capture current element name\r
+ // START OXYGEN PATCH, add current element in stack\r
+ stack.add(qName);\r
+ // END OXYGEN PATCH\r
+ currentElName = qName;\r
\r
- // dwc: Adding contents of some meta tags to the index\r
- if((qName.equalsIgnoreCase("meta")) ) {\r
+ // dwc: Adding contents of some meta tags to the index\r
+ if ((qName.equalsIgnoreCase("meta"))) {\r
addHeaderInfo = true;\r
- String attrName = attributes.getValue("name");\r
- // OXYGEN PATCH START EXM-20576 - add scoring for keywords\r
- if(attrName != null && (attrName.equalsIgnoreCase("keywords")\r
- || attrName.equalsIgnoreCase("description")\r
- || attrName.equalsIgnoreCase("indexterms")\r
- )){\r
- if (attrName.equalsIgnoreCase("keywords")) {\r
- String[] keywords = attributes.getValue("content").split(", ");\r
+ String attrName = attributes.getValue("name");\r
+ // OXYGEN PATCH START EXM-20576 - add scoring for keywords\r
+ if (attrName != null && (attrName.equalsIgnoreCase("keywords")\r
+ || attrName.equalsIgnoreCase("description")\r
+ || attrName.equalsIgnoreCase("indexterms")\r
+ )) {\r
+ if (attrName.equalsIgnoreCase("keywords")) {\r
+ String[] keywords = attributes.getValue("content").split(", ");\r
for (String keyword : keywords) {\r
strbf.append(" ").append(keyword).append("@@@elem_meta_keywords@@@ ");\r
}\r
- } else if (attrName.equalsIgnoreCase("indexterms")) {\r
- String[] indexterms = attributes.getValue("content").split(", ");\r
+ } else if (attrName.equalsIgnoreCase("indexterms")) {\r
+ String[] indexterms = attributes.getValue("content").split(", ");\r
for (String indexterm : indexterms) {\r
strbf.append(" ").append(indexterm).append("@@@elem_meta_indexterms@@@ ");\r
}\r
- } else {\r
- strbf.append(" ").append(attributes.getValue("content") ).append(" ");\r
- }\r
- }\r
- // OXYGEN PATCH END EXM-20576 - add scoring for indexterms\r
- // dwc: adding this to make the docbook <abstract> element\r
- // (which becomes <meta name="description".../> in html)\r
- // into the brief description that shows up in search\r
- // results.\r
- if(attrName != null && (attrName.equalsIgnoreCase("description"))){\r
- fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(attributes.getValue("content").replace('\n', ' ')));\r
- }\r
- } // dwc: End addition\r
+ } else {\r
+ strbf.append(" ").append(attributes.getValue("content")).append(" ");\r
+ }\r
+ }\r
+ // OXYGEN PATCH END EXM-20576 - add scoring for indexterms\r
+ // dwc: adding this to make the docbook <abstract> element\r
+ // (which becomes <meta name="description".../> in html)\r
+ // into the brief description that shows up in search\r
+ // results.\r
+ if (attrName != null && (attrName.equalsIgnoreCase("description"))) {\r
+ fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(attributes.getValue("content").replace('\n', ' ')));\r
+ }\r
+ } // dwc: End addition\r
\r
// dwc: commenting out DITA specific lines\r
- if((qName.equalsIgnoreCase("title")) || (qName.equalsIgnoreCase("shortdesc"))) {\r
- tempVal = new StringBuffer();\r
- }\r
+ if ((qName.equalsIgnoreCase("title")) || (qName.equalsIgnoreCase("shortdesc"))) {\r
+ tempVal = new StringBuffer();\r
+ }\r
\r
addHeaderInfo = qName.equalsIgnoreCase("meta") || qName.equalsIgnoreCase("title") || qName.equalsIgnoreCase("shortdesc");\r
\r
String elementId = attributes.getValue("id");\r
- if("content".equals(elementId)) addContent = true;\r
+ if ("content".equals(elementId)) addContent = true;\r
\r
- if(addContent) {\r
+ if (addContent) {\r
//counts div tags starting from "content" div(inclusive). This will be used to track the end of content "div" tag.\r
//see #endElement()\r
- if(qName.equalsIgnoreCase("div")){\r
+ if (qName.equalsIgnoreCase("div")) {\r
divCount++;\r
}\r
\r
String accessKey = attributes.getValue("accesskey");\r
doNotIndex = accessKey != null && ("n".equals(accessKey) || "p".equals(accessKey) || "h".equals(accessKey));\r
}\r
- strbf.append(" ");\r
- }\r
+ strbf.append(" ");\r
+ }\r
\r
- //triggers when there's character data inside an element.\r
- public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException {\r
+ //triggers when there's character data inside an element.\r
\r
- // index certain elements. E.g. Use this to implement a\r
- // "titles only" index,\r
+ public void characters(char[] ch, int start, int length) throws org.xml.sax.SAXException {\r
+\r
+ // index certain elements. E.g. Use this to implement a\r
+ // "titles only" index,\r
\r
//OXYGEN PATCH, gather more keywords.\r
- if(\r
+ if (\r
// (addContent || addHeaderInfo) && \r
- !doNotIndex && !currentElName.equalsIgnoreCase("script")){\r
- String text = new String(ch,start,length);\r
- // START OXYGEN PATCH, append a marker after each word\r
- // The marker is used to compute the scoring\r
- // Create the marker\r
- String originalText = text.replaceAll("\\s+"," ");\r
- text = text.trim();\r
- // Do a minimal clean\r
- text = minimalClean(text, null, null);\r
- text = text.replaceAll("\\s+"," ");\r
- String marker = "@@@elem_" + stack.peek() + "@@@ ";\r
- Matcher m = Pattern.compile("(\\w|-|:)+").matcher(text);\r
- if (text.trim().length() > 0 && m.find()) {\r
- String copyText = new String(originalText);\r
- text = duplicateWords(copyText, text, "-");\r
- copyText = new String(originalText);\r
- text = duplicateWords(copyText, text, ":");\r
- copyText = new String(originalText);\r
- text = duplicateWords(copyText, text, ".");\r
- // Replace whitespace with the marker\r
- text = text.replace(" ", marker);\r
- text = text + marker;\r
- }\r
- // END OXYGEN PATCH\r
- strbf.append(text);\r
+ !doNotIndex && !currentElName.equalsIgnoreCase("script")) {\r
+ String text = new String(ch, start, length);\r
+ // START OXYGEN PATCH, append a marker after each word\r
+ // The marker is used to compute the scoring\r
+ // Create the marker\r
+ String originalText = text.replaceAll("\\s+", " ");\r
+ text = text.trim();\r
+ // Do a minimal clean\r
+ text = minimalClean(text, null, null);\r
+ text = text.replaceAll("\\s+", " ");\r
+ String marker = "@@@elem_" + stack.peek() + "@@@ ";\r
+ Matcher m = Pattern.compile("(\\w|-|:)+").matcher(text);\r
+ if (text.trim().length() > 0 && m.find()) {\r
+ String copyText = new String(originalText);\r
+ text = duplicateWords(copyText, text, "-");\r
+ copyText = new String(originalText);\r
+ text = duplicateWords(copyText, text, ":");\r
+ copyText = new String(originalText);\r
+ text = duplicateWords(copyText, text, ".");\r
+ // Replace whitespace with the marker\r
+ text = text.replace(" ", marker);\r
+ text = text + marker;\r
+ }\r
+ // END OXYGEN PATCH\r
+ strbf.append(text);\r
// System.out.println("=== marked text: " + text);\r
- // START OXYGEN PATCH, append the original text\r
- if (tempVal != null) { tempVal.append(originalText);}\r
- // END OXYGEN PATCH\r
- }\r
- }\r
-\r
- // START OXYGEN PATCH EXM-20414\r
- private String duplicateWords(String sourceText, String acumulator, String separator) {\r
+ // START OXYGEN PATCH, append the original text\r
+ if (tempVal != null) {\r
+ tempVal.append(originalText);\r
+ }\r
+ // END OXYGEN PATCH\r
+ }\r
+ }\r
+\r
+ // START OXYGEN PATCH EXM-20414\r
+\r
+ private String duplicateWords(String sourceText, String acumulator, String separator) {\r
// System.out.println("sourceText: " + sourceText + " separator: " + separator);\r
- int index = sourceText.indexOf(separator);\r
- while (index >= 0) {\r
- int indexSpaceAfter = sourceText.indexOf(" ", index);\r
- String substring = null;\r
- if (indexSpaceAfter >= 0) {\r
- substring = sourceText.substring(0, indexSpaceAfter);\r
- sourceText = sourceText.substring(indexSpaceAfter);\r
- } else {\r
- substring = sourceText;\r
- sourceText = "";\r
- }\r
-\r
- int indexSpaceBefore = substring.lastIndexOf(" ");\r
- if (indexSpaceBefore >= 0) {\r
- substring = substring.substring(indexSpaceBefore + 1);\r
- }\r
- if (separator.indexOf(".") >= 0) {\r
- separator = separator.replaceAll("\\.", "\\\\.");\r
+ int index = sourceText.indexOf(separator);\r
+ while (index >= 0) {\r
+ int indexSpaceAfter = sourceText.indexOf(" ", index);\r
+ String substring = null;\r
+ if (indexSpaceAfter >= 0) {\r
+ substring = sourceText.substring(0, indexSpaceAfter);\r
+ sourceText = sourceText.substring(indexSpaceAfter);\r
+ } else {\r
+ substring = sourceText;\r
+ sourceText = "";\r
+ }\r
+\r
+ int indexSpaceBefore = substring.lastIndexOf(" ");\r
+ if (indexSpaceBefore >= 0) {\r
+ substring = substring.substring(indexSpaceBefore + 1);\r
+ }\r
+ if (separator.indexOf(".") >= 0) {\r
+ separator = separator.replaceAll("\\.", "\\\\.");\r
// System.out.println("++++++++++ separator: " + separator);\r
- }\r
- String[] tokens = substring.split(separator);\r
+ }\r
+ String[] tokens = substring.split(separator);\r
\r
for (String token : tokens) {\r
acumulator = acumulator + " " + token;\r
}\r
\r
index = sourceText.indexOf(separator);\r
- }\r
-\r
- return acumulator;\r
- }\r
- // END OXYGEN PATCH EXM-20414\r
- public void endElement(String uri, String localName, String qName) throws org.xml.sax.SAXException {\r
- // START OXYGEN PATCH, remove element from stack\r
- stack.pop();\r
- // END OXYGEN PATCH\r
- if(qName.equalsIgnoreCase("title")) {\r
- //add it to the list\r
- //myEmpls.add(tempEmp);\r
- fileDesc.setTitle(BlankRemover.rmWhiteSpace(tempVal.toString()));\r
- tempVal = null;\r
- }\r
- else if (shortdescBool) {\r
- shortTagCpt --;\r
- if (shortTagCpt == 0) {\r
- String shortdesc = tempVal.toString().replace('\n', ' ');\r
- if(shortdesc.trim().length() > 0) {\r
- fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(shortdesc));\r
- }\r
- tempVal = null;\r
- shortdescBool = false;\r
- }\r
- }\r
-\r
- if(qName.equalsIgnoreCase("div") && addContent){\r
+ }\r
+\r
+ return acumulator;\r
+ }\r
+ // END OXYGEN PATCH EXM-20414\r
+\r
+ public void endElement(String uri, String localName, String qName) throws org.xml.sax.SAXException {\r
+ // START OXYGEN PATCH, remove element from stack\r
+ stack.pop();\r
+ // END OXYGEN PATCH\r
+ if (qName.equalsIgnoreCase("title")) {\r
+ //add it to the list\r
+ //myEmpls.add(tempEmp);\r
+ fileDesc.setTitle(BlankRemover.rmWhiteSpace(tempVal.toString()));\r
+ tempVal = null;\r
+ } else if (shortdescBool) {\r
+ shortTagCpt--;\r
+ if (shortTagCpt == 0) {\r
+ String shortdesc = tempVal.toString().replace('\n', ' ');\r
+ if (shortdesc.trim().length() > 0) {\r
+ fileDesc.setShortdesc(BlankRemover.rmWhiteSpace(shortdesc));\r
+ }\r
+ tempVal = null;\r
+ shortdescBool = false;\r
+ }\r
+ }\r
+\r
+ if (qName.equalsIgnoreCase("div") && addContent) {\r
divCount--;\r
if (divCount == 0) {\r
addContent = false;\r
}\r
}\r
- }\r
+ }\r
+\r
+ public void processingInstruction(String target, String data) throws org.xml.sax.SAXException {\r
+ //do nothing\r
\r
- public void processingInstruction(String target, String data) throws org.xml.sax.SAXException {\r
- //do nothing\r
+ }\r
\r
- }\r
+ /*public InputSource resolveEntity(String publicId, String systemId)\r
+ throws IOException, SAXException {\r
\r
- /*public InputSource resolveEntity(String publicId, String systemId)\r
- throws IOException, SAXException {\r
+ // use the catalog to solve the doctype\r
+ System.out.println("entities " + publicId + systemId);\r
+ return null;\r
+ }*/\r
\r
- // use the catalog to solve the doctype\r
- System.out.println("entities " + publicId + systemId);\r
- return null;\r
- }*/\r
- public org.xml.sax.InputSource resolveEntity(String publicId, String systemId)\r
- throws org.xml.sax.SAXException, IOException {\r
- //System.out.println("Entities " + publicId + "and" + systemId);\r
- // use dita ot (dost.jar) for resolving dtd paths using the calatog\r
+ public org.xml.sax.InputSource resolveEntity(String publicId, String systemId)\r
+ throws org.xml.sax.SAXException, IOException {\r
+ //System.out.println("Entities " + publicId + "and" + systemId);\r
+        // use dita ot (dost.jar) for resolving dtd paths using the catalog\r
\r
- return null;\r
- }\r
+ return null;\r
+ }\r
\r
/**\r
* Removes the validation in html files, such as xml version and DTDs\r
+ *\r
* @param file the html file\r
* @return int: returns 0 if no IOException occurs, else 1.\r
*/\r
- public String RemoveValidationPI (File file) {\r
+ public String RemoveValidationPI(File file) {\r
StringBuilder sb = new StringBuilder();\r
- //The content that needs to be indexed after removing validation will be written to sb. This StringBuilder will\r
- // be the source to index the content of the particular html page.\r
- try {\r
- BufferedReader br = new BufferedReader(\r
- new InputStreamReader(\r
- new FileInputStream(file),"UTF-8"));\r
-\r
- while(true)\r
- {\r
- int i1, i2;\r
- boolean ok = true;\r
- try {\r
-\r
- String line = br.readLine();\r
-\r
- if (line == null) {\r
- break;\r
- }\r
- //ok = line.matches("(.)*\\x26nbsp\\x3B(.)*");\r
-\r
- line = line.replaceAll("\\x26nbsp\\x3B", " ");\r
-\r
- if (!line.contains("<!DOCTYPE html PUBLIC")) {\r
- //dwc: This doesn't really apply to me. I already omit the xml pi for other reasons.\r
- if (line.contains("<?xml version")) {\r
- line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");\r
- }\r
+ //The content that needs to be indexed after removing validation will be written to sb. This StringBuilder will\r
+ // be the source to index the content of the particular html page.\r
+ try {\r
+ BufferedReader br = new BufferedReader(\r
+ new InputStreamReader(\r
+ new FileInputStream(file), "UTF-8"));\r
+\r
+ while (true) {\r
+ int i1, i2;\r
+ boolean ok = true;\r
+ try {\r
+\r
+ String line = br.readLine();\r
+\r
+ if (line == null) {\r
+ break;\r
+ }\r
+ //ok = line.matches("(.)*\\x26nbsp\\x3B(.)*");\r
+\r
+ line = line.replaceAll("\\x26nbsp\\x3B", " ");\r
+\r
+ if (!line.contains("<!DOCTYPE html PUBLIC")) {\r
+ //dwc: This doesn't really apply to me. I already omit the xml pi for other reasons.\r
+ if (line.contains("<?xml version")) {\r
+ line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E", "\n");\r
+ }\r
\r
sb.append(line).append("\n");\r
- } else\r
- {\r
- //dwc: What is this trying to do? Nuke the DOCTYPE? Why?\r
- i1 = line.indexOf("<!DOCTYPE");\r
- i2 = line.indexOf(">", i1);\r
- while (i2 < 0) {\r
-\r
- line = line.concat(br.readLine());\r
- i2 = line.indexOf(">", i1);\r
- }\r
- String temp = line.substring(i1, i2);\r
-\r
- //ok = line.matches("(.)*\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E(.)*");\r
- if (line.contains("<?xml version")) {\r
- line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E","\n");\r
- }\r
- line = line.replaceAll("\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E","\n");\r
+ } else {\r
+ //dwc: What is this trying to do? Nuke the DOCTYPE? Why?\r
+ i1 = line.indexOf("<!DOCTYPE");\r
+ i2 = line.indexOf(">", i1);\r
+ while (i2 < 0) {\r
+\r
+ line = line.concat(br.readLine());\r
+ i2 = line.indexOf(">", i1);\r
+ }\r
+ String temp = line.substring(i1, i2);\r
+\r
+ //ok = line.matches("(.)*\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E(.)*");\r
+ if (line.contains("<?xml version")) {\r
+ line = line.replaceAll("\\x3C\\x3Fxml[^\\x3E]*\\x3F\\x3E", "\n");\r
+ }\r
+ line = line.replaceAll("\\x3C\\x21DOCTYPE[^\\x3E]*\\x3E", "\n");\r
\r
sb.append(line);\r
- }\r
- }\r
- catch (IOException e)\r
- {\r
- break;\r
- }\r
- }\r
-\r
- br.close();\r
- }\r
- catch (IOException e)\r
- {\r
- return null;\r
- }\r
-\r
- return sb.toString(); // return status\r
-\r
- }\r
-\r
- // START OXYGEN PATCH, moved from subclass\r
- protected String minimalClean(String str, StringBuffer tempStrBuf, StringBuffer tempCharBuf) {\r
- String tempPunctuation = null;\r
- if (tempCharBuf!= null) {\r
- tempPunctuation = new String(tempCharBuf);\r
- }\r
-\r
- str = str.replaceAll("\\s+", " ");\r
- str = str.replaceAll("->", " ");\r
- str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " ");\r
- str = str.replaceAll(IndexerConstants.EUPUNCTUATION2, " ");\r
- str = str.replaceAll(IndexerConstants.JPPUNCTUATION1, " ");\r
- str = str.replaceAll(IndexerConstants.JPPUNCTUATION2, " ");\r
- str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " ");\r
- if (tempPunctuation != null && tempPunctuation.length() > 0)\r
- {\r
- str = str.replaceAll(tempPunctuation, " ");\r
- }\r
-\r
- if (tempStrBuf != null) {\r
- //remove useless words\r
- str = str.replaceAll(tempStrBuf.toString(), " ");\r
- }\r
-\r
- // Redo punctuation after removing some words: (TODO: useful?)\r
- str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " ");\r
- str = str.replaceAll(IndexerConstants.EUPUNCTUATION2, " ");\r
- str = str.replaceAll(IndexerConstants.JPPUNCTUATION1, " ");\r
- str = str.replaceAll(IndexerConstants.JPPUNCTUATION2, " ");\r
- str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " ");\r
- if (tempPunctuation != null && tempPunctuation.length() > 0)\r
- {\r
- str = str.replaceAll(tempPunctuation, " ");\r
- } return str;\r
- }\r
- // END OXYGEN PATCH\r
+ }\r
+ }\r
+ catch (IOException e) {\r
+ break;\r
+ }\r
+ }\r
+\r
+ br.close();\r
+ }\r
+ catch (IOException e) {\r
+ return null;\r
+ }\r
+\r
+ return sb.toString(); // return status\r
+\r
+ }\r
+\r
+ // START OXYGEN PATCH, moved from subclass\r
+\r
+ protected String minimalClean(String str, StringBuffer tempStrBuf, StringBuffer tempCharBuf) {\r
+ String tempPunctuation = null;\r
+ if (tempCharBuf != null) {\r
+ tempPunctuation = new String(tempCharBuf);\r
+ }\r
+\r
+ str = str.replaceAll("\\s+", " ");\r
+ str = str.replaceAll("->", " ");\r
+ str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " ");\r
+ str = str.replaceAll(IndexerConstants.EUPUNCTUATION2, " ");\r
+ str = str.replaceAll(IndexerConstants.JPPUNCTUATION1, " ");\r
+ str = str.replaceAll(IndexerConstants.JPPUNCTUATION2, " ");\r
+ str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " ");\r
+ if (tempPunctuation != null && tempPunctuation.length() > 0) {\r
+ str = str.replaceAll(tempPunctuation, " ");\r
+ }\r
+\r
+ if (tempStrBuf != null) {\r
+ //remove useless words\r
+ str = str.replaceAll(tempStrBuf.toString(), " ");\r
+ }\r
+\r
+ // Redo punctuation after removing some words: (TODO: useful?)\r
+ str = str.replaceAll(IndexerConstants.EUPUNCTUATION1, " ");\r
+ str = str.replaceAll(IndexerConstants.EUPUNCTUATION2, " ");\r
+ str = str.replaceAll(IndexerConstants.JPPUNCTUATION1, " ");\r
+ str = str.replaceAll(IndexerConstants.JPPUNCTUATION2, " ");\r
+ str = str.replaceAll(IndexerConstants.JPPUNCTUATION3, " ");\r
+ if (tempPunctuation != null && tempPunctuation.length() > 0) {\r
+ str = str.replaceAll(tempPunctuation, " ");\r
+ }\r
+ return str;\r
+ }\r
+ // END OXYGEN PATCH\r
}\r
import com.nexwave.stemmer.snowball.ext.FrenchStemmer;\r
import com.nexwave.stemmer.snowball.ext.GermanStemmer;\r
\r
+//client-side support is yet to come for these stemmers\r
import com.nexwave.stemmer.snowball.ext.danishStemmer;\r
import com.nexwave.stemmer.snowball.ext.dutchStemmer;\r
import com.nexwave.stemmer.snowball.ext.finnishStemmer;\r
* NOTE: This indexes only the content under a tag with ID "content".\r
* Wrap html content with a div tag with id "content" to index relevant parts of your page.\r
*\r
- * @version 2.0 2010\r
- *\r
* @author N. Quaine\r
* @author Kasun Gajasinghe <http://kasunbg.blogspot.com>\r
+ * @version 2.0 2010\r
*/\r
-public class SaxHTMLIndex extends SaxDocFileParser{\r
+public class SaxHTMLIndex extends SaxDocFileParser {\r
\r
//KasunBG: apparently tempDico stores all the keywords and a pointer to the files containing the index in a Map\r
//example: ("keyword1", "0,2,4"), ("docbook", "1,2,5") \r
- private Map<String,String> tempDico;\r
- private int i = 0;\r
- private ArrayList <String> cleanUpList = null;\r
- private ArrayList <String> cleanUpPunctuation = null;\r
-\r
- // START OXYGEN PATCH, scoring for HTML elements\r
- private int SCORING_FOR_H1 = 50;\r
- private int SCORING_FOR_H2 = 45;\r
- private int SCORING_FOR_H3 = 40;\r
- private int SCORING_FOR_H4 = 35;\r
- private int SCORING_FOR_H5 = 30;\r
- private int SCORING_FOR_H6 = 25;\r
- private int SCORING_FOR_BOLD = 5;\r
- private int SCORING_FOR_ITALIC = 3;\r
- private int SCORING_FOR_NORMAL_TEXT = 1;\r
- private int SCORING_FOR_KEYWORD = 100;\r
- private int SCORING_FOR_INDEXTERM = 75;\r
- \r
- /**\r
- * The list with the word and scoring object\r
- */\r
- private List<WordAndScoring> wsList = null;\r
-\r
- /**\r
- * Used for Oxygen TestCases\r
- * @return the wsList\r
- */\r
- public List<WordAndScoring> getWsList() {\r
- return wsList;\r
- }\r
- // END OXYGEN PATCH\r
- //methods\r
- /**\r
- * Constructor\r
- */\r
- public SaxHTMLIndex () {\r
- super();\r
- }\r
- /**\r
- * Constructor\r
+ private Map<String, String> tempDico;\r
+ private int i = 0;\r
+ private ArrayList<String> cleanUpList = null;\r
+ private ArrayList<String> cleanUpPunctuation = null;\r
+\r
+ // START OXYGEN PATCH, scoring for HTML elements\r
+ private int SCORING_FOR_H1 = 50;\r
+ private int SCORING_FOR_H2 = 45;\r
+ private int SCORING_FOR_H3 = 40;\r
+ private int SCORING_FOR_H4 = 35;\r
+ private int SCORING_FOR_H5 = 30;\r
+ private int SCORING_FOR_H6 = 25;\r
+ private int SCORING_FOR_BOLD = 5;\r
+ private int SCORING_FOR_ITALIC = 3;\r
+ private int SCORING_FOR_NORMAL_TEXT = 1;\r
+ private int SCORING_FOR_KEYWORD = 100;\r
+ private int SCORING_FOR_INDEXTERM = 75;\r
+\r
+ /**\r
+ * The list with the word and scoring object\r
+ */\r
+ private List<WordAndScoring> wsList = null;\r
+\r
+ /**\r
+ * Used for Oxygen TestCases\r
+ *\r
+ * @return the wsList\r
+ */\r
+ public List<WordAndScoring> getWsList() {\r
+ return wsList;\r
+ }\r
+ // END OXYGEN PATCH\r
+ //methods\r
+\r
+ /**\r
+ * Constructor\r
+ */\r
+ public SaxHTMLIndex() {\r
+ super();\r
+ }\r
+\r
+ /**\r
+ * Constructor\r
+ *\r
* @param cleanUpStrings\r
*/\r
- public SaxHTMLIndex (ArrayList <String> cleanUpStrings) {\r
- super();\r
- cleanUpList = cleanUpStrings;\r
- }\r
- /**\r
- * Constructor\r
+ public SaxHTMLIndex(ArrayList<String> cleanUpStrings) {\r
+ super();\r
+ cleanUpList = cleanUpStrings;\r
+ }\r
+\r
+ /**\r
+ * Constructor\r
+ *\r
* @param cleanUpStrings\r
* @param cleanUpChars\r
*/\r
- public SaxHTMLIndex (ArrayList <String> cleanUpStrings, ArrayList <String> cleanUpChars) {\r
- super();\r
- cleanUpList = cleanUpStrings;\r
- cleanUpPunctuation = cleanUpChars;\r
- }\r
-\r
- /**\r
- * Initializer\r
+ public SaxHTMLIndex(ArrayList<String> cleanUpStrings, ArrayList<String> cleanUpChars) {\r
+ super();\r
+ cleanUpList = cleanUpStrings;\r
+ cleanUpPunctuation = cleanUpChars;\r
+ }\r
+\r
+ /**\r
+ * Initializer\r
+ *\r
* @param tempMap\r
*/\r
- public int init(Map<String,String> tempMap){\r
- tempDico = tempMap;\r
- return 0;\r
- }\r
-\r
- /**\r
- * Parses the file to extract all the words for indexing and\r
- * some data characterizing the file.\r
- * @param file contains the fullpath of the document to parse\r
+ public int init(Map<String, String> tempMap) {\r
+ tempDico = tempMap;\r
+ return 0;\r
+ }\r
+\r
+ /**\r
+ * Parses the file to extract all the words for indexing and\r
+ * some data characterizing the file.\r
+ *\r
+ * @param file contains the fullpath of the document to parse\r
* @param indexerLanguage this will be used to tell the program which stemmer to be used.\r
- * @param stem if true then generate js files with words stemmed\r
- * @return a DitaFileInfo object filled with data describing the file\r
- */\r
- public DocFileInfo runExtractData(File file, String indexerLanguage, boolean stem) {\r
- //initialization\r
- fileDesc = new DocFileInfo(file);\r
- strbf = new StringBuffer("");\r
-\r
- // Fill strbf by parsing the file\r
- parseDocument(file);\r
-\r
- String str = cleanBuffer(strbf);\r
- str = str.replaceAll("\\s+"," "); //there's still redundant spaces in the middle\r
+ * @param stem if true then generate js files with words stemmed\r
+ * @return a DitaFileInfo object filled with data describing the file\r
+ */\r
+ public DocFileInfo runExtractData(File file, String indexerLanguage, boolean stem) {\r
+ //initialization\r
+ fileDesc = new DocFileInfo(file);\r
+ strbf = new StringBuffer("");\r
+\r
+ // Fill strbf by parsing the file\r
+ parseDocument(file);\r
+\r
+ String str = cleanBuffer(strbf);\r
+ str = str.replaceAll("\\s+", " "); //there's still redundant spaces in the middle\r
// System.out.println(file.toString()+" "+ str +"\n");\r
// START OXYGEN PATCH\r
// String[] items = str.split("\\s"); //contains all the words in the array\r
// START OXYGEN PATCH, create the words and scoring list\r
// String[] tokenizedItems;\r
// END OXYGEN PATCH\r
- if(indexerLanguage.equalsIgnoreCase("ja") || indexerLanguage.equalsIgnoreCase("zh")\r
- || indexerLanguage.equalsIgnoreCase("ko")){\r
- LinkedList<String> tokens = new LinkedList<String>();\r
- try{\r
- //EXM-21501 Oxygen patch, replace the extra "@@@"s.\r
- str = str.replaceAll("@@@([^\\s]*)@@@", "");\r
+ if (indexerLanguage.equalsIgnoreCase("ja") || indexerLanguage.equalsIgnoreCase("zh")\r
+ || indexerLanguage.equalsIgnoreCase("ko")) {\r
+ LinkedList<String> tokens = new LinkedList<String>();\r
+ try {\r
+ //EXM-21501 Oxygen patch, replace the extra "@@@"s.\r
+ str = str.replaceAll("@@@([^\\s]*)@@@", "");\r
CJKAnalyzer analyzer = new CJKAnalyzer(org.apache.lucene.util.Version.LUCENE_30);\r
Reader reader = new StringReader(str);\r
TokenStream stream = analyzer.tokenStream("", reader);\r
}\r
\r
}\r
- if (!found) {\r
- wsList.add(ws);\r
- }\r
+ if (!found) {\r
+ wsList.add(ws);\r
+ }\r
}\r
// START OXYGEN PATCH\r
//tokenizedItems = tokens.toArray(new String[tokens.size()]);\r
// END OXYGEN PATCH\r
\r
- }catch (IOException ex){\r
- // START OXYGEN PATCH\r
+ } catch (IOException ex) {\r
+ // START OXYGEN PATCH\r
// tokenizedItems = items;\r
- // END OXYGEN PATCH\r
+ // END OXYGEN PATCH\r
System.out.println("Error tokenizing content using CJK Analyzer. IOException");\r
ex.printStackTrace();\r
}\r
} else {\r
SnowballStemmer stemmer;\r
- if(indexerLanguage.equalsIgnoreCase("en")){\r
- stemmer = new EnglishStemmer();\r
- } else if (indexerLanguage.equalsIgnoreCase("de")){\r
- stemmer= new GermanStemmer();\r
- } else if (indexerLanguage.equalsIgnoreCase("fr")){\r
- stemmer= new FrenchStemmer();\r
+ if (indexerLanguage.equalsIgnoreCase("en")) {\r
+ stemmer = new EnglishStemmer();\r
+ } else if (indexerLanguage.equalsIgnoreCase("de")) {\r
+ stemmer = new GermanStemmer();\r
+ } else if (indexerLanguage.equalsIgnoreCase("fr")) {\r
+ stemmer = new FrenchStemmer();\r
} else {\r
stemmer = null;//Languages which stemming is not yet supported.So, No stemmers will be used.\r
}\r
StringTokenizer st = new StringTokenizer(str, " ");\r
// Tokenize the string and populate the words and scoring list\r
while (st.hasMoreTokens()) {\r
- String token = st.nextToken();\r
- WordAndScoring ws = getWordAndScoring(token, stemmer, stem);\r
- if (ws != null) {\r
- boolean found = false;\r
+ String token = st.nextToken();\r
+ WordAndScoring ws = getWordAndScoring(token, stemmer, stem);\r
+ if (ws != null) {\r
+ boolean found = false;\r
for (WordAndScoring aWsList : wsList) {\r
// If the stem of the current word is already in list,\r
// do not add the word in the list, just recompute scoring\r
break;\r
}\r
}\r
- if (!found) {\r
- wsList.add(ws);\r
- }\r
- } \r
- } \r
+ if (!found) {\r
+ wsList.add(ws);\r
+ }\r
+ }\r
+ }\r
// if(stemmer != null) //If a stemmer available\r
// tokenizedItems = stemmer.doStem(items.toArray(new String[0]));\r
// else //if no stemmer available for the particular language\r
\r
}\r
\r
- /* for(String stemmedItem: tokenizedItems){\r
+ /* for(String stemmedItem: tokenizedItems){\r
System.out.print(stemmedItem+"| ");\r
}*/\r
\r
Iterator<WordAndScoring> it = wsList.iterator();\r
WordAndScoring s;\r
while (it.hasNext()) {\r
- s = it.next();\r
- // Do not add results from 'toc.html'\r
- if (s != null && tempDico.containsKey(s.getStem())) {\r
- String temp = tempDico.get(s.getStem());\r
- temp = temp.concat(",").concat(Integer.toString(i))\r
- // Concat also the scoring for the stem\r
- .concat("*").concat(Integer.toString(s.getScoring()))\r
- ;\r
- //System.out.println("temp="+s+"="+temp);\r
- tempDico.put(s.getStem(), temp);\r
- }else if (s != null) {\r
- String temp = null;\r
- temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring()));\r
- tempDico.put(s.getStem(), temp);\r
+ s = it.next();\r
+ // Do not add results from 'toc.html'\r
+ if (s != null && tempDico.containsKey(s.getStem())) {\r
+ String temp = tempDico.get(s.getStem());\r
+ temp = temp.concat(",").concat(Integer.toString(i))\r
+ // Concat also the scoring for the stem\r
+ .concat("*").concat(Integer.toString(s.getScoring()))\r
+ ;\r
+ //System.out.println("temp="+s+"="+temp);\r
+ tempDico.put(s.getStem(), temp);\r
+ } else if (s != null) {\r
+ String temp = null;\r
+ temp = Integer.toString(i).concat("*").concat(Integer.toString(s.getScoring()));\r
+ tempDico.put(s.getStem(), temp);\r
}\r
- // END OXYGEN PATCH\r
+ // END OXYGEN PATCH\r
}\r
\r
i++;\r
- return fileDesc;\r
- }\r
-\r
- // START OXYGEN PATCH\r
- /**\r
- * Get the word, stem and scoring for the given token.\r
- * @param token The token to parse.\r
- * @param stemmer The stemmer.\r
- * @param doStemming If true then generate js files with words stemmed.\r
- * @return the word, stem and scoring for the given token.\r
- */\r
- private WordAndScoring getWordAndScoring(String token, SnowballStemmer stemmer, boolean doStemming) {\r
- WordAndScoring wordScoring = null;\r
- if (token.indexOf("@@@") != -1 && token.indexOf("@@@") != token.lastIndexOf("@@@")) {\r
- // Extract the word from token\r
- String word = token.substring(0, token.indexOf("@@@"));\r
- if (word.length() > 0) {\r
- // Extract the element name from token\r
- String elementName = token.substring(token.indexOf("@@@elem_") + "@@@elem_".length(), token.lastIndexOf("@@@"));\r
- // Compute scoring\r
- int scoring = SCORING_FOR_NORMAL_TEXT;\r
- if ("h1".equalsIgnoreCase(elementName)) {\r
- scoring = SCORING_FOR_H1;\r
- } else if ("h2".equalsIgnoreCase(elementName)) {\r
- scoring = SCORING_FOR_H2;\r
- } else if ("h3".equalsIgnoreCase(elementName)) {\r
- scoring = SCORING_FOR_H3;\r
- } else if ("h4".equalsIgnoreCase(elementName)) {\r
- scoring = SCORING_FOR_H4;\r
- } else if ("h5".equalsIgnoreCase(elementName)) {\r
- scoring = SCORING_FOR_H5;\r
- } else if ("h6".equalsIgnoreCase(elementName)) {\r
- scoring = SCORING_FOR_H6;\r
- } else if ("em".equalsIgnoreCase(elementName)) {\r
- scoring = SCORING_FOR_ITALIC;\r
- } else if ("strong".equalsIgnoreCase(elementName)) {\r
- scoring = SCORING_FOR_BOLD;\r
- } else if ("meta_keywords".equalsIgnoreCase(elementName)) {\r
- scoring = SCORING_FOR_KEYWORD;\r
- } else if ("meta_indexterms".equalsIgnoreCase(elementName)) {\r
- scoring = SCORING_FOR_INDEXTERM;\r
- }\r
- // Get the stemmed word\r
- String stemWord = word;\r
- if (stemmer != null && doStemming) {\r
- stemWord = stemmer.doStem(word);\r
- }\r
- wordScoring = new WordAndScoring(word, stemWord, scoring);\r
- }\r
- } else {\r
- // The token contains only the word\r
- String stemWord = token;\r
- // Stem the word\r
- if (stemmer != null && doStemming) {\r
- stemWord = stemmer.doStem(token);\r
- }\r
- wordScoring = new WordAndScoring(token, stemWord, SCORING_FOR_NORMAL_TEXT);\r
- }\r
- return wordScoring;\r
- }\r
- // END OXYGEN PATCH\r
-\r
- /**\r
- * Cleans the string buffer containing all the text retrieved from\r
- * the html file: remove punctuation, clean white spaces, remove the words\r
- * which you do not want to index.\r
- * NOTE: You may customize this function:\r
- * This version takes into account english and japanese. Depending on your\r
- * needs,\r
- * you may have to add/remove some characters/words through props files\r
- * or by modifying tte default code,\r
- * you may want to separate the language processing (doc only in japanese,\r
- * doc only in english, check the language metadata ...).\r
- */\r
- private String cleanBuffer (StringBuffer strbf) {\r
- String str = strbf.toString().toLowerCase();\r
- StringBuffer tempStrBuf = new StringBuffer("");\r
- StringBuffer tempCharBuf = new StringBuffer("");\r
- if ((cleanUpList == null) || (cleanUpList.isEmpty())){\r
- // Default clean-up\r
-\r
- // Should perhaps eliminate the words at the end of the table?\r
- tempStrBuf.append("(?i)\\bthe\\b|\\ba\\b|\\ban\\b|\\bto\\b|\\band\\b|\\bor\\b");//(?i) ignores the case\r
- tempStrBuf.append("|\\bis\\b|\\bare\\b|\\bin\\b|\\bwith\\b|\\bbe\\b|\\bcan\\b");\r
- tempStrBuf.append("|\\beach\\b|\\bhas\\b|\\bhave\\b|\\bof\\b|\\b\\xA9\\b|\\bnot\\b");\r
- tempStrBuf.append("|\\bfor\\b|\\bthis\\b|\\bas\\b|\\bit\\b|\\bhe\\b|\\bshe\\b");\r
- tempStrBuf.append("|\\byou\\b|\\bby\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");\r
- tempStrBuf.append("|\\b-or-\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");\r
+ return fileDesc;\r
+ }\r
+\r
+ // START OXYGEN PATCH\r
+\r
+ /**\r
+ * Get the word, stem and scoring for the given token.\r
+ *\r
+ * @param token The token to parse.\r
+ * @param stemmer The stemmer.\r
+ * @param doStemming If true then generate js files with words stemmed.\r
+ * @return the word, stem and scoring for the given token.\r
+ */\r
+ private WordAndScoring getWordAndScoring(String token, SnowballStemmer stemmer, boolean doStemming) {\r
+ WordAndScoring wordScoring = null;\r
+ if (token.indexOf("@@@") != -1 && token.indexOf("@@@") != token.lastIndexOf("@@@")) {\r
+ // Extract the word from token\r
+ String word = token.substring(0, token.indexOf("@@@"));\r
+ if (word.length() > 0) {\r
+ // Extract the element name from token\r
+ String elementName = token.substring(token.indexOf("@@@elem_") + "@@@elem_".length(), token.lastIndexOf("@@@"));\r
+ // Compute scoring\r
+ int scoring = SCORING_FOR_NORMAL_TEXT;\r
+ if ("h1".equalsIgnoreCase(elementName)) {\r
+ scoring = SCORING_FOR_H1;\r
+ } else if ("h2".equalsIgnoreCase(elementName)) {\r
+ scoring = SCORING_FOR_H2;\r
+ } else if ("h3".equalsIgnoreCase(elementName)) {\r
+ scoring = SCORING_FOR_H3;\r
+ } else if ("h4".equalsIgnoreCase(elementName)) {\r
+ scoring = SCORING_FOR_H4;\r
+ } else if ("h5".equalsIgnoreCase(elementName)) {\r
+ scoring = SCORING_FOR_H5;\r
+ } else if ("h6".equalsIgnoreCase(elementName)) {\r
+ scoring = SCORING_FOR_H6;\r
+ } else if ("em".equalsIgnoreCase(elementName)) {\r
+ scoring = SCORING_FOR_ITALIC;\r
+ } else if ("strong".equalsIgnoreCase(elementName)) {\r
+ scoring = SCORING_FOR_BOLD;\r
+ } else if ("meta_keywords".equalsIgnoreCase(elementName)) {\r
+ scoring = SCORING_FOR_KEYWORD;\r
+ } else if ("meta_indexterms".equalsIgnoreCase(elementName)) {\r
+ scoring = SCORING_FOR_INDEXTERM;\r
+ }\r
+ // Get the stemmed word\r
+ String stemWord = word;\r
+ if (stemmer != null && doStemming) {\r
+ stemWord = stemmer.doStem(word);\r
+ }\r
+ wordScoring = new WordAndScoring(word, stemWord, scoring);\r
+ }\r
+ } else {\r
+ // The token contains only the word\r
+ String stemWord = token;\r
+ // Stem the word\r
+ if (stemmer != null && doStemming) {\r
+ stemWord = stemmer.doStem(token);\r
+ }\r
+ wordScoring = new WordAndScoring(token, stemWord, SCORING_FOR_NORMAL_TEXT);\r
+ }\r
+ return wordScoring;\r
+ }\r
+ // END OXYGEN PATCH\r
+\r
+ /**\r
+ * Cleans the string buffer containing all the text retrieved from\r
+ * the html file: remove punctuation, clean white spaces, remove the words\r
+ * which you do not want to index.\r
+ * NOTE: You may customize this function:\r
+ * This version takes into account english and japanese. Depending on your\r
+ * needs,\r
+ * you may have to add/remove some characters/words through props files\r
+     * or by modifying the default code,
+ * you may want to separate the language processing (doc only in japanese,\r
+ * doc only in english, check the language metadata ...).\r
+ */\r
+ private String cleanBuffer(StringBuffer strbf) {\r
+ String str = strbf.toString().toLowerCase();\r
+ StringBuffer tempStrBuf = new StringBuffer("");\r
+ StringBuffer tempCharBuf = new StringBuffer("");\r
+ if ((cleanUpList == null) || (cleanUpList.isEmpty())) {\r
+ // Default clean-up\r
+\r
+ // Should perhaps eliminate the words at the end of the table?\r
+ tempStrBuf.append("(?i)\\bthe\\b|\\ba\\b|\\ban\\b|\\bto\\b|\\band\\b|\\bor\\b");//(?i) ignores the case\r
+ tempStrBuf.append("|\\bis\\b|\\bare\\b|\\bin\\b|\\bwith\\b|\\bbe\\b|\\bcan\\b");\r
+ tempStrBuf.append("|\\beach\\b|\\bhas\\b|\\bhave\\b|\\bof\\b|\\b\\xA9\\b|\\bnot\\b");\r
+ tempStrBuf.append("|\\bfor\\b|\\bthis\\b|\\bas\\b|\\bit\\b|\\bhe\\b|\\bshe\\b");\r
+ tempStrBuf.append("|\\byou\\b|\\bby\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");\r
+ tempStrBuf.append("|\\b-or-\\b|\\bso\\b|\\bon\\b|\\byour\\b|\\bat\\b");\r
tempStrBuf.append("|\\bI\\b|\\bme\\b|\\bmy\\b");\r
\r
- str = str.replaceFirst("Copyright � 1998-2007 NexWave Solutions.", " ");\r
+ str = str.replaceFirst("Copyright � 1998-2007 NexWave Solutions.", " ");\r
\r
\r
- //nqu 25.01.2008 str = str.replaceAll("\\b.\\b|\\\\", " ");\r
- // remove contiguous white charaters\r
- //nqu 25.01.2008 str = str.replaceAll("\\s+", " ");\r
- }else {\r
- // Clean-up using the props files\r
- tempStrBuf.append("\\ba\\b");\r
+ //nqu 25.01.2008 str = str.replaceAll("\\b.\\b|\\\\", " ");\r
+            // remove contiguous white characters
+ //nqu 25.01.2008 str = str.replaceAll("\\s+", " ");\r
+ } else {\r
+ // Clean-up using the props files\r
+ tempStrBuf.append("\\ba\\b");\r
for (String aCleanUp : cleanUpList) {\r
- tempStrBuf.append("|\\b").append(aCleanUp ).append("\\b");\r
+ tempStrBuf.append("|\\b").append(aCleanUp).append("\\b");\r
}\r
- }\r
- if ((cleanUpPunctuation != null) && (!cleanUpPunctuation.isEmpty())){\r
- tempCharBuf.append("\\u3002");\r
+ }\r
+ if ((cleanUpPunctuation != null) && (!cleanUpPunctuation.isEmpty())) {\r
+ tempCharBuf.append("\\u3002");\r
for (String aCleanUpPunctuation : cleanUpPunctuation) {\r
tempCharBuf.append("|").append(aCleanUpPunctuation);\r
}\r
- }\r
+ }\r
\r
- str = minimalClean(str, tempStrBuf, tempCharBuf);\r
- return str;\r
- }\r
+ str = minimalClean(str, tempStrBuf, tempCharBuf);\r
+ return str;\r
+ }\r
\r
- // OXYGEN PATCH, moved method in superclass\r
+ // OXYGEN PATCH, moved method in superclass\r
// private String minimalClean(String str, StringBuffer tempStrBuf, StringBuffer tempCharBuf) {\r
// String tempPunctuation = new String(tempCharBuf);\r
//\r
// str = str.replaceAll(tempPunctuation, " ");\r
// } return str;\r
// }\r
- // END OXYGEN PATCH\r
+ // END OXYGEN PATCH\r
\r
}\r
*/\r
/**\r
* For running tests with the indexertask.\r
- * \r
+ *\r
* @version 2.0 2010-08-14\r
- * \r
+ *\r
* @author N. Quaine\r
* @author Kasun Gajasinghe\r
*//*\r
public static IndexerTask IT = null; \r
*/\r
/**\r
- * @param args\r
- * @throws InterruptedException \r
- *//*\r
+ * @param args\r
+ * @throws InterruptedException\r
+ *//*\r
\r
public static void main(String[] args) throws InterruptedException {\r
if (args.length != 0) {\r
* Outputs the js files with:
* - the list of html files and their description
* - the words retrieved from the html files and their location
- *
+ * <p/>
* 20110803: Adding improvements from Radu/Oxygen.
- *
+ *
* @author N. Quaine
* @author Kasun Gajasinghe
* @version 2.0 2010-08-13
private static String txt_VM_encoding_not_supported = "This VM does not support the specified encoding.";
private static String txt_indices_location = "The created index files are located in ";
- /** Create a javascript array listing the html files with their paths relative to the project root
- * @param fileO path and name of the file in which to output the list of html files
- * @param list of the html files, relative to the doc root directory
- * @param doStem If true then js files will generate words stemmed
+ /**
+ * Create a javascript array listing the html files with their paths relative to the project root
+ *
+ * @param fileO path and name of the file in which to output the list of html files
+ * @param list of the html files, relative to the doc root directory
+ * @param doStem If true then js files will generate words stemmed
*/
- public static void WriteHTMLList (String fileO,ArrayList<String> list, boolean doStem) {
+ public static void WriteHTMLList(String fileO, ArrayList<String> list, boolean doStem) {
int i = 0;
Iterator it;
i++;
}
- out.write("var doStem = " + doStem + "");
+ out.write("var doStem = " + doStem + "");
out.flush(); // Don't forget to flush!
out.close();
// System.out.println("the array of html is in " + fileO);
}
- /** Create a javascript array listing the html files with
+ /**
+ * Create a javascript array listing the html files with
* their paths relative to project root, their titles and shortdescs
+ *
* @param fileO path and name of the file in which to output the list of html files
* @param list of the html files, relative to the doc root directory
*/
if (tempTitle != null) {
tempTitle = tempTitle.replaceAll("\\s+", " ");
tempTitle = tempTitle.replaceAll("['�\"]", " ");
- //EXM-21239 Escape "\"
- tempTitle = tempTitle.replaceAll("\\\\", "\\\\\\\\");
+ //EXM-21239 Escape "\"
+ tempTitle = tempTitle.replaceAll("\\\\", "\\\\\\\\");
}
if (tempShortdesc != null) {
tempShortdesc = tempShortdesc.replaceAll("\\s+", " ");
tempShortdesc = tempShortdesc.replaceAll("['�\"]", " ");
- //EXM-21239 Escape "\"
- tempShortdesc = tempShortdesc.replaceAll("\\\\", "\\\\\\\\");
+ //EXM-21239 Escape "\"
+ tempShortdesc = tempShortdesc.replaceAll("\\\\", "\\\\\\\\");
}
- if (tempShortdesc != null) {
- String stripNonAlphabeticalChars = stripNonAlphabeticalChars(tempShortdesc);
- //stripNonAlphabeticalChars = stripWords(stripNonAlphabeticalChars);
- stripNonAlphabeticalChars = stripNonAlphabeticalChars + "...";
- out.write("fil[\""+i+"\"]"+"= \""+tempPath+"@@@"+tempTitle+"@@@"+stripNonAlphabeticalChars+"\";\n");
- i++;
- }else{
- out.write("fil[\""+i+"\"]"+"= \""+tempPath+"@@@"+tempTitle+"@@@null"+"\";\n");
- i++;
+ if (tempShortdesc != null) {
+ String stripNonAlphabeticalChars = stripNonAlphabeticalChars(tempShortdesc);
+ //stripNonAlphabeticalChars = stripWords(stripNonAlphabeticalChars);
+ stripNonAlphabeticalChars = stripNonAlphabeticalChars + "...";
+ out.write("fil[\"" + i + "\"]" + "= \"" + tempPath + "@@@" + tempTitle + "@@@" + stripNonAlphabeticalChars + "\";\n");
+ i++;
+ } else {
+ out.write("fil[\"" + i + "\"]" + "= \"" + tempPath + "@@@" + tempTitle + "@@@null" + "\";\n");
+ i++;
- }
+ }
}
out.flush(); // Don't forget to flush!
}
- /** Create javascript index files alphabetically.
+ /**
+ * Create javascript index files alphabetically.
+ *
* @param fileOutStr contains the path and the suffix of the index files to create.
* The first letter of the key is added to the given suffix. For example: e.g. a.js, b.js etc...
* @param indexMap its keys are the indexed words and
}
- /**
- * Remove all non alphabetical chars from the end of a text.
- * @param input The text who will be striped.
- * @return The striped text.
- */
- private static String stripNonAlphabeticalChars(String input) {
- String output = input;
- for (int i = input.length() - 1; i > 0 ; i--) {
- char charAt = input.charAt(i);
- int k = (int)charAt;
- if ((k > 65 && k < 91) || (k > 97 && k < 123) || (k > 48 && k < 58)) {
- return output;
- } else {
- output = output.substring(0, output.length() - 1);
- }
- }
- return output;
- }
-
- private static String stripWords(String input) {
- int idx = input.lastIndexOf(" ");
- if (idx != -1) {
- return input.substring(0, idx);
- } else {
+ /**
+     * Remove all non-alphabetical chars from the end of a text.
+     *
+     * @param input The text that will be stripped.
+     * @return The stripped text.
+ */
+ private static String stripNonAlphabeticalChars(String input) {
+ String output = input;
+ for (int i = input.length() - 1; i > 0; i--) {
+ char charAt = input.charAt(i);
+ int k = (int) charAt;
+ if ((k > 65 && k < 91) || (k > 97 && k < 123) || (k > 48 && k < 58)) {
+ return output;
+ } else {
+ output = output.substring(0, output.length() - 1);
+ }
+ }
+ return output;
+ }
+
+ private static String stripWords(String input) {
+ int idx = input.lastIndexOf(" ");
+ if (idx != -1) {
+ return input.substring(0, idx);
+ } else {
return input;
}
}
\r
//import java.util.regex.;\r
\r
-public class BlankRemover\r
-{\r
+public class BlankRemover {\r
\r
/* remove leading whitespace */\r
public static String ltrim(String source) {\r
- return (source==null)? null : source.replaceAll("^[\\s\u00A0]+", "");\r
+ return (source == null) ? null : source.replaceAll("^[\\s\u00A0]+", "");\r
}\r
\r
/* remove trailing whitespace */\r
public static String rtrim(String source) {\r
- \r
- return (source==null)? null : source.replaceAll("[\\s\u00A0]+$", "");\r
+\r
+ return (source == null) ? null : source.replaceAll("[\\s\u00A0]+$", "");\r
}\r
\r
/* replace multiple whitespace between words with single blank */\r
public static String itrim(String source) {\r
- return (source==null)? null : source.replaceAll("\\b[\\s\u00A0]{2,}\\b", " ");\r
+ return (source == null) ? null : source.replaceAll("\\b[\\s\u00A0]{2,}\\b", " ");\r
}\r
\r
/* remove all superfluous whitespace in source string */\r
public static String rmWhiteSpace(String source) {\r
- //System.out.println("Trimmed: '" + itrim(ltrim(rtrim(source))) + "'");\r
- return (source==null)? null : itrim(ltrim(rtrim(source)));\r
+ //System.out.println("Trimmed: '" + itrim(ltrim(rtrim(source))) + "'");\r
+ return (source == null) ? null : itrim(ltrim(rtrim(source)));\r
}\r
\r
- public static String lrtrim(String source){\r
- return (source==null)? null : ltrim(rtrim(source));\r
+ public static String lrtrim(String source) {\r
+ return (source == null) ? null : ltrim(rtrim(source));\r
}\r
}\r
import java.util.regex.*;
public class DirList {
-
- ArrayList<File> listFiles = null;
- ArrayList<String> listFilesRelTo = null;
- String [] topicFiles = null;
- public static final int MAX_DEPTH = 10;
-
- public DirList(File inputDir, String regexp, int depth) {
- try {
-
- listFiles = new ArrayList<File> ();
-
- // not yet implemented
- if(regexp == null) {
- for (File f: inputDir.listFiles()) {
- if (!f.isDirectory()){
- listFiles.add(f);
- }else {
- if (depth < MAX_DEPTH ) {
- DirList nsiDoc = new DirList(f,regexp,depth+1);
- listFiles.addAll(new ArrayList<File>(nsiDoc.getListFiles()));
- }
- }
- }
- }
- else {
- for (File f: inputDir.listFiles(new DirFilter(regexp))) {
- listFiles.add(f);
- }
+
+ ArrayList<File> listFiles = null;
+ ArrayList<String> listFilesRelTo = null;
+ String[] topicFiles = null;
+ public static final int MAX_DEPTH = 10;
+
+ public DirList(File inputDir, String regexp, int depth) {
+ try {
+
+ listFiles = new ArrayList<File>();
+
+ // not yet implemented
+ if (regexp == null) {
+ for (File f : inputDir.listFiles()) {
+ if (!f.isDirectory()) {
+ listFiles.add(f);
+ } else {
+ if (depth < MAX_DEPTH) {
+ DirList nsiDoc = new DirList(f, regexp, depth + 1);
+ listFiles.addAll(new ArrayList<File>(nsiDoc.getListFiles()));
+ }
+ }
+ }
+ } else {
+ for (File f : inputDir.listFiles(new DirFilter(regexp))) {
+ listFiles.add(f);
+ }
// Patch from Oxygen to address problem where directories
// containing . were not traversed.
- for (File f: inputDir.listFiles(new DirFilter(".*"))) {
- if (f.isDirectory()){
- if (depth < MAX_DEPTH ) {
- DirList nsiDoc = new DirList(f,regexp, depth+1);
- listFiles.addAll(new ArrayList<File>(nsiDoc.getListFiles()));
- }
- }
- }
- }
- }
- catch(Exception e) {
- // TODO gerer exception
- e.printStackTrace();
+ for (File f : inputDir.listFiles(new DirFilter(".*"))) {
+ if (f.isDirectory()) {
+ if (depth < MAX_DEPTH) {
+ DirList nsiDoc = new DirList(f, regexp, depth + 1);
+ listFiles.addAll(new ArrayList<File>(nsiDoc.getListFiles()));
+ }
+ }
+ }
+ }
+ }
+ catch (Exception e) {
+ // TODO gerer exception
+ e.printStackTrace();
+ }
+ }
+
+ public ArrayList<File> getListFiles() {
+ return this.listFiles;
+ }
+
+ /**
+ * Calculate the path of the files already listed relative to projectDir
+ *
+ * @param projectDir Root from where to calculate the relative path
+ * @return The list of files with their relative path
+ */
+ public ArrayList<String> getListFilesRelTo(String projectDir) {
+ Iterator it;
+
+ if (this.listFiles == null) return null;
+
+ listFilesRelTo = new ArrayList<String>();
+ it = this.listFiles.iterator();
+ while (it.hasNext()) {
+ File ftemp = (File) it.next();
+ String stemp = ftemp.getPath();
+ int i = stemp.indexOf(projectDir);
+ if (i != 0) {
+ System.out.println("the documentation root does not match with the documentation input!");
+ return null;
+ }
+ int ad = 1;
+ if (stemp.equals(projectDir)) ad = 0;
+ stemp = stemp.substring(i + projectDir.length() + ad);
+ listFilesRelTo.add(stemp);
+ }
+ return this.listFilesRelTo;
}
- }
-
- public ArrayList<File> getListFiles() {
- return this.listFiles;
- }
- /**
- * Calculate the path of the files already listed relative to projectDir
- * @param projectDir Root from where to calculate the relative path
- * @return The list of files with their relative path
- */
- public ArrayList<String> getListFilesRelTo(String projectDir) {
- Iterator it;
-
- if (this.listFiles == null) return null;
-
- listFilesRelTo = new ArrayList<String>();
- it = this.listFiles.iterator ( ) ;
- while ( it.hasNext ( ) ) {
- File ftemp = (File) it.next();
- String stemp = ftemp.getPath();
- int i = stemp.indexOf(projectDir);
- if ( i != 0 ) {
- System.out.println("the documentation root does not match with the documentation input!");
- return null;
- }
- int ad = 1;
- if (stemp.equals(projectDir)) ad = 0;
- stemp = stemp.substring(i+projectDir.length()+ad);
- listFilesRelTo.add(stemp);
- }
- return this.listFilesRelTo;
- }
}
class DirFilter implements FilenameFilter {
- private Pattern pattern;
- public DirFilter(String regex) {
- pattern = Pattern.compile(regex);
- }
- public boolean accept(File dir, String name) {
- String thisname = new File(name).getName();
- //System.out.println("Testing: "+ thisname);
- if(thisname.equals("index.html") || thisname.equals("ix01.html")){
- return false;
- }else{
- // Strip path information, search for regex:
- return pattern.matcher(new File(name).getName()).matches();
- }
- }
+ private Pattern pattern;
+
+ public DirFilter(String regex) {
+ pattern = Pattern.compile(regex);
+ }
+
+ public boolean accept(File dir, String name) {
+ String thisname = new File(name).getName();
+ //System.out.println("Testing: "+ thisname);
+ if (thisname.equals("index.html") || thisname.equals("ix01.html")) {
+ return false;
+ } else {
+ // Strip path information, search for regex:
+ return pattern.matcher(new File(name).getName()).matches();
+ }
+ }
}
package com.nexwave.nsidita;\r
\r
import java.io.File;\r
+\r
/**\r
* Object for describing a dita or html file.\r
- * \r
- * @version 2.0 2010-08-14\r
- * \r
+ *\r
* @author N. Quaine\r
+ * @version 2.0 2010-08-14\r
*/\r
public class DocFileInfo {\r
- File fullpath = null;\r
- String title = null;\r
- String shortdesc = null;\r
- String relpathToDocRep = null; //relative path to doc repository (ex: tasks/nexbuilder)\r
- String deltaPathToDocRep = null; // distance from the doc repository (ex: ../..)\r
-\r
- // default constructor\r
- public DocFileInfo() {\r
- }\r
- \r
- public DocFileInfo(File file) {\r
- fullpath = file;\r
- }\r
- \r
- public DocFileInfo(DocFileInfo info) {\r
- this.fullpath = info.fullpath;\r
- this.title = info.title;\r
- this.shortdesc = info.shortdesc;\r
- }\r
- \r
- public void setTitle (String title){\r
- this.title = title;\r
- }\r
-\r
- public void setShortdesc (String shortDesc){\r
- this.shortdesc = shortDesc;\r
- }\r
-\r
- /**\r
- * @return the shortdesc\r
- */\r
- public String getShortdesc() {\r
- return shortdesc;\r
- }\r
-\r
- /**\r
- * @return the title\r
- */\r
- public String getTitle() {\r
- return title;\r
- }\r
-\r
- public File getFullpath() {\r
- return fullpath;\r
- }\r
-\r
- public void setFullpath(File fullpath) {\r
- this.fullpath = fullpath;\r
- }\r
+ File fullpath = null;\r
+ String title = null;\r
+ String shortdesc = null;\r
+ String relpathToDocRep = null; //relative path to doc repository (ex: tasks/nexbuilder)\r
+ String deltaPathToDocRep = null; // distance from the doc repository (ex: ../..)\r
+\r
+ // default constructor\r
+\r
+ public DocFileInfo() {\r
+ }\r
+\r
+ public DocFileInfo(File file) {\r
+ fullpath = file;\r
+ }\r
+\r
+ public DocFileInfo(DocFileInfo info) {\r
+ this.fullpath = info.fullpath;\r
+ this.title = info.title;\r
+ this.shortdesc = info.shortdesc;\r
+ }\r
+\r
+ public void setTitle(String title) {\r
+ this.title = title;\r
+ }\r
+\r
+ public void setShortdesc(String shortDesc) {\r
+ this.shortdesc = shortDesc;\r
+ }\r
+\r
+ /**\r
+ * @return the shortdesc\r
+ */\r
+ public String getShortdesc() {\r
+ return shortdesc;\r
+ }\r
+\r
+ /**\r
+ * @return the title\r
+ */\r
+ public String getTitle() {\r
+ return title;\r
+ }\r
+\r
+ public File getFullpath() {\r
+ return fullpath;\r
+ }\r
+\r
+ public void setFullpath(File fullpath) {\r
+ this.fullpath = fullpath;\r
+ }\r
\r
}\r