jerome
Fri, 03 Mar 2006 14:34:41 -0800
Author: jerome Date: Fri Mar 3 14:33:29 2006 New Revision: 382948 URL: svn.apache.org/viewcvs?rev=382948&view=rev Log: Add a microformats rel-tag parser/indexer/searcher plugin (a la technorati) Added: lucene/nutch/trunk/src/plugin/microformats-reltag/ lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml (with props) lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml (with props) lucene/nutch/trunk/src/plugin/microformats-reltag/src/ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java (with props) lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (with props) lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java (with props) lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html (with props) Modified: lucene/nutch/trunk/build.xml lucene/nutch/trunk/default.properties lucene/nutch/trunk/src/plugin/build.xml Modified: lucene/nutch/trunk/build.xml URL: svn.apache.org/viewcvs/lucene/nutch/trunk/build.xml?rev=382948&r1=382947&r2=382948&view=diff ============================================================================== --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Fri Mar 3 14:33:29 2006 @@ -249,6 +249,7 @@ <packageset dir="${src.dir}"/> <packageset dir="${plugins.dir}/lib-http/src/java"/> <packageset 
dir="${plugins.dir}/lib-parsems/src/java"/> + <packageset dir="${plugins.dir}/microformats-reltag/src/java"/> <packageset dir="${plugins.dir}/ontology/src/java"/> <packageset dir="${plugins.dir}/protocol-file/src/java"/> <packageset dir="${plugins.dir}/protocol-ftp/src/java"/> Modified: lucene/nutch/trunk/default.properties URL: svn.apache.org/viewcvs/lucene/nutch/trunk/default.properties?rev=382948&r1=382947&r2=382948&view=diff ============================================================================== --- lucene/nutch/trunk/default.properties (original) +++ lucene/nutch/trunk/default.properties Fri Mar 3 14:33:29 2006 @@ -70,6 +70,7 @@ # plugin.ontology=org.apache.nutch.ontology* plugin.parsems=org.apache.nutch.parse.ms* plugin.pdf=org.apache.nutch.parse.pdf* +plugin.reltag=org.apache.nutch.microformats.reltag* plugin.rss=org.apache.nutch.parse.rss* plugin.rtf=org.apache.nutch.parse.rtf* plugin.site=org.apache.nutch.searcher.site* @@ -98,6 +99,7 @@ ${plugin.msword}:\ ${plugin.parsems}:\ ${plugin.pdf}:\ + ${plugin.reltag}:\ ${plugin.rss}:\ ${plugin.rtf}:\ ${plugin.site}:\ Modified: lucene/nutch/trunk/src/plugin/build.xml URL: svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/build.xml?rev=382948&r1=382947&r2=382948&view=diff ============================================================================== --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Fri Mar 3 14:33:29 2006 @@ -18,6 +18,7 @@ <ant dir="lib-lucene-analyzers" target="deploy"/> <ant dir="lib-nekohtml" target="deploy"/> <ant dir="lib-parsems" target="deploy"/> + <ant dir="microformats-reltag" target="deploy"/> <ant dir="nutch-extensionpoints" target="deploy"/> <ant dir="ontology" target="deploy"/> <ant dir="protocol-file" target="deploy"/> @@ -86,6 +87,7 @@ <ant dir="lib-lucene-analyzers" target="clean"/> <ant dir="lib-nekohtml" target="clean"/> <ant dir="lib-parsems" target="clean"/> + <ant dir="microformats-reltag" target="clean"/> <ant 
dir="nutch-extensionpoints" target="clean"/> <ant dir="ontology" target="clean"/> <ant dir="protocol-file" target="clean"/> Added: lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml URL: svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml?rev=382948&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml (added) +++ lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml Fri Mar 3 14:33:29 2006 @@ -0,0 +1,17 @@ +<?xml version="1.0"?> + +<project name="microformats-reltag" default="jar"> + + <import file="../build-plugin.xml"/> + + <!-- Build compilation dependencies --> + <target name="deps-jar"> + <ant target="compile-core" inheritall="false" dir="${nutch.root}"/> + </target> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> + </target> + +</project> Propchange: lucene/nutch/trunk/src/plugin/microformats-reltag/build.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml URL: svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml?rev=382948&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml (added) +++ lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml Fri Mar 3 14:33:29 2006 @@ -0,0 +1,43 @@ +<?xml version="1.0" encoding="UTF-8"?> +<plugin + id="microformats-reltag" + name="Rel-Tag microformat Parser/Indexer/Querier" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="microformats-reltag.jar"> + <export name="*"/> + </library> + </runtime> + + <requires> + <import plugin="nutch-extensionpoints"/> + </requires> + + <extension 
id="org.apache.nutch.microformats.reltag.RelTagParser" + name="Rel-Tag parser" + point="org.apache.nutch.parse.HtmlParseFilter"> + <implementation id="RelTagParser" + class="org.apache.nutch.microformats.reltag.RelTagParser"/> + </extension> + + <extension id="org.apache.nutch.microformats.reltag.RelTagIndexingFilter" + name="Rel-Tag indexing filter" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="RelTagIndexingFilter" + class="org.apache.nutch.microformats.reltag.RelTagIndexingFilter"/> + </extension> + + + <extension id="org.apache.nutch.microformats.reltag.RelTagQueryFilter" + name="Rel-Tag query filter" + point="org.apache.nutch.searcher.QueryFilter"> + <implementation id="RelTagQueryFilter" + class="org.apache.nutch.microformats.reltag.RelTagQueryFilter" + raw-fields="tag"/> + </extension> + + +</plugin> + Propchange: lucene/nutch/trunk/src/plugin/microformats-reltag/plugin.xml ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java URL: svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java?rev=382948&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java (added) +++ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java Fri Mar 3 14:33:29 2006 @@ -0,0 +1,82 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.microformats.reltag; + + +// Nutch imports +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.IndexingException; +import org.apache.hadoop.io.UTF8; +import org.apache.nutch.parse.Parse; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +// Lucene imports +import org.apache.lucene.document.Field; +import org.apache.lucene.document.Document; + + +/** + * An {@link org.apache.nutch.indexer.IndexingFilter} that + * add <code>tag</code> field(s) to the document. 
+ * + * @see <a href="http://www.microformats.org/wiki/rel-tag"> + * www.microformats.org/wiki/rel-tag</a> + * @author Jérôme Charron + */ +public class RelTagIndexingFilter implements IndexingFilter { + + + private Configuration conf; + + + // Inherited JavaDoc + public Document filter(Document doc, Parse parse, UTF8 url, CrawlDatum datum, Inlinks inlinks) + throws IndexingException { + + // Check if some Rel-Tags found, possibly put there by RelTagParser + String[] tags = parse.getData().getParseMeta().getValues(RelTagParser.REL_TAG); + if (tags != null) { + for (int i=0; i<tags.length; i++) { + doc.add(new Field("tag", tags[i], + Field.Store.YES, Field.Index.UN_TOKENIZED)); + } + } + + return doc; + } + + + /* ----------------------------- * + * <implementation:Configurable> * + * ----------------------------- */ + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public Configuration getConf() { + return this.conf; + } + + /* ------------------------------ * + * </implementation:Configurable> * + * ------------------------------ */ + +} Propchange: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagIndexingFilter.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java URL: svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?rev=382948&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (added) +++ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java Fri Mar 3 14:33:29 2006 @@ -0,0 +1,153 @@ +/** + * Copyright 2005 The Apache 
Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.microformats.reltag; + +// JDK imports +import java.net.URL; +import java.net.URLDecoder; +import java.util.Iterator; +import java.util.Set; +import java.util.TreeSet; +import java.util.logging.Logger; +import org.w3c.dom.DocumentFragment; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +// Nutch imports +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.HTMLMetaTags; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.HtmlParseFilter; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.util.StringUtil; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.util.LogFormatter; + + +/** + * Adds microformat rel-tags of document if found. 
+ * + * @see <a href="http://www.microformats.org/wiki/rel-tag"> + * www.microformats.org/wiki/rel-tag</a> + * @author Jérôme Charron + */ +public class RelTagParser implements HtmlParseFilter { + + public final static Logger LOG = + LogFormatter.getLogger(RelTagParser.class.getName()); + + public final static String REL_TAG = "Rel-Tag"; + + + private Configuration conf = null; + + + /** + * Scan the HTML document looking at possible rel-tags + */ + public Parse filter(Content content, Parse parse, HTMLMetaTags metaTags, DocumentFragment doc) { + + // Trying to find the document's rel-tags + Parser parser = new Parser(doc); + Set tags = parser.getRelTags(); + Iterator iter = tags.iterator(); + Metadata metadata = parse.getData().getParseMeta(); + while (iter.hasNext()) { + metadata.add(REL_TAG, (String) iter.next()); + } + return parse; + } + + private static class Parser { + + Set tags = null; + + Parser(Node node) { + tags = new TreeSet(); + parse(node); + } + + Set getRelTags() { + return tags; + } + + void parse(Node node) { + + if (node.getNodeType() == Node.ELEMENT_NODE) { + // Look for <a> tag + if ("a".equalsIgnoreCase(node.getNodeName())) { + NamedNodeMap attrs = node.getAttributes(); + Node hrefNode = attrs.getNamedItem("href"); + // Checks that it contains a href attribute + if (hrefNode != null) { + Node relNode = attrs.getNamedItem("rel"); + // Checks that it contains a rel attribute too + if (relNode != null) { + // Finaly checks that rel=tag + if ("tag".equalsIgnoreCase(relNode.getNodeValue())) { + String tag = parseTag(hrefNode.getNodeValue()); + if (!StringUtil.isEmpty(tag)) { + tags.add(tag); + } + } + } + } + } + } + + // Recurse + NodeList children = node.getChildNodes(); + for (int i=0; children != null && i<children.getLength(); i++) { + parse(children.item(i)); + } + } + + private final static String parseTag(String url) { + String tag = null; + try { + URL u = new URL(url); + String path = u.getPath(); + tag = 
URLDecoder.decode(path.substring(path.lastIndexOf('/') + 1), "UTF-8"); + } catch (Exception e) { + // Malformed tag... + tag = null; + } + return tag; + } + + } + + + /* ----------------------------- * + * <implementation:Configurable> * + * ----------------------------- */ + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public Configuration getConf() { + return this.conf; + } + + /* ------------------------------ * + * </implementation:Configurable> * + * ------------------------------ */ + +} Propchange: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java URL: svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java?rev=382948&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java (added) +++ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java Fri Mar 3 14:33:29 2006 @@ -0,0 +1,57 @@ +/** + * Copyright 2005 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.microformats.reltag; + +// Hadoop imports +import org.apache.hadoop.conf.Configuration; + +// Nutch imports +import org.apache.nutch.searcher.RawFieldQueryFilter; + + +/** + * Handles <code>"tag:"</code> query clauses. + * + * @see <a href="http://www.microformats.org/wiki/rel-tag"> + * www.microformats.org/wiki/rel-tag</a> + * @author Jérôme Charron + */ +public class RelTagQueryFilter extends RawFieldQueryFilter { + + private Configuration conf; + + public RelTagQueryFilter() { + super("tag", true, 1.0f); + } + + + /* ----------------------------- * + * <implementation:Configurable> * + * ----------------------------- */ + + public void setConf(Configuration conf) { + this.conf = conf; + } + + public Configuration getConf() { + return this.conf; + } + + /* ------------------------------ * + * </implementation:Configurable> * + * ------------------------------ */ + +} Propchange: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagQueryFilter.java ------------------------------------------------------------------------------ svn:eol-style = native Added: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html URL: svn.apache.org/viewcvs/lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html?rev=382948&view=auto ============================================================================== --- lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html (added) +++ lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html Fri Mar 3 14:33:29 2006 @@ -0,0 +1,8 @@ +<html> +<body> +<p> +A microformats <a href="http://www.microformats.org/wiki/Rel-Tag">Rel-Tag</a> +Parser/Indexer/Querier plugin. 
+</p> +</body> +</html> Propchange: lucene/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/package.html ------------------------------------------------------------------------------ svn:eol-style = native