Nutch1.2增加插件例子[转]
今尝试下给nutch1.2增加一个插件,于是到官网找了个例子,链接如下:
http://wiki.apache.org/nutch/WritingPluginExample-0.9
这个例子实现的是网站推荐功能:把关键字写在 meta 标签的 content 属性里,当别人搜索这个关键字时,你推荐的网站会在搜索结果中排在前面。要实现推荐,必须在你的网页上加上
view plaincopy to clipboardprint?
<meta name="recommended" content="plugins" />
<meta name="recommended" content="plugins" />
这条属性才能被插件识别。
由于它这个例子是用nutch0.9的,而且1.2和0.9有些区别,于是要修改一些代码。步骤如下:
1.插件开发
1.1在src/plugin中新建一个文件夹recommended
1.2.在recommended目录下新建plugin.xml和build.xml文件(文件名必须小写),内容如下:
plugin.xml
view plaincopy to clipboardprint?
<?xml version="1.0" encoding="UTF-8"?>
<plugin
   id="recommended"
   name="Recommended Parser/Filter"
   version="0.0.1"
   provider-name="nutch.org">

   <runtime>
      <!-- As defined in build.xml this plugin will end up bundled as recommended.jar -->
      <library name="recommended.jar">
         <export name="*"/>
      </library>
   </runtime>

   <!-- The RecommendedParser extends the HtmlParseFilter to grab the contents of
        any recommended meta tags -->
   <extension id="org.apache.nutch.parse.recommended.recommendedfilter"
              name="Recommended Parser"
              point="org.apache.nutch.parse.HtmlParseFilter">
      <implementation id="RecommendedParser"
                      class="org.apache.nutch.parse.recommended.RecommendedParser"/>
   </extension>

   <!-- NOTE(review): the published listing was truncated at this point (it showed
        value="recommended"/> followed by a stray closing implementation tag).
        The two registrations below are reconstructed so that RecommendedIndexer
        and RecommendedQueryFilter are actually loaded; confirm them against the
        upstream wiki example. -->

   <!-- The RecommendedIndexer is an IndexingFilter that adds the value found by
        the RecommendedParser to the index -->
   <extension id="org.apache.nutch.parse.recommended.recommendedindexer"
              name="Recommended Indexing Filter"
              point="org.apache.nutch.indexer.IndexingFilter">
      <implementation id="RecommendedIndexer"
                      class="org.apache.nutch.parse.recommended.RecommendedIndexer"/>
   </extension>

   <!-- The RecommendedQueryFilter is called at search time for clauses on the
        "recommended" field -->
   <extension id="org.apache.nutch.parse.recommended.recommendedSearcher"
              name="Recommended Search Query Filter"
              point="org.apache.nutch.searcher.QueryFilter">
      <implementation id="RecommendedQueryFilter"
                      class="org.apache.nutch.parse.recommended.RecommendedQueryFilter">
         <parameter name="fields" value="recommended"/>
      </implementation>
   </extension>

</plugin>
<?xml version="1.0" encoding="UTF-8"?>
<plugin
   id="recommended"
   name="Recommended Parser/Filter"
   version="0.0.1"
   provider-name="nutch.org">

   <runtime>
      <!-- As defined in build.xml this plugin will end up bundled as recommended.jar -->
      <library name="recommended.jar">
         <export name="*"/>
      </library>
   </runtime>

   <!-- The RecommendedParser extends the HtmlParseFilter to grab the contents of
        any recommended meta tags -->
   <extension id="org.apache.nutch.parse.recommended.recommendedfilter"
              name="Recommended Parser"
              point="org.apache.nutch.parse.HtmlParseFilter">
      <implementation id="RecommendedParser"
                      class="org.apache.nutch.parse.recommended.RecommendedParser"/>
   </extension>

   <!-- NOTE(review): the published listing was truncated at this point (it showed
        value="recommended"/> followed by a stray closing implementation tag).
        The two registrations below are reconstructed so that RecommendedIndexer
        and RecommendedQueryFilter are actually loaded; confirm them against the
        upstream wiki example. -->

   <!-- The RecommendedIndexer is an IndexingFilter that adds the value found by
        the RecommendedParser to the index -->
   <extension id="org.apache.nutch.parse.recommended.recommendedindexer"
              name="Recommended Indexing Filter"
              point="org.apache.nutch.indexer.IndexingFilter">
      <implementation id="RecommendedIndexer"
                      class="org.apache.nutch.parse.recommended.RecommendedIndexer"/>
   </extension>

   <!-- The RecommendedQueryFilter is called at search time for clauses on the
        "recommended" field -->
   <extension id="org.apache.nutch.parse.recommended.recommendedSearcher"
              name="Recommended Search Query Filter"
              point="org.apache.nutch.searcher.QueryFilter">
      <implementation id="RecommendedQueryFilter"
                      class="org.apache.nutch.parse.recommended.RecommendedQueryFilter">
         <parameter name="fields" value="recommended"/>
      </implementation>
   </extension>

</plugin>
build.xml
view plaincopy to clipboardprint?
<?xml version="1.0"?>

<project name="recommended" default="jar-core">

  <import file="../build-plugin.xml"/>

  <!-- Build compilation dependencies -->
  <target name="deps-jar">
    <ant target="jar" inheritall="false" dir="../lib-xml"/>
  </target>

  <!-- Add compilation dependencies to classpath -->
  <path id="plugin.deps">
    <fileset dir="${nutch.root}/build">
      <include name="**/lib-xml/*.jar" />
    </fileset>
  </path>

  <!-- Deploy Unit test dependencies -->
  <target name="deps-test">
    <ant target="deploy" inheritall="false" dir="../lib-xml"/>
    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
  </target>

  <!-- for junit test -->
  <!-- NOTE(review): these two tasks sit outside any target, so Ant runs them on
       every invocation; data/recommended.html presumably has to exist before
       the first build. Confirm. -->
  <mkdir dir="${build.test}/data"/>
  <copy file="data/recommended.html" todir="${build.test}/data"/>
</project>
<?xml version="1.0"?>

<project name="recommended" default="jar-core">

  <import file="../build-plugin.xml"/>

  <!-- Build compilation dependencies -->
  <target name="deps-jar">
    <ant target="jar" inheritall="false" dir="../lib-xml"/>
  </target>

  <!-- Add compilation dependencies to classpath -->
  <path id="plugin.deps">
    <fileset dir="${nutch.root}/build">
      <include name="**/lib-xml/*.jar" />
    </fileset>
  </path>

  <!-- Deploy Unit test dependencies -->
  <target name="deps-test">
    <ant target="deploy" inheritall="false" dir="../lib-xml"/>
    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
  </target>

  <!-- for junit test -->
  <!-- NOTE(review): these two tasks sit outside any target, so Ant runs them on
       every invocation; data/recommended.html presumably has to exist before
       the first build. Confirm. -->
  <mkdir dir="${build.test}/data"/>
  <copy file="data/recommended.html" todir="${build.test}/data"/>
</project>
1.3.在recommended目录下建立\src\java\org\apache\nutch\parse\recommended目录。
1.4.增加RecommendedIndexer.java,RecommendedParser.java,RecommendedQueryFilter.java三个类,内容如下:
RecommendedIndexer.java
view plaincopy to clipboardprint?
package org.apache.nutch.parse.recommended;??
?
// JDK import??
import java.util.logging.Logger;??
?
// Commons imports??
import org.apache.commons.logging.Log;??
import org.apache.commons.logging.LogFactory;??
?
?
// Nutch imports??
import org.apache.nutch.util.LogUtil;??
import org.apache.nutch.fetcher.FetcherOutput;??
import org.apache.nutch.indexer.IndexingFilter;??
import org.apache.nutch.indexer.IndexingException;??
import org.apache.nutch.indexer.NutchDocument;??
import org.apache.nutch.parse.Parse;??
?
import org.apache.hadoop.conf.Configuration;??
import org.apache.hadoop.io.Text;??
import org.apache.nutch.crawl.CrawlDatum;??
import org.apache.nutch.crawl.Inlinks;??
?
// Lucene imports??
import org.apache.lucene.document.Field;??
import org.apache.lucene.document.Document;??
?
public class RecommendedIndexer implements IndexingFilter {??
??????
? public static final Log LOG = LogFactory.getLog(RecommendedIndexer.class.getName());??
????
? private Configuration conf;??
????
? public RecommendedIndexer() {??
? }??
? @Override?
? public NutchDocument filter(NutchDocument doc, Parse parse, Text url,???
??? CrawlDatum datum, Inlinks inlinks)??
??? throws IndexingException {??
?
??? String recommendation = parse.getData().getMeta("recommended");??
?
??????? if (recommendation != null) {??
??????????? Field recommendedField =???
??????????????? new Field("recommended", recommendation,???
??????????????????? Field.Store.YES, Field.Index.NOT_ANALYZED);??
??????????? recommendedField.setBoost(5.0f);??
??????????? doc.add("recommended",recommendedField);??
??????????? LOG.info("Added " + recommendation + " to the recommended Field");??
??????? }??
?
??? return doc;??
? }??
????
? public void setConf(Configuration conf) {??
??? this.conf = conf;??
? }??
?
? public Configuration getConf() {??
??? return this.conf;??
? }??
?
@Override?
public void addIndexBackendOptions(Configuration conf) {??
??? // TODO Auto-generated method stub??
}??
}?
package org.apache.nutch.parse.recommended;
// JDK import
import java.util.logging.Logger;
// Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
// Nutch imports
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.fetcher.FetcherOutput;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.parse.Parse;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
// Lucene imports
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Document;
public class RecommendedIndexer implements IndexingFilter {
???
? public static final Log LOG = LogFactory.getLog(RecommendedIndexer.class.getName());
?
? private Configuration conf;
?
? public RecommendedIndexer() {
? }
? @Override
? public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
??? CrawlDatum datum, Inlinks inlinks)
??? throws IndexingException {
??? String recommendation = parse.getData().getMeta("recommended");
??????? if (recommendation != null) {
??????????? Field recommendedField =
??????????????? new Field("recommended", recommendation,
??????????????????? Field.Store.YES, Field.Index.NOT_ANALYZED);
??????????? recommendedField.setBoost(5.0f);
??????????? doc.add("recommended",recommendedField);
??????????? LOG.info("Added " + recommendation + " to the recommended Field");
??????? }
??? return doc;
? }
?
? public void setConf(Configuration conf) {
??? this.conf = conf;
? }
? public Configuration getConf() {
??? return this.conf;
? }
@Override
public void addIndexBackendOptions(Configuration conf) {
?// TODO Auto-generated method stub
}
}
?
RecommendedParser.java
view plaincopy to clipboardprint?
package org.apache.nutch.parse.recommended;??
?
// JDK imports??
import java.util.Enumeration;??
import java.util.Properties;??
import java.util.logging.Logger;??
?
// Nutch imports??
import org.apache.hadoop.conf.Configuration;??
import org.apache.nutch.metadata.Metadata;??
import org.apache.nutch.parse.HTMLMetaTags;??
import org.apache.nutch.parse.Parse;??
import org.apache.nutch.parse.HtmlParseFilter;??
import org.apache.nutch.parse.ParseResult;??
import org.apache.nutch.protocol.Content;??
?
// Commons imports??
import org.apache.commons.logging.Log;??
import org.apache.commons.logging.LogFactory;??
?
// W3C imports??
import org.w3c.dom.DocumentFragment;??
?
public class RecommendedParser implements HtmlParseFilter {??
?
? private static final Log LOG = LogFactory.getLog(RecommendedParser.class.getName());??
????
? private Configuration conf;??
?
? /** The Recommended meta data attribute name */?
? public static final String META_RECOMMENDED_NAME="recommended";??
?
? /**?
?? * Scan the HTML document looking for a recommended meta tag.?
?? */?
????
? @Override?
? public ParseResult filter(Content content, ParseResult parseResult,??
??? HTMLMetaTags metaTags, DocumentFragment doc) {??
??? // Trying to find the document's recommended term??
??? String recommendation = null;??
?
??? Properties generalMetaTags = metaTags.getGeneralTags();??
?
??? for (Enumeration tagNames = generalMetaTags.propertyNames(); tagNames.hasMoreElements(); ) {??
??????? if (tagNames.nextElement().equals("recommended")) {??
??????????? System.out.println(generalMetaTags.getProperty("recommended"));??
??????????? recommendation = generalMetaTags.getProperty("recommended");??
?????????? LOG.info("Found a Recommendation for " + recommendation);??
??????? }??
??? }??
?
??? if (recommendation == null) {??
??????? LOG.info("No Recommendation");??
??? } else {??
??????? LOG.info("Adding Recommendation for " + recommendation);??
??????? Parse parse = parseResult.get(content.getUrl());??
??????????
??????? parse.getData().getContentMeta().set(META_RECOMMENDED_NAME, recommendation);??
??? }??
?
??? return parseResult;??
? }??
????
? public void setConf(Configuration conf) {??
??? this.conf = conf;??
? }??
?
? public Configuration getConf() {??
??? return this.conf;??
? }??
?
?
?
}?
package org.apache.nutch.parse.recommended;
// JDK imports
import java.util.Enumeration;
import java.util.Properties;
import java.util.logging.Logger;
// Nutch imports
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
// Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
// W3C imports
import org.w3c.dom.DocumentFragment;
public class RecommendedParser implements HtmlParseFilter {
? private static final Log LOG = LogFactory.getLog(RecommendedParser.class.getName());
?
? private Configuration conf;
? /** The Recommended meta data attribute name */
? public static final String META_RECOMMENDED_NAME="recommended";
? /**
?? * Scan the HTML document looking for a recommended meta tag.
?? */
?
? @Override
? public ParseResult filter(Content content, ParseResult parseResult,
??? HTMLMetaTags metaTags, DocumentFragment doc) {
??? // Trying to find the document's recommended term
??? String recommendation = null;
??? Properties generalMetaTags = metaTags.getGeneralTags();
??? for (Enumeration tagNames = generalMetaTags.propertyNames(); tagNames.hasMoreElements(); ) {
??????? if (tagNames.nextElement().equals("recommended")) {
??????????? System.out.println(generalMetaTags.getProperty("recommended"));
??????? ?recommendation = generalMetaTags.getProperty("recommended");
?????????? LOG.info("Found a Recommendation for " + recommendation);
??????? }
??? }
??? if (recommendation == null) {
??????? LOG.info("No Recommendation");
??? } else {
??????? LOG.info("Adding Recommendation for " + recommendation);
??????? Parse parse = parseResult.get(content.getUrl());
???????
??????? parse.getData().getContentMeta().set(META_RECOMMENDED_NAME, recommendation);
??? }
??? return parseResult;
? }
?
? public void setConf(Configuration conf) {
??? this.conf = conf;
? }
? public Configuration getConf() {
??? return this.conf;
? }
?
}
RecommendedQueryFilter.java
view plaincopy to clipboardprint?
package org.apache.nutch.parse.recommended;??
?
import org.apache.nutch.searcher.FieldQueryFilter;??
?
import java.util.logging.Logger;??
?
// Commons imports??
import org.apache.commons.logging.Log;??
import org.apache.commons.logging.LogFactory;??
?
?
public class RecommendedQueryFilter extends FieldQueryFilter {??
??? private static final Log LOG = LogFactory.getLog(RecommendedParser.class.getName());??
?
??? public RecommendedQueryFilter() {??
??????? super("recommended", 5f);??
??????? LOG.info("Added a recommended query");??
??? }??
????
}?
package org.apache.nutch.parse.recommended;
import org.apache.nutch.searcher.FieldQueryFilter;
import java.util.logging.Logger;
// Commons imports
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
public class RecommendedQueryFilter extends FieldQueryFilter {
??? private static final Log LOG = LogFactory.getLog(RecommendedParser.class.getName());
??? public RecommendedQueryFilter() {
??????? super("recommended", 5f);
??????? LOG.info("Added a recommended query");
??? }
?
}
1.5.在 src/plugin/build.xml 中的<target name="deploy"></target>中增加一行:
view plaincopy to clipboardprint?
<ant dir="recommended" target="deploy" />
<ant dir="recommended" target="deploy" />
1.6.运行cmd,切换到recommended目录,运行ant命令编译,插件开发完成。
1.7 让nutch识别你的插件
在conf/nutch-site.xml 中增加以下属性
view plaincopy to clipboardprint?
<property>
  <name>plugin.includes</name>
  <value>recommended|protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
  <description>Regular expression naming plugin id names to
  include.  Any plugin not matching this expression is excluded.
  In any case you need at least include the nutch-extensionpoints plugin. By
  default Nutch includes crawling just HTML and plain text via HTTP,
  and basic indexing and search plugins.
  </description>
</property>
<property>
  <name>plugin.includes</name>
  <value>recommended|protocol-http|urlfilter-regex|parse-(text|html|js)|index-basic|query-(basic|site|url)|summary-basic|scoring-opic|urlnormalizer-(pass|regex|basic)</value>
  <description>Regular expression naming plugin id names to
  include.  Any plugin not matching this expression is excluded.
  In any case you need at least include the nutch-extensionpoints plugin. By
  default Nutch includes crawling just HTML and plain text via HTTP,
  and basic indexing and search plugins.
  </description>
</property>
2.编写插件测试类
2.1 在src/plugin/recommended目录下新建一个data目录,在data目录下新建一个html文件recommended.html,内容如下:
view plaincopy to clipboardprint?
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">

<html lang="en">
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <title>recommended</title>
    <meta name="generator" content="TextMate http://macromates.com/">
    <meta name="author" content="Ricardo J. Méndez">
    <meta name="recommended" content="recommended-content"/>
    <!-- Date: 2007-02-12 -->
</head>
<body>
    Recommended meta tag test.
</body>
</html>
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN">
<html lang="en">
<head>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
    <title>recommended</title>
    <meta name="generator" content="TextMate http://macromates.com/">
    <meta name="author" content="Ricardo J. Méndez">
    <meta name="recommended" content="recommended-content"/>
    <!-- Date: 2007-02-12 -->
</head>
<body>
    Recommended meta tag test.
</body>
</html>
2.2 在src/plugin/recommended目录下新建src/test/org/apache/nutch/parse/recommended目录,增加TestRecommendedParser.java类,内容如下:
view plaincopy to clipboardprint?
package org.apache.nutch.parse.recommended;??
?
?
import org.apache.nutch.metadata.Metadata;??
import org.apache.nutch.parse.Parse;??
import org.apache.nutch.parse.ParseResult;??
import org.apache.nutch.parse.ParseUtil;??
import org.apache.nutch.protocol.Content;??
import org.apache.hadoop.conf.Configuration;??
import org.apache.nutch.util.NutchConfiguration;??
?
import java.util.Properties;??
import java.io.*;??
import java.net.URL;??
?
import junit.framework.TestCase;??
?
/*??
?* Loads test page recommended.html and verifies that the recommended???
?* meta tag has recommended-content as its value.??
?*??
?*/??
public class TestRecommendedParser extends TestCase {??
?
? private static final File testDir =??
??? new File("H:/project/SearchEngine/Nutch1.2/src/plugin/recommended/data");??
?
? public void testPages() throws Exception {??
??? pageTest(new File(testDir, "recommended.html"), "http://foo.com/",??
???????????? "recommended-content");??
?
? }??
?
?
? public void pageTest(File file, String url, String recommendation)??
??? throws Exception {??
?
??? String contentType = "text/html";??
??? InputStream in = new FileInputStream(file);??
??????
??? ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());??
??? byte[] buffer = new byte[1024];??
??? int i;??
??? while ((i = in.read(buffer)) != -1) {??
????? out.write(buffer, 0, i);??
??? }??
??? in.close();??
??? byte[] bytes = out.toByteArray();??
??? Configuration conf = NutchConfiguration.create();??
?
??? Content content =??
????? new Content(url, url, bytes, contentType, new Metadata(), conf);??
??????
??? Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html",content).get(content.getUrl());??
??????
??? Metadata metadata = parse.getData().getContentMeta();??
????
??? assertEquals(recommendation, metadata.get("recommended"));??
??? assertTrue("somesillycontent" != metadata.get("recommended"));??
? }??
????
}?
package org.apache.nutch.parse.recommended;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.util.NutchConfiguration;
import java.util.Properties;
import java.io.*;
import java.net.URL;
import junit.framework.TestCase;
/*
?* Loads test page recommended.html and verifies that the recommended
?* meta tag has recommended-content as its value.
?*
?*/
public class TestRecommendedParser extends TestCase {
? private static final File testDir =
??? new File("H:/project/SearchEngine/Nutch1.2/src/plugin/recommended/data");
? public void testPages() throws Exception {
??? pageTest(new File(testDir, "recommended.html"), "http://foo.com/",
???????????? "recommended-content");
? }
? public void pageTest(File file, String url, String recommendation)
??? throws Exception {
??? String contentType = "text/html";
??? InputStream in = new FileInputStream(file);
???
??? ByteArrayOutputStream out = new ByteArrayOutputStream((int)file.length());
??? byte[] buffer = new byte[1024];
??? int i;
??? while ((i = in.read(buffer)) != -1) {
????? out.write(buffer, 0, i);
??? }
??? in.close();
??? byte[] bytes = out.toByteArray();
??? Configuration conf = NutchConfiguration.create();
??? Content content =
????? new Content(url, url, bytes, contentType, new Metadata(), conf);
???
??? Parse parse = new ParseUtil(conf).parseByExtensionId("parse-html",content).get(content.getUrl());
???
??? Metadata metadata = parse.getData().getContentMeta();
?
??? assertEquals(recommendation, metadata.get("recommended"));
??? assertTrue("somesillycontent" != metadata.get("recommended"));
? }
?
}
2.3 用junit运行TestRecommendedParser.java测试。
?
本文来自CSDN博客,转载请标明出处:http://blog.csdn.net/laigood12345/archive/2010/10/09/5929388.aspx
?
更多实例:http://www.lsoba.cn