
Generating an Index with SolrJ

Published: 2012-08-22 09:50:35  Author: rapoo

The code is simple and largely self-explanatory, and you can borrow from it in real work; the original article is here. The example demonstrates two ways to build a full index:
- from a database, pulling rows via SQL
- from files, parsed with Tika

package SolrJExample;

import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer;
import org.apache.solr.client.solrj.impl.XMLResponseParser;
import org.apache.solr.client.solrj.response.UpdateResponse;
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.sql.*;
import java.util.ArrayList;
import java.util.Collection;

/* Example class showing the skeleton of using Tika and
   SQL on the client to index documents from
   both structured documents and a SQL database.

   NOTE: The SQL example and the Tika example are entirely orthogonal.
   Both are included here to make a more interesting example,
   but you can omit either of them. */
public class SqlTikaExample {
  private StreamingUpdateSolrServer _server;
  private long _start = System.currentTimeMillis();
  private AutoDetectParser _autoParser;
  private int _totalTika = 0;
  private int _totalSql = 0;

  private Collection<SolrInputDocument> _docs = new ArrayList<SolrInputDocument>();

  public static void main(String[] args) {
    try {
      SqlTikaExample idxer = new SqlTikaExample("http://localhost:8983/solr");

      idxer.doTikaDocuments(new File("/Users/Erick/testdocs"));
      idxer.doSqlDocuments();

      idxer.endIndexing();
    } catch (Exception e) {
      e.printStackTrace();
    }
  }

  private SqlTikaExample(String url) throws IOException, SolrServerException {
      // Create a multi-threaded communications channel to the Solr server.
      // Could be CommonsHttpSolrServer as well.
    _server = new StreamingUpdateSolrServer(url, 10, 4);

    _server.setSoTimeout(1000);  // socket read timeout
    _server.setConnectionTimeout(1000);
    _server.setMaxRetries(1); // defaults to 0.  > 1 not recommended.

      // The binary parser is used by default for responses; switch to XML.
    _server.setParser(new XMLResponseParser());

      // One of the ways Tika can be used to attempt to parse arbitrary files.
    _autoParser = new AutoDetectParser();
  }

    // Just a convenient place to wrap things up.
  private void endIndexing() throws IOException, SolrServerException {
    if (_docs.size() > 0) { // Are there any documents left over?
      _server.add(_docs, 300000); // Commit within 5 minutes
    }
    _server.commit(); // Only needs to be done at the end;
                      // commitWithin should do the rest.
                      // Could even be omitted, assuming
                      // commitWithin was specified.
    long endTime = System.currentTimeMillis();
    log("Total Time Taken: " + (endTime - _start) +
        " milliseconds to index " + _totalSql +
        " SQL rows and " + _totalTika + " documents");
  }

  // I hate writing System.out.println() everyplace;
  // besides, this gives a central place to convert to true logging
  // in a production system.
  private static void log(String msg) {
    System.out.println(msg);
  }

  /**
   * ***************************Tika processing here
   */
  // Recursively traverse the filesystem, parsing everything found.
  private void doTikaDocuments(File root) throws IOException, SolrServerException {
    // Simple loop for recursively indexing all the files
    // in the root directory passed in.
    for (File file : root.listFiles()) {
      if (file.isDirectory()) {
        doTikaDocuments(file);
        continue;
      }
      // Get ready to parse the file.
      ContentHandler textHandler = new BodyContentHandler();
      Metadata metadata = new Metadata();
      ParseContext context = new ParseContext();

      InputStream input = new FileInputStream(file);

      // Try parsing the file. Note we haven't checked at all to
      // see whether this file is a good candidate.
      try {
        _autoParser.parse(input, textHandler, metadata, context);
      } catch (Exception e) {
        // Needs better logging of what went wrong in order to
        // track down "bad" documents.
        log(String.format("File %s failed", file.getCanonicalPath()));
        e.printStackTrace();
        continue;
      } finally {
        input.close(); // don't leak file handles on either path
      }

      // Just to show how much meta-data there is and what form it's in.
      dumpMetadata(file.getCanonicalPath(), metadata);

      // Index just a couple of the meta-data fields.
      SolrInputDocument doc = new SolrInputDocument();

      doc.addField("id", file.getCanonicalPath());

      // Crude way to get known meta-data fields.
      // Also possible to write a simple loop to examine all the
      // metadata returned and selectively index it and/or
      // just get a list of them.
      // One can also use the LucidWorks field mapping to
      // accomplish much the same thing.
      String author = metadata.get("Author");
      if (author != null) {
        doc.addField("author", author);
      }

      doc.addField("text", textHandler.toString());

      _docs.add(doc);
      ++_totalTika;

      // Completely arbitrary, just batch up more than one document
      // for throughput!
      if (_docs.size() >= 1000) {
        // Commit within 5 minutes.
        UpdateResponse resp = _server.add(_docs, 300000);
        if (resp.getStatus() != 0) {
          log("Some horrible error has occurred, status is: " +
              resp.getStatus());
        }
        _docs.clear();
      }
    }
  }

  // Just to show all the metadata that's available.
  private void dumpMetadata(String fileName, Metadata metadata) {
    log("Dumping metadata for file: " + fileName);
    for (String name : metadata.names()) {
      log(name + ":" + metadata.get(name));
    }
    log("\n\n");
  }

  /**
   * ***************************SQL processing here
   */
  private void doSqlDocuments() throws SQLException {
    Connection con = null;
    try {
      Class.forName("com.mysql.jdbc.Driver").newInstance();
      log("Driver Loaded......");
      con = DriverManager.getConnection("jdbc:mysql://192.168.1.103:3306/test?"
          + "user=testuser&password=test123");

      Statement st = con.createStatement();
      ResultSet rs = st.executeQuery("select id,title,text from test");

      while (rs.next()) {
        // DO NOT move this outside the while loop
        // or be sure to call doc.clear()
        SolrInputDocument doc = new SolrInputDocument();

        String id = rs.getString("id");
        String title = rs.getString("title");
        String text = rs.getString("text");

        doc.addField("id", id);
        doc.addField("title", title);
        doc.addField("text", text);

        _docs.add(doc);
        ++_totalSql;

        // Completely arbitrary, just batch up more than one
        // document for throughput!
        if (_docs.size() > 1000) {
          // Commit within 5 minutes.
          UpdateResponse resp = _server.add(_docs, 300000);
          if (resp.getStatus() != 0) {
            log("Some horrible error has occurred, status is: " +
                resp.getStatus());
          }
          _docs.clear();
        }
      }
    } catch (Exception ex) {
      ex.printStackTrace();
    } finally {
      if (con != null) {
        con.close();
      }
    }
  }
}
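Not part of the original post, but once a run finishes you can sanity-check the index with a plain SolrJ query. A minimal sketch, assuming the same http://localhost:8983/solr URL and the "text" field the example populates; the query term "text:solr" is only an illustration:

import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.response.QueryResponse;

public class QuerySmokeTest {
  public static void main(String[] args) throws Exception {
    // A plain HTTP client is enough for queries; the streaming
    // server above is tuned for updates, not searches.
    CommonsHttpSolrServer server =
        new CommonsHttpSolrServer("http://localhost:8983/solr");

    // Search the "text" field that both indexing paths write to.
    QueryResponse rsp = server.query(new SolrQuery("text:solr").setRows(5));
    System.out.println("Hits: " + rsp.getResults().getNumFound());
  }
}

Substitute any term you expect the indexed documents to contain.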
Comment 1 - huangfoxAgain (2012-02-25): How do you handle "real-time search" in Solr 3.x? What this example uses should be a hard commit.

Comment 2 - macrochen (2012-02-27), replying to huangfoxAgain:

This example is about building a full index.

For real-time search, you can look at Sensei,

and then update the index periodically or on demand.
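To make that last suggestion concrete, here is a minimal sketch (not from the original thread) of a scheduled delta-update loop. The 5-minute interval, the Solr URL, and the reindexChangedRows() helper are all assumptions; the point is that in Solr 3.x a hard commit is what makes newly added documents searchable.

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import org.apache.solr.client.solrj.impl.StreamingUpdateSolrServer;

public class PeriodicIndexer {
  public static void main(String[] args) throws Exception {
    // Same multi-threaded update channel as the example above.
    final StreamingUpdateSolrServer server =
        new StreamingUpdateSolrServer("http://localhost:8983/solr", 10, 4);

    ScheduledExecutorService scheduler =
        Executors.newSingleThreadScheduledExecutor();

    // Every 5 minutes: push changed rows, then hard-commit so they
    // become visible to searches (3.x has no soft commits / NRT).
    scheduler.scheduleWithFixedDelay(new Runnable() {
      public void run() {
        try {
          reindexChangedRows(server); // hypothetical delta-indexing helper
          server.commit();            // hard commit: makes the adds searchable
        } catch (Exception e) {
          e.printStackTrace();        // swallow so the scheduler keeps running
        }
      }
    }, 0, 5, TimeUnit.MINUTES);
  }

  // Placeholder: e.g. select rows changed since the last run,
  // build SolrInputDocuments, and server.add() them.
  private static void reindexChangedRows(StreamingUpdateSolrServer server) {
  }
}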

