Hadoop started out as a subproject of Lucene and has since taken on a life of its own. So how can Hadoop's distributed processing power be used to speed up Lucene index building? Doing so would let us take advantage of everything HDFS has to offer. The catch is that HDFS is well known to handle random reads poorly, while a full-text search library like Lucene relies on random reads and writes for almost every operation. How, then, can Lucene and Hadoop be made to work well together? Hadoop actually ships a Lucene indexing utility in one of its contrib packages, but it seems to be rarely used; I have not tried it myself, so I will not comment on it here.
Starting with Solr 4.4, the project already bundles the jars needed to write an index to HDFS. If you are running Solr, storing the index on HDFS is straightforward: just configure the Directory implementation in solrconfig.xml to be the HDFS-backed directory. However, the jars shipped with Solr 4.4 only support recent Hadoop versions, i.e. 2.x and later; using them directly against Hadoop 1.x throws exceptions because of the API differences between Hadoop 1.x and 2.x. After modifying part of the source, I got both indexing and searching working against Hadoop 1.x. At the end of this post I will upload those modified classes; to use them, simply import them into your project.
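For reference, here is a minimal sketch of what that solrconfig.xml configuration might look like with Solr 4.4's built-in HdfsDirectoryFactory. The HDFS URL and the Hadoop conf directory below are placeholders, and the exact set of parameters may vary with your Solr version:

<!-- Sketch: store the Solr index on HDFS instead of the local file system (paths are placeholders) -->
<directoryFactory name="DirectoryFactory" class="solr.HdfsDirectoryFactory">
  <str name="solr.hdfs.home">hdfs://192.168.75.130:9000/solr</str>
  <str name="solr.hdfs.confdir">/etc/hadoop/conf</str>
  <bool name="solr.hdfs.blockcache.enabled">true</bool>
</directoryFactory>

<!-- Inside <indexConfig>: HDFS needs its own lock implementation -->
<lockType>${solr.lock.type:hdfs}</lockType>

The block cache setting matters because, as discussed further below, it is what keeps query performance on HDFS tolerable.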
Below is the source of my test demo:
package indexhadoop;

import hdfs.HdfsDirectory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

/**
 * @author qindongliang
 * Demo that stores a Lucene index on HDFS.
 * Works with hadoop 1.x.
 **/
public class MyIndex {

    public static void main(String[] args) throws Exception {
        //long a=System.currentTimeMillis();
        //add();
        //long b=System.currentTimeMillis();
        //System.out.println("Elapsed: "+(b-a)+" ms");
        query("中国");   // search for the sample Chinese term
        //delete("3");   // delete the document with the given id
    }

    /**
     * Get an IndexWriter backed by HDFS.
     **/
    public static IndexWriter getIndexWriter() throws Exception {
        Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_46);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46, analyzer);
        Configuration conf = new Configuration();
        //Path path=new Path("hdfs://10.2.143.5:9090/root/myfile");
        Path path = new Path("hdfs://192.168.75.130:9000/root/index");
        HdfsDirectory directory = new HdfsDirectory(path, conf);
        return new IndexWriter(directory, config);
    }

    /**
     * Build the index.
     **/
    public static void add() throws Exception {
        IndexWriter writer = getIndexWriter();
        // Document doc=new Document();
        // doc.add(new StringField("id", "3", Store.YES));
        // doc.add(new StringField("name", "lucene是一款非常优秀的全文检索框架", Store.YES));
        // doc.add(new TextField("content", "今天发工资了吗", Store.YES));
        // Document doc2=new Document();
        // doc2.add(new StringField("id", "4", Store.YES));
        // doc2.add(new StringField("name", "今天天气不错呀", Store.YES));
        // doc2.add(new TextField("content", "钱存储在银行靠谱吗", Store.YES));
        // Document doc3=new Document();
        // doc3.add(new StringField("id", "5", Store.YES));
        // doc3.add(new StringField("name", "没有根的野草,飘忽的命途!", Store.YES));
        // doc3.add(new TextField("content", "你工资多少呀!", Store.YES));
        // writer.addDocument(doc);
        // writer.addDocument(doc2);
        // writer.addDocument(doc3);
        for (int i = 6; i < 10000; i++) {
            Document doc = new Document();
            doc.add(new StringField("id", i + "", Store.YES));
            doc.add(new StringField("name", "lucene是一款非常优秀的全文检索框架" + i, Store.YES));
            doc.add(new TextField("content", "今天发工资了吗" + i, Store.YES));
            writer.addDocument(doc);
            if (i % 1000 == 0) {
                writer.commit();   // commit in batches of 1000 documents
            }
        }
        writer.forceMerge(1);
        writer.commit();
        System.out.println("Bulk indexing completed successfully!");
        writer.close();
    }

    /**
     * Add a single document to the index.
     **/
    public static void add(Document d) throws Exception {
        IndexWriter writer = getIndexWriter();
        writer.addDocument(d);
        writer.forceMerge(1);
        writer.commit();
        System.out.println("Document indexed successfully!");
        writer.close();
    }

    /**
     * Delete the document with the given id from the index on HDFS.
     **/
    public static void delete(String id) throws Exception {
        IndexWriter writer = getIndexWriter();
        writer.deleteDocuments(new Term("id", id));   // delete by id
        writer.forceMerge(1);                         // reclaim the space of deleted documents
        writer.commit();                              // commit the change
        writer.close();
        System.out.println("Document with id " + id + " deleted successfully.........");
    }

    /**
     * Search the index.
     **/
    public static void query(String queryTerm) throws Exception {
        System.out.println("Query term: " + queryTerm);
        Configuration conf = new Configuration();
        //Path path=new Path("hdfs://192.168.75.130:9000/root/index");
        Path path = new Path("hdfs://192.168.75.130:9000/root/output/map1");
        Directory directory = new HdfsDirectory(path, conf);
        IndexReader reader = DirectoryReader.open(directory);
        System.out.println("Total documents: " + reader.numDocs());
        long a = System.currentTimeMillis();
        IndexSearcher searcher = new IndexSearcher(reader);
        // "city" is the default search field of the index stored at this path
        QueryParser parser = new QueryParser(Version.LUCENE_46, "city", new SmartChineseAnalyzer(Version.LUCENE_46));
        Query query = parser.parse(queryTerm);
        TopDocs docs = searcher.search(query, 100);
        System.out.println("Hits: " + docs.totalHits);
        // for (ScoreDoc sc : docs.scoreDocs) {
        //     System.out.println("score: " + sc.score + " id: " + searcher.doc(sc.doc).get("id")
        //             + " name: " + searcher.doc(sc.doc).get("name")
        //             + " content: " + searcher.doc(sc.doc).get("content"));
        // }
        long b = System.currentTimeMillis();
        System.out.println("First search took: " + (b - a) + " ms");
        System.out.println("============================================");
        long c = System.currentTimeMillis();
        query = parser.parse(queryTerm);
        docs = searcher.search(query, 100);
        System.out.println("Hits: " + docs.totalHits);
        // for (ScoreDoc sc : docs.scoreDocs) {
        //     System.out.println("score: " + sc.score + " id: " + searcher.doc(sc.doc).get("id")
        //             + " name: " + searcher.doc(sc.doc).get("name")
        //             + " content: " + searcher.doc(sc.doc).get("content"));
        // }
        long d = System.currentTimeMillis();
        System.out.println("Second search took: " + (d - c) + " ms");
        reader.close();
        directory.close();
        System.out.println("Search finished...............");
    }
}
The code above is my test example. In my tests, adding, deleting, updating and querying a Lucene index stored on HDFS all work. One thing deserves attention, though: combining Lucene with Hadoop really can speed up index building considerably, but it offers no advantage at query time. Searching does work, just slowly. The current storage implementation relies on a block cache to keep query performance barely acceptable, and once the data volume grows, query performance becomes very poor. So far there is no good solution for this, unless Lucene or Solr one day gains an HBase-like storage structure, in which case searching might improve a great deal.
The code above writes the index into Hadoop 1.x. In a follow-up post I will show how to build an index on Hadoop 2.x and how to build indexes in parallel with MapReduce, roughly along the lines sketched below.
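Until then, here is a minimal sketch of that idea, my own assumption rather than the forthcoming example: each reduce task opens its own IndexWriter over an HdfsDirectory and writes one index directory per task. The field names, the output path layout and the Text/Text value types are placeholders.

package indexhadoop;

import java.io.IOException;

import hdfs.HdfsDirectory;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.util.Version;

public class IndexReducer extends Reducer<Text, Text, Text, Text> {

    private IndexWriter writer;

    @Override
    protected void setup(Context context) throws IOException {
        Configuration conf = context.getConfiguration();
        // One index directory per reduce task; the query() demo above reads one such directory.
        String taskId = context.getTaskAttemptID().getTaskID().toString();
        Path indexPath = new Path("hdfs://192.168.75.130:9000/root/output/" + taskId);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_46,
                new SmartChineseAnalyzer(Version.LUCENE_46));
        writer = new IndexWriter(new HdfsDirectory(indexPath, conf), config);
    }

    @Override
    protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException {
        // Turn each record into a Lucene document instead of writing it to the job output.
        for (Text value : values) {
            Document doc = new Document();
            doc.add(new StringField("id", key.toString(), Store.YES));
            doc.add(new TextField("content", value.toString(), Store.YES));
            writer.addDocument(doc);
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException {
        // Flush and close the per-task index when the reduce task finishes.
        writer.commit();
        writer.close();
    }
}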