Lucene笔记一
Lucene就是一個全文檢索的工具,建立索引用的,類似于新華字典的目錄
這里使用的是lucene-4.4.0版本,入門代碼所需jar包如下圖所示(解壓lucene-4.4.0后的目錄):
入門代碼:
import java.io.File; import java.io.IOException;import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.IntField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexableField; import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.junit.Test;/*8* luceneDemo* */ public class TestLucene {/*** 通過lucene 提供的api 對(duì)數(shù)據(jù)建立索引,indexWriter* @throws IOException * */@Testpublic void testAdd() throws IOException{//索引在硬盤上面存放的位置..Directory directory=FSDirectory.open(new File("D:/INDEX"));//lucene 當(dāng)前使用的版本...Version matchVersion=Version.LUCENE_44;//分詞器...(把一段文本分詞)(黑馬程序員是高端的培訓(xùn)機(jī)構(gòu))//analzyer 是一個(gè)抽象類,具體的切分詞規(guī)則由子類實(shí)現(xiàn)...Analyzer analyzer=new StandardAnalyzer(matchVersion);IndexWriterConfig config=new IndexWriterConfig(matchVersion, analyzer);//構(gòu)造索引寫入的對(duì)象..IndexWriter indexWriter=new IndexWriter(directory, config);//往索引庫(kù)里面寫數(shù)據(jù)..//索引庫(kù)里面的數(shù)據(jù)都是document 一個(gè)document相當(dāng)于是一條記錄//這個(gè)document里面的數(shù)據(jù)相當(dāng)于索引結(jié)構(gòu)..Document document=new Document();IndexableField indexableField=new IntField("id",1, Store.YES);IndexableField stringfield=new StringField("title","對(duì)王召廷的個(gè)人評(píng)價(jià)",Store.YES);IndexableField teIndexableField=new 
TextField("content","風(fēng)流倜儻有點(diǎn)黃",Store.YES);document.add(indexableField);document.add(stringfield);document.add(teIndexableField);//索引庫(kù)里面接收的數(shù)據(jù)都是document對(duì)象 indexWriter.addDocument(document);indexWriter.close();}/*** 對(duì)建立的索引進(jìn)行搜索...* 通過indexSearcher 去搜索...* @throws IOException */@Testpublic void testSearcher() throws IOException{//索引在硬盤上面存放的位置..Directory directory=FSDirectory.open(new File("D:/INDEX"));//把索引目錄里面的索引讀取到IndexReader 當(dāng)中...IndexReader indexReader=DirectoryReader.open(directory); // /構(gòu)造搜索索引的對(duì)象..IndexSearcher indexSearcher=new IndexSearcher(indexReader);//Query 它是一個(gè)查詢條件對(duì)象,它是一個(gè)抽象類,不同的查詢規(guī)則就構(gòu)造不同的子類...Query query=new TermQuery(new Term("title", "對(duì)王召廷的個(gè)人評(píng)價(jià)"));//檢索符合query 條件的前面N 條記錄..// TopDocs topDocs=indexSearcher.search(query, 10);//返回總記錄數(shù)... System.out.println(topDocs.totalHits);//存放的都是document 的idScoreDoc scoreDocs []=topDocs.scoreDocs;for(ScoreDoc scoreDoc:scoreDocs){//返回的就是document idint docID=scoreDoc.doc;//我還需要根據(jù)id 檢索到對(duì)應(yīng)的documentDocument document=indexSearcher.doc(docID);System.out.println("id=="+document.get("id"));System.out.println("title=="+document.get("title"));System.out.println("content=="+document.get("content"));}}}原理分析圖:
demo演示:
根據入門代碼流程提煉工具類代碼:
import java.io.File; import java.io.IOException;import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version;/*** lucene 工具類...* @author Administrator**/ /*** 提煉規(guī)則,假設(shè)這段代碼可以完成一個(gè)功能,把這個(gè)代碼提煉到一個(gè)方法里面去,假設(shè)這個(gè)方法在某個(gè)業(yè)務(wù)羅繼承可以共用,那么往上抽取,* 假設(shè)在其它邏輯層也可以用,提煉到工具類里面去。* */ public class LuceneUtils {private static IndexWriter indexWriter=null;private static IndexSearcher indexSearcher=null;//索引存放目錄..private static Directory directory=null;private static IndexWriterConfig indexWriterConfig=null;private static Version version=null;private static Analyzer analyzer=null;static {try {directory=FSDirectory.open(new File(Constants.URL));version=Version.LUCENE_44;analyzer=new StandardAnalyzer(version);indexWriterConfig=new IndexWriterConfig(version, analyzer);} catch (IOException e) {e.printStackTrace();}}/*** * @return 返回用于操作索引的對(duì)象...* @throws IOException*/public static IndexWriter getIndexWriter() throws IOException{indexWriter=new IndexWriter(directory, indexWriterConfig);return indexWriter;}/*** 返回用于搜索索引的對(duì)象...* @return* @throws IOException */public static IndexSearcher getIndexSearcher() throws IOException{IndexReader indexReader=DirectoryReader.open(directory);indexSearcher=new IndexSearcher(indexReader);return indexSearcher;}/*** * 返回lucene 當(dāng)前的版本...* @return*/public static Version getVersion() {return version;}/*** * 返回lucene 當(dāng)前使用的分詞器..* @return*/public static Analyzer getAnalyzer() {return analyzer;}} public class Constants {/*** 索引存放的目錄*/public static final String URL="d:/indexdir/news"; }bean:
package cn.itcast.bean;

/**
 * Plain bean describing one article: numeric id, title, content, author and url.
 */
public class Article {

    private int id;
    private String title;
    private String content;
    private String author;
    private String url;

    /** @return the article id */
    public int getId() {
        return this.id;
    }

    /** @param id the article id */
    public void setId(int id) {
        this.id = id;
    }

    /** @return the title */
    public String getTitle() {
        return this.title;
    }

    /** @param title the title */
    public void setTitle(String title) {
        this.title = title;
    }

    /** @return the content */
    public String getContent() {
        return this.content;
    }

    /** @param content the content */
    public void setContent(String content) {
        this.content = content;
    }

    /** @return the author */
    public String getAuthor() {
        return this.author;
    }

    /** @param author the author */
    public void setAuthor(String author) {
        this.author = author;
    }

    /** @return the url */
    public String getUrl() {
        return this.url;
    }

    /** @param url the url */
    public void setUrl(String url) {
        this.url = url;
    }
}
轉(zhuǎn)換工具類:
package cn.itcast.lucene;import org.apache.lucene.document.Document; import org.apache.lucene.document.IntField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.document.Field.Store; import org.apache.lucene.index.IndexableField;import cn.itcast.bean.Article;/*8* 對(duì)象與索引庫(kù)document 之間的轉(zhuǎn)化* */ public class ArticleToDocument {public static Document articleToDocument(Article article){Document document=new Document();IntField idfield=new IntField("id", article.getId(), Store.YES);//StringField 對(duì)應(yīng)的值不分詞,textField 分詞..TextField titleField=new TextField("title", article.getTitle(),Store.YES);TextField contentField=new TextField("content", article.getContent(),Store.YES);//修改這個(gè)字段對(duì)應(yīng)的權(quán)重值,默認(rèn)這個(gè)值為1f // contentField.setBoost(3f);StringField authorField=new StringField("author", article.getAuthor(), Store.YES);StringField urlField=new StringField("url", article.getUrl(), Store.YES);document.add(idfield);document.add(titleField);document.add(contentField);document.add(authorField);document.add(urlField);return document;}}Dao層:
package cn.itcast.dao;

import java.io.IOException;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;

import cn.itcast.bean.Article;
import cn.itcast.lucene.ArticleToDocument;
import cn.itcast.uitls.LuceneUtils;

/**
 * Operates on the index through the Lucene API: add, delete, update and
 * paged search.
 *
 * @author Administrator
 */
public class LuceneDao {

    /**
     * Adds one article to the index.
     *
     * @param article the article to index
     * @throws IOException if the index cannot be written
     */
    public void addIndex(Article article) throws IOException {
        IndexWriter indexWriter = LuceneUtils.getIndexWriter();
        try {
            Document doc = ArticleToDocument.articleToDocument(article);
            indexWriter.addDocument(doc);
        } finally {
            // Close even on failure so the writer is never left open on the index.
            indexWriter.close();
        }
    }

    /**
     * Deletes every record that contains the given term.
     *
     * @param fieldName  name of the field to match
     * @param fieldValue term value to match in that field
     * @throws IOException if the index cannot be written
     */
    public void delIndex(String fieldName, String fieldValue) throws IOException {
        IndexWriter indexWriter = LuceneUtils.getIndexWriter();
        try {
            indexWriter.deleteDocuments(new Term(fieldName, fieldValue));
        } finally {
            indexWriter.close();
        }
    }

    /**
     * Updates the index, comparable to {@code update table set ? where condition}.
     * Lucene first deletes the records matching the term and then creates a new
     * record from the given article.
     *
     * @param fieldName  name of the field that selects the records to replace
     * @param fieldValue term value that selects the records to replace
     * @param article    the new content
     * @throws IOException if the index cannot be written
     */
    public void updateIndex(String fieldName, String fieldValue, Article article) throws IOException {
        IndexWriter indexWriter = LuceneUtils.getIndexWriter();
        try {
            // 1: the term is the update condition; 2: the document is the new content.
            Term term = new Term(fieldName, fieldValue);
            Document doc = ArticleToDocument.articleToDocument(article);
            indexWriter.updateDocument(term, doc);
        } finally {
            indexWriter.close();
        }
    }

    /**
     * Searches the title and content fields for the keywords and prints one
     * page of hits. Successive pages are requested as (0,10), (10,10), (20,10), ...
     *
     * @param keywords    query text, parsed by the query parser
     * @param firstResult zero-based index of the first hit to print
     * @param maxResult   maximum number of hits to print
     * @throws IllegalArgumentException if {@code firstResult} is negative
     * @throws Exception if the query cannot be parsed or the index cannot be read
     */
    public void findIndex(String keywords, int firstResult, int maxResult) throws Exception {
        if (firstResult < 0) {
            throw new IllegalArgumentException("firstResult must be >= 0, but was " + firstResult);
        }

        IndexSearcher indexSearcher = LuceneUtils.getIndexSearcher();
        try {
            // A single-field search would be: new TermQuery(new Term("title", ...)).
            // Here the query parser is used to search several fields at once, like
            // "select * from table where title=... or content=...". It lives in the
            // separate queryparser jar (queryparser folder of the Lucene distribution).
            String[] fields = {"title", "content"};
            QueryParser queryParser =
                    new MultiFieldQueryParser(LuceneUtils.getVersion(), fields, LuceneUtils.getAnalyzer());
            Query query = queryParser.parse(keywords);

            // Fetch the first N matches; the result holds the total hit count
            // and the ScoreDocs, which carry the document ids.
            TopDocs topDocs = indexSearcher.search(query, firstResult + maxResult);
            ScoreDoc[] scoreDocs = topDocs.scoreDocs;

            // Fewer hits than requested may come back, so stop at whichever is smaller.
            int endResult = Math.min(scoreDocs.length, firstResult + maxResult);
            for (int i = firstResult; i < endResult; i++) {
                // The doc id is maintained by Lucene itself.
                int docID = scoreDocs[i].doc;
                Document document = indexSearcher.doc(docID);
                System.out.println("id===" + document.get("id"));
                System.out.println("title===" + document.get("title"));
                System.out.println("content===" + document.get("content"));
                System.out.println("url===" + document.get("url"));
                System.out.println("author===" + document.get("author"));
            }
        } finally {
            // Release the reader that backs this searcher.
            indexSearcher.getIndexReader().close();
        }
    }
}
測(cè)試類:
package cn.itcast.junit;

import java.io.IOException;

import org.junit.Test;

import cn.itcast.bean.Article;
import cn.itcast.dao.LuceneDao;

/**
 * Exercises {@link LuceneDao}: create, search, delete and update.
 *
 * @author Administrator
 */
public class LuceneDaoTest {

    private LuceneDao luceneDao = new LuceneDao();

    /** Indexes the articles whose ids lie in the range 28..28. */
    @Test
    public void testCreate() throws IOException {
        int id = 28;
        while (id <= 28) {
            luceneDao.addIndex(sampleArticle(id, "矯情我覺得這句話太矯情了矯情矯情矯情矯情矯情矯情"));
            id++;
        }
    }

    /**
     * Searches for a keyword and prints the third page (first result 20, page size 10).
     * The title and content being searched were indexed as TextFields, i.e.
     * tokenized by the standard analyzer.
     */
    @Test
    public void testsearcher() throws Exception {
        luceneDao.findIndex("夢(mèng)想", 20, 10);
    }

    /** Deletes the records whose title contains the given term. */
    @Test
    public void testdelete() throws IOException {
        luceneDao.delIndex("title", "定");
    }

    /** Replaces the records whose title contains the given term with a new article. */
    @Test
    public void testUpdate() throws IOException {
        Article replacement = sampleArticle(9527, "我覺得這句話太矯情了");
        luceneDao.updateIndex("title", "定", replacement);
    }

    /** Builds the sample article used by the tests; only id and content vary. */
    private static Article sampleArticle(int id, String content) {
        Article article = new Article();
        article.setId(id);
        article.setTitle("一定要夢(mèng)想,萬(wàn)一實(shí)現(xiàn)了勒");
        article.setContent(content);
        article.setUrl("http://www.tianmao.com");
        article.setAuthor("馬云");
        return article;
    }
}
?分詞器的流程圖:
關于分詞器,網上可以找到很多種類的分詞器配合Lucene使用,相關分詞規則查看對應說明。
舉例如下:
Analyzer analyzer=new StandardAnalyzer(Version.LUCENE_44);// Chinese is split into single characters; English is split into words on whitespace
Analyzer analyzer=new CJKAnalyzer(Version.LUCENE_44);// bigram tokenization: every two adjacent Chinese characters form one index term
Analyzer analyzer=new IKAnalyzer();// third-party analyzer with better Chinese support; dictionary words and stop words can be customized
?
索引庫優化
package cn.itcast.lucene;

import java.io.File;
import java.io.IOException;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import org.junit.Test;

import cn.itcast.uitls.Constants;

/**
 * Sketches of several ways to optimize a Lucene index.
 */
public class TestOptimise {

    /**
     * First approach: tune the merge policy through {@link IndexWriterConfig}.
     *
     * @throws IOException if the index cannot be opened for writing
     */
    public void testoptimise() throws IOException {
        Directory directory = FSDirectory.open(new File(Constants.URL));
        try {
            Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
            IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_44, analyzer);

            LogDocMergePolicy mergePolicy = new LogDocMergePolicy();
            // Merge factor trade-off:
            //   smaller values (2 <= n <= 10): less memory is used while indexing,
            //   searching is faster, indexing is slower;
            //   larger values (n > 10): more memory is used while indexing,
            //   searching is slower, indexing is faster.
            mergePolicy.setMergeFactor(10);
            config.setMergePolicy(mergePolicy);

            IndexWriter indexWriter = new IndexWriter(directory, config);
            // Nothing is written in this sketch; close the writer right away so
            // it is not left open on the index directory.
            indexWriter.close();
        } finally {
            directory.close();
        }
    }

    /**
     * Second approach: optimize through the directory by loading the on-disk
     * index into memory and searching the in-memory copy.
     *
     * @throws IOException if the index cannot be read
     */
    @Test
    public void testoptimise2() throws IOException {
        // The index currently lives on disk.
        Directory directory = FSDirectory.open(new File(Constants.URL));
        try {
            // Copy the contents of the disk directory into a RAM directory.
            IOContext ioContext = new IOContext();
            Directory directory1 = new RAMDirectory(directory, ioContext);
            try {
                IndexReader indexReader = DirectoryReader.open(directory1);
                try {
                    IndexSearcher indexSearcher = new IndexSearcher(indexReader);
                    Query query = new TermQuery(new Term("title", "想"));
                    TopDocs topDocs = indexSearcher.search(query, 100);
                    System.out.println(topDocs.totalHits);
                } finally {
                    indexReader.close();
                }
            } finally {
                directory1.close();
            }
        } finally {
            directory.close();
        }
    }

    /**
     * Third approach (placeholder): large index files slow searching down, so
     * reduce the index size, e.g. by excluding stop words.
     */
    public void testoptimise3() {
    }

    /**
     * Fourth approach (placeholder): store the index in separate directories
     * or disks, with the data grouped by category.
     */
    public void testoptimise4() {
    }
}
?
轉載于:https://www.cnblogs.com/lm970585581/p/9410322.html
總結
- 上一篇: return 的使用
- 下一篇: SpringBoot随笔