| 因為lucene索引的時候是將String型的信息建立索引的,所以這里必須是將word/pdf/html等文件的內(nèi)容轉(zhuǎn)化問字符型。 lucene的jar包自己去下載。 首先是建立索引的代碼: public class TextFileIndexer {??? public static void main(String[] args) throws Exception {??? /* 指明要索引文件夾的位置,這里是d盤的s文件夾下 */ ???????? File fileDir = new File("d:\\s");??? /* 這里放索引文件的位置 */ ???????? File indexDir = new File("d:\\index");??? ???????? Analyzer luceneAnalyzer = new StandardAnalyzer();??? ???????? IndexWriter indexWriter = new IndexWriter(indexDir, luceneAnalyzer,??? true);??? ???????? File[] textFiles = fileDir.listFiles();??? long startTime = new Date().getTime();??? //增加document到索引去 ???????????????? System.out.println("File正在被索引.");?? /* ????????????????? * 注意要變的就是這里,路徑和讀取文件的方法 ????????????????? * */ ???????????????? String path ="d:\\s\\2.doc"; ???????????????? String temp = ReadFile.readWord(path); //???????????????? String path ="d:\\s\\index.htm"; //???????????????? String temp = ReadFile.readHtml(path); ???????????????? Document document = new Document();??? ???????????????? Field FieldPath = new Field("path",path, ???????????????????????? Field.Store.YES, Field.Index.NO);??? ???????????????? Field FieldBody = new Field("body", temp, Field.Store.YES,??? ???????????????????????? Field.Index.TOKENIZED,??? ???????????????????????? Field.TermVector.WITH_POSITIONS_OFFSETS);??? ???????????????? document.add(FieldPath);??? ???????????????? document.add(FieldBody);??? ???????????????? indexWriter.addDocument(document);??? //optimize()方法是對索引進(jìn)行優(yōu)化 ???????? indexWriter.optimize();??? ???????? indexWriter.close();??? //測試一下索引的時間 long endTime = new Date().getTime();??? ???????? System.out??? ???????????????? .println("這花費(fèi)了"?? ??????????????????????? + (endTime - startTime)??? ??????????????????????? + " 毫秒來把文檔增加到索引里面去!"?? ??????????????????????? + fileDir.getPath());??? ???? }?? } 上面已經(jīng)注釋了要換的地方,我們要做的就是換文件的路徑和讀取文件的方法。 下面來具體看下讀取文件的方法 1.首先來看WORD文檔: 我這里用的是poi,相關(guān)jar包自己去下載,然后加到工程中(以下所要用的jar包也是,不再重復(fù)說)。 來看相關(guān)代碼: public static String readWord(String path) { ???????? StringBuffer content = new StringBuffer("");// 文檔內(nèi)容 try { ???????????? HWPFDocument doc = new HWPFDocument(new FileInputStream(path)); ???????????? Range range = doc.getRange(); int paragraphCount = range.numParagraphs();// 段落 for (int i = 0; i < paragraphCount; i++) {// 遍歷段落讀取數(shù)據(jù) ???????????????? Paragraph pp = range.getParagraph(i); ???????????????? content.append(pp.text()); ???????????? } ???????? } catch (Exception e) { ???????? } return content.toString().trim(); ???? } 2.PDF文件用的是PDFbox: public static String readPdf(String path) throws Exception { ???????? StringBuffer content = new StringBuffer("");// 文檔內(nèi)容 ???????? FileInputStream fis = new FileInputStream(path); ???????? PDFParser p = new PDFParser(fis); ???????? p.parse(); ???????? PDFTextStripper ts = new PDFTextStripper(); ???????? content.append(ts.getText(p.getPDDocument())); ???????? fis.close(); return content.toString().trim(); ???? } 3.html文件: public static String readHtml(String urlString) { ???????? StringBuffer content = new StringBuffer(""); ???????? File file = new File(urlString); ???????? FileInputStream fis = null; try { ???????????? fis = new FileInputStream(file); // 讀取頁面 ???????????? BufferedReader reader = new BufferedReader(new InputStreamReader( ???????????????????? fis,"utf-8"));//這里的字符編碼要注意,要對上html頭文件的一致,否則會出亂碼 ???????????? String line = null; while ((line = reader.readLine()) != null) { ???????????????? content.append(line + "\n"); ???????????? } ???????????? reader.close(); ???????? } catch (Exception e) { ???????????? e.printStackTrace(); ???????? } ???????? String contentString = content.toString(); return contentString; ???? } 4.txt文件: public static String readTxt(String path) { ???????? StringBuffer content = new StringBuffer("");// 文檔內(nèi)容 try { ???????????? FileReader reader = new FileReader(path); ???????????? BufferedReader br = new BufferedReader(reader); ???????????? String s1 = null; while ((s1 = br.readLine()) != null) { ???????????????? content.append(s1 + "\r"); ???????????? } ???????????? br.close(); ???????????? reader.close(); ???????? } catch (IOException e) { ???????????? e.printStackTrace(); ???????? } return content.toString().trim(); ???? } 接下來數(shù)搜索代碼: public class TestQuery {??? public static void main(String[] args) throws IOException, ParseException {??? ???????? Hits hits = null;??? //搜索內(nèi)容自己換 ???????? String queryString = "根據(jù)國務(wù)院的決定";??? ???????? Query query = null;?? ???????? IndexSearcher searcher = new IndexSearcher("d:\\index"); //這里注意索引存放的路徑 ???????? Analyzer analyzer = new StandardAnalyzer();??? try {??? ???????????? QueryParser qp = new QueryParser("body", analyzer);??? /** ????????????? * 建索引的時候我們指定了body建立為內(nèi)容,我們搜索的時候也是針對body的,所以 ????????????? *??? QueryParser qp = new QueryParser("body", analyzer); ????????????? *??? 這句和建立索引時候 ???????????????? Field FieldBody = new Field("body", temp, Field.Store.YES,??? ???????????????????????? Field.Index.TOKENIZED,??? ???????????????????????? Field.TermVector.WITH_POSITIONS_OFFSETS); ????????????? *的這句的"body"是對應(yīng)的。 ???????????? */ ???????????? query = qp.parse(queryString);??? ???????? } catch (ParseException e) { ???????????? System.out.println("異常"); ???????? }??? if (searcher != null) {??? ???????????? hits = searcher.search(query);??? if (hits.length() > 0) {??? ???????????????? System.out.println("找到:" + hits.length() + " 個結(jié)果!");?? for (int i = 0; i < hits.length(); i++) {//輸出搜索信息 ????????????????????? Document document = hits.doc(i); ????????????????????? System.out.println("contents:"+document.get("body")); //同樣原理這里的document.get("body")就是取得建立在索引文件里面的額body的所有內(nèi)容 ???????????????????? //你若想輸出文件路徑就用document.get("path")就可以了 ???????????????? } ???????????? } else{ ???????????????? System.out.println("0個結(jié)果!"); ???????????? }??? ???????? }?? ???? } |