Lucene全文检索之HelloWorld
1.下载Lucene4.4 然后解压
2.新建一个Java项目,名称为HelloLucene
3.新建一个lib文件夹,将需要的jar文件复制到lib中,本项目所需要的jar文件如下:
[图]
然后将这些jar文件添加到buildPath中
3.新建一个包com.njupt.zhb,新建一个类:HelloLucene.java,代码如下
[java code]
package com.njupt.zhb;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
/*
*@author: ZhengHaibo
*2013-7-05 Nanjing,njupt,China
*/
public class HelloLucene {
/**
* Index all text files under a directory.
* String indexPath = "index";//索引保存的路径
* String docsPath = "";//文档保存的路径(待索引)
*/
public void index(String indexPath,String docsPath) {
try {
// 1.创建Directory
Directory dir = FSDirectory.open(new File(indexPath));//保存在硬盘上
// 2.创建IndexWriter
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_44,
analyzer);
iwc.setOpenMode(OpenMode.CREATE_OR_APPEND);// 设置创建或追加模式
IndexWriter writer = new IndexWriter(dir, iwc);
final File docDir = new File(docsPath);
indexDocs(writer, docDir);
writer.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
public void indexDocs(IndexWriter writer, File file) throws IOException {
if (file.canRead()) {
if (file.isDirectory()) {//如果是文件夹,则遍历文件夹内的所有文件
String[] files = file.list();
// an IO error could occur
if (files != null) {
for (int i = 0; i < files.length; i++) {
indexDocs(writer, new File(file, files[i]));
}
}
} else {//如果是文件
FileInputStream fis;
try {
fis = new FileInputStream(file);
} catch (FileNotFoundException fnfe) {
return;
}
try {
// 3.创建Document对象
Document doc = new Document();
// 4.为Document添加Field
// Add the path of the file as a field named "path". Use a
// field that is indexed (i.e. searchable), but don't
// tokenize
// the field into separate words and don't index term
// frequency
// or positional information:
//以文件的文件路径建立Field
Field pathField = new StringField("path", file.getPath(),Field.Store.YES);
doc.add(pathField);//添加到文档中
//以文件的名称建立索引域
doc.add( new StringField("filename", file.getName(),Field.Store.YES));//添加到文档中
// Add the last modified date of the file a field named
// "modified".
// Use a LongField that is indexed (i.e. efficiently
// filterable with
// NumericRangeFilter). This indexes to milli-second
// resolution, which
// is often too fine. You could instead create a number
// based on
// year/month/day/hour/minutes/seconds, down the resolution
// you require.
// For example the long value 2011021714 would mean
// February 17, 2011, 2-3 PM.
doc.add(new LongField("modified", file.lastModified(),Field.Store.YES));
// Add the contents of the file to a field named "contents".
// Specify a Reader,
// so that the text of the file is tokenized and indexed,
// but not stored.
// Note that FileReader expects the file to be in UTF-8
// encoding.
// If that's not the case searching for special characters
// will fail.
//以文件的内容建立索引域(Field)
doc.add(new TextField("contents", new BufferedReader(new InputStreamReader(fis, "UTF-8"))));
if (writer.getConfig().getOpenMode() == OpenMode.CREATE) {
// New index, so we just add the document (no old
// document can be there):
System.out.println("adding " + file);
writer.addDocument(doc);//将文档写入到索引中(以创建的方式)
} else {
// Existing index (an old copy of this document may have
// been indexed) so
// we use updateDocument instead to replace the old one
// matching the exact
// path, if present:
System.out.println("updating " + file);
writer.updateDocument(new Term("path", file.getPath()),doc);//以追加方式写入到索引中
}
} finally {
fis.close();
}
}
}
}
/**
* 搜索
*/
public void searcher(String indexPath){
try {
IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)));
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_44);
String field = "contents";//搜索域是:文档的内容
QueryParser parser = new QueryParser(Version.LUCENE_44, field, analyzer);
Query query= parser.parse("南京");//搜索内容中含有“南京”的文档
TopDocs tds=searcher.search(query, 10);//搜索前十个
ScoreDoc[] sds= tds.scoreDocs;
for (ScoreDoc sd:sds) {//将内容中含有“南京”关键字的文档遍历一遍
Document document=searcher.doc(sd.doc);
System.out.println("score:"+sd.score+"--filename:"+document.get("filename")+
"--path:"+document.get("path")+"--time"+document.get("modified"));//打印检索结果中文档的路径
}
reader.close();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}catch (ParseException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
}
4.为了实验,我在D盘下新建了一个文件夹lucene,里面有三个文件,它们的内容如下:
lucene1.txt
Nanjing University Of Posts & Telec
lucene2
. txt
南京邮电大学
北京市海淀区
上海市南京路
lucene3. txt
2014南京青奥会
5.新建一个Junit测试类,对index函数和searcher函数进行测试,代码如下:
[java code]
package com.njupt.zhb;
import org.junit.Test;
/*
*@author: ZhengHaibo
*2013-7-05 Nanjing,njupt,China
*/
public class TestJunit {
@Test
public void TestIndex(){
HelloLucene hLucene=new HelloLucene();
hLucene.index("index", "D:\\lucene");
}
@Test
public void TestSearcher(){
HelloLucene hLucene=new HelloLucene();
hLucene.searcher("index");
}
}
以Junit方式运行TestIndex函数,运行结果如下:
updating D:\lucene\lucene1.txt
updating D:\lucene\lucene2.txt
updating D:\lucene\lucene3.txt
索引建立完成!
在项目目录的index目录下,就生成了如下索引文件:
[图]
6.搜索,测试TestSearcher函数,运行结果如下:
score:0.53033006--filename:lucene3.txt--path:D:\lucene\lucene3.txt--time1376828819375
score:0.48666292--filename:lucene2.txt--path:D:\lucene\lucene2.txt--time1376828783791
可以看出,我们这里只是把分数打印出来了,并没有排名,分数越小,说明越相似!
未经允许不得用于商业目的