最近一段时间由于公司需要 ,模糊搜索出相似的关键词,所以直接考虑使用了Lucene。Lucene允许你往程序中添加搜索功能,Lucene能够把你从文本中解析出来的数据进行索引和搜索 ,Lucene不关心数据来源 甚至不关心语种,不过你需要把它转换成文本格式。也就是说你可以搜索 html网页,文本文档,word文档 ,pdf,或者其他一些 总之 只要能够提取出文本信息的即可。同样你也可以利用Lucene来索引存储在数据库中的数据,以给你的用户提供一些 比如 全文搜索功能等 ,反正Lucene的功能很是强大。里面还有很多开源的对不同语言进行分析的插件等。下面我介绍一个例子 ,这里我进行对 一个txt文档的 每一行进行了 索引的添加 ,也就是说 把每一行 当作一个document对象来处理,实际上在Lucene中 每一个document 相当于数据库表中的一条记录, 而每个field相当于记录中的一个字段(列) ,它能够对文本进行自动处理去掉里面的一些语气词,它能把你规定的域当作关键词来进行索引 以备查询时使用,Lucene比较容易使用 ,但是不如数据库灵活,速度很快。下面 我用一个例子来说明(这里我用的Lucene4.7.2,最高版本 ,你需要注意把需要的一些jar包引入到你的工程中,使用maven可直接引入依赖http://mvnrepository.com/artifact/org.apache.Lucene需要的全部引入)我这里写了一个实例 你可以进行参考学习使用方法。--------------------------------------分割线 --------------------------------------基于Lucene多索引进行索引和搜索 http://www.linuxidc.com/Linux/2012-05/59757.htmLucene 实战(第2版) 中文版 配套源代码 http://www.linuxidc.com/Linux/2013-10/91055.htmLucene 实战(第2版) PDF高清中文版 http://www.linuxidc.com/Linux/2013-10/91052.htm使用Lucene-Spatial实现集成地理位置的全文检索 http://www.linuxidc.com/Linux/2012-02/53117.htmLucene + Hadoop 分布式搜索运行框架 Nut 1.0a9 http://www.linuxidc.com/Linux/2012-02/53113.htmLucene + Hadoop 分布式搜索运行框架 Nut 1.0a8 http://www.linuxidc.com/Linux/2012-02/53111.htmLucene + Hadoop 分布式搜索运行框架 Nut 1.0a7 http://www.linuxidc.com/Linux/2012-02/53110.htmProject 2-1: 配置Lucene, 建立WEB查询系统[Ubuntu 10.10] http://www.linuxidc.com/Linux/2010-11/30103.htm--------------------------------------分割线 --------------------------------------package lucene.home.clq;/** * @author chenlongquan * Copyright Manning Publications Co..com * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific lan *///创建索引 import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Version; import java.io.BufferedReader; import java.io.File; import java.io.FileFilter; import java.io.FileInputStream; import java.io.IOException; import java.io.FileReader; import java.io.InputStreamReader; import java.io.LineNumberReader; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; /** * This code was originally build for the index * */ public class Indexer { public static void main(String[] args) throws Exception {
String indexDir = "f:\index"; //1 String dataDir = "f:\baidu"; //2 long start = System.currentTimeMillis(); Indexer indexer = new Indexer(indexDir); int numIndexed; try { numIndexed = indexer.index(dataDir, new TextFilesFilter()); } finally { indexer.close(); } long end = System.currentTimeMillis(); System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds"); } private IndexWriter writer; public Indexer(String indexDir) throws IOException { Directory dir = FSDirectory.open(new File(indexDir)); writer = new IndexWriter(dir,indexWriterConfig()); //在这里进行索引的调试 } public void close() throws IOException { writer.close(); //4 } private IndexWriterConfig indexWriterConfig() { Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_47); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer); return config; } public int index(String dataDir, FileFilter filter) throws Exception { File[] files = new File(dataDir).listFiles(); for (File f: files) { if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead() && (filter == null || filter.accept(f))) { indexFile(f); } } return writer.numDocs(); //5 } private static class TextFilesFilter implements FileFilter { public boolean accept(File path) { return path.getName().toLowerCase() //6 .endsWith(".txt"); //6 } }
//10 } //读取一个文件 private List<String> readFile(File filePathAndName)throws IOException {FileInputStream fis = new FileInputStream(filePathAndName); InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); BufferedReader br = new BufferedReader(isr); LineNumberReader lnr = new LineNumberReader(br);List<String> returnValue = new ArrayList<String>(); int cnt = 0; while (true) { cnt++; String tempStr = lnr.readLine(); if (tempStr == null) break; if (tempStr.length() < 2) continue; returnValue.add(tempStr); } lnr.close(); br.close(); isr.close(); fis.close(); return returnValue; } //读取一个文件并排重后返回 public static List<String> readFileNoDup(File filePathAndName) throws IOException {
FileInputStream fis = new FileInputStream(filePathAndName); InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); BufferedReader br = new BufferedReader(isr); LineNumberReader lnr = new LineNumberReader(br);Set<String> set = new HashSet<String>(); while (true) { String tempStr = lnr.readLine(); if (tempStr == null) break; if (tempStr.length() < 2) continue; set.add(tempStr.trim()); } lnr.close(); br.close(); isr.close(); fis.close(); List<String> returnValue = new ArrayList<String>(set.size()); returnValue.addAll(set); return returnValue; } }//对刚才已经建好的索引进行搜索package lucene.home.clq;
/** * Copyright Manning Publications Co. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific lan */ import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopFieldCollector; import org.apache.lucene.search.TopFieldDocs; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.Version; // From chapter 1 /** * This code was originally written for searcher * */ public class Searcher { public static void main(String[] args) throws IllegalArgumentException, IOException, ParseException {