最近一段时间由于公司需要 ,模糊搜索出相似的关键词,所以直接考虑使用了Lucene。Lucene允许你往程序中添加搜索功能,Lucene能够把你从文本中解析出来的数据进行索引和搜索 ,Lucene不关心数据来源 甚至不关心语种,不过你需要把它转换成文本格式。也就是说你可以搜索 html网页,文本文档,word文档 ,pdf,或者其他一些 总之 只要能够提取出文本信息的即可。同样你也可以利用Lucene来索引存储在数据库中的数据,以给你的用户提供一些 比如 全文搜索功能等 ,反正Lucene的功能很是强大。里面还有很多开源的对不同语言进行分析的插件等。下面我介绍一个例子 ,这里我进行对 一个txt文档的 每一行进行了 索引的添加 ,也就是说 把每一行 当作一个document对象来处理,实际上在Lucene中 每一个document 相当于数据库表中的一条记录, 而每个field相当于记录中的一个字段(列) ,它能够对文本进行自动处理去掉里面的一些语气词,它能把你规定的域当作关键词来进行索引 以备查询时使用,Lucene比较容易使用 ,但是不如数据库灵活,速度很快。下面 我用一个例子来说明(这里我用的Lucene4.7.2,最高版本 ,你需要注意把需要的一些jar包引入到你的工程中,使用maven可直接引入依赖http://mvnrepository.com/artifact/org.apache.Lucene需要的全部引入)我这里写了一个实例 你可以进行参考学习使用方法。--------------------------------------分割线 --------------------------------------基于Lucene多索引进行索引和搜索 http://www.linuxidc.com/Linux/2012-05/59757.htmLucene 实战(第2版) 中文版 配套源代码 http://www.linuxidc.com/Linux/2013-10/91055.htmLucene 实战(第2版) PDF高清中文版 http://www.linuxidc.com/Linux/2013-10/91052.htm使用Lucene-Spatial实现集成地理位置的全文检索 http://www.linuxidc.com/Linux/2012-02/53117.htmLucene + Hadoop 分布式搜索运行框架 Nut 1.0a9 http://www.linuxidc.com/Linux/2012-02/53113.htmLucene + Hadoop 分布式搜索运行框架 Nut 1.0a8 http://www.linuxidc.com/Linux/2012-02/53111.htmLucene + Hadoop 分布式搜索运行框架 Nut 1.0a7 http://www.linuxidc.com/Linux/2012-02/53110.htmProject 2-1: 配置Lucene, 建立WEB查询系统[Ubuntu 10.10] http://www.linuxidc.com/Linux/2010-11/30103.htm--------------------------------------分割线 --------------------------------------package lucene.home.clq;/** * @author chenlongquan * Copyright Manning Publications Co..com * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
* See the License for the specific lan *///创建索引 import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.TextField; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.Directory; import org.apache.lucene.util.Version; import java.io.BufferedReader; import java.io.File; import java.io.FileFilter; import java.io.FileInputStream; import java.io.IOException; import java.io.FileReader; import java.io.InputStreamReader; import java.io.LineNumberReader; import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Set; /** * This code was originally build for the index * */ public class Indexer { public static void main(String[] args) throws Exception {
String indexDir = "f:\index"; //1 String dataDir = "f:\baidu"; //2 long start = System.currentTimeMillis(); Indexer indexer = new Indexer(indexDir); int numIndexed; try { numIndexed = indexer.index(dataDir, new TextFilesFilter()); } finally { indexer.close(); } long end = System.currentTimeMillis(); System.out.println("Indexing " + numIndexed + " files took " + (end - start) + " milliseconds"); } private IndexWriter writer; public Indexer(String indexDir) throws IOException { Directory dir = FSDirectory.open(new File(indexDir)); writer = new IndexWriter(dir,indexWriterConfig()); //在这里进行索引的调试 } public void close() throws IOException { writer.close(); //4 } private IndexWriterConfig indexWriterConfig() { Analyzer analyzer = new SmartChineseAnalyzer(Version.LUCENE_47); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer); return config; } public int index(String dataDir, FileFilter filter) throws Exception { File[] files = new File(dataDir).listFiles(); for (File f: files) { if (!f.isDirectory() && !f.isHidden() && f.exists() && f.canRead() && (filter == null || filter.accept(f))) { indexFile(f); } } return writer.numDocs(); //5 } private static class TextFilesFilter implements FileFilter { public boolean accept(File path) { return path.getName().toLowerCase() //6 .endsWith(".txt"); //6 } }
//10 } //读取一个文件 private List<String> readFile(File filePathAndName)throws IOException {FileInputStream fis = new FileInputStream(filePathAndName); InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); BufferedReader br = new BufferedReader(isr); LineNumberReader lnr = new LineNumberReader(br);List<String> returnValue = new ArrayList<String>(); int cnt = 0; while (true) { cnt++; String tempStr = lnr.readLine(); if (tempStr == null) break; if (tempStr.length() < 2) continue; returnValue.add(tempStr); } lnr.close(); br.close(); isr.close(); fis.close(); return returnValue; } //读取一个文件并排重后返回 public static List<String> readFileNoDup(File filePathAndName) throws IOException {
FileInputStream fis = new FileInputStream(filePathAndName); InputStreamReader isr = new InputStreamReader(fis, "UTF-8"); BufferedReader br = new BufferedReader(isr); LineNumberReader lnr = new LineNumberReader(br);Set<String> set = new HashSet<String>(); while (true) { String tempStr = lnr.readLine(); if (tempStr == null) break; if (tempStr.length() < 2) continue; set.add(tempStr.trim()); } lnr.close(); br.close(); isr.close(); fis.close(); List<String> returnValue = new ArrayList<String>(set.size()); returnValue.addAll(set); return returnValue; } }//对刚才已经建好的索引进行搜索package lucene.home.clq;
/** * Copyright Manning Publications Co. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific lan */ import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.List; import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopFieldCollector; import org.apache.lucene.search.TopFieldDocs; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.SimpleFSDirectory; import org.apache.lucene.util.Version; // From chapter 1 /** * This code was originally written for searcher * */ public class Searcher { public static void main(String[] args) throws IllegalArgumentException, IOException, ParseException {