使用肖波的KTDictSeg分词器，为Lucene.net服务。2011-12-05 博客园 雨中漫步的太阳。最近在看Lucene.net，发现Lucene.net的中文分词资料不是很多。很早就在看肖波的KTDictSeg，觉得分词效果不错，但是没有Lucene接口；看他的blog也是很长时间没有更新了。他在blog中提到将在下一个版本中提供对Lucene的支持，我在这里期待中……同时blog中提到一挥的修改版本，但是一挥的站打不开了，不知道什么原因。我刚接触这个时间不长，查了些资料，写了下面的代码，实现了KTDictSeg在Lucene.net中的调用，期待有更好的方法出现。下面附上代码：
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Text;
using Lucene.Net;
using Lucene.Net.Analysis;
using KTDictSeg;

namespace Lucene.Net.Analysis.KTDictSeg
{
    /// <summary>
    /// Lucene.Net analyzer that tokenizes text with the KTDictSeg Chinese
    /// word segmenter and lower-cases the resulting tokens.
    /// </summary>
    public class KTDictSegAnalyzer : Analyzer
    {
        public KTDictSegAnalyzer()
        {
        }

        /// <summary>
        /// Builds the token stream for a field: KTDictSeg segmentation
        /// followed by a lower-case filter.
        /// </summary>
        /// <param name="fieldName">Name of the field being analyzed (unused here).</param>
        /// <param name="reader">Source text to tokenize.</param>
        /// <returns>The filtered token stream.</returns>
        public override TokenStream TokenStream(string fieldName, TextReader reader)
        {
            TokenStream result = new KTDictSegTokenizer(reader);
            result = new LowerCaseFilter(result);
            return result;
        }
    }

    /// <summary>
    /// Tokenizer that segments the entire input with KTDictSeg's
    /// CSimpleDictSeg and emits one Lucene Token per segmented word.
    /// </summary>
    public class KTDictSegTokenizer : Tokenizer
    {
        // Shared segmenter instance. Loading the dictionary is expensive,
        // so it is created once and reused by every tokenizer.
        public static CSimpleDictSeg m_SimpleDictSeg;

        // Guards the one-time lazy initialization of m_SimpleDictSeg.
        private static readonly object s_initLock = new object();

        private ArrayList ioBuffer;   // words produced by segmenting the input
        private int offSet = 0;       // running character offset into the input
        private int position = -1;    // index of the current word in ioBuffer
        private int length = 0;       // length of the current word
        private int start = 0;        // start offset of the current word

        public KTDictSegTokenizer(System.IO.TextReader input) : base(input)
        {
            // Lazily create the shared segmenter the first time any tokenizer
            // is constructed. Double-checked locking prevents two threads from
            // racing to load the dictionary into the static field.
            if (m_SimpleDictSeg == null)
            {
                lock (s_initLock)
                {
                    if (m_SimpleDictSeg == null)
                    {
                        try
                        {
                            m_SimpleDictSeg = new CSimpleDictSeg();
                            // Dictionary files are expected under <cwd>/Data/.
                            m_SimpleDictSeg.DictPath = Path.Combine(Environment.CurrentDirectory, "Data") + Path.DirectorySeparatorChar;
                            m_SimpleDictSeg.LoadDict();
                        }
                        catch (Exception)
                        {
                            // Discard the half-initialized segmenter so a
                            // later construction can retry the load.
                            m_SimpleDictSeg = null;
                            // "throw;" preserves the original stack trace;
                            // the previous "throw e1;" reset it.
                            throw;
                        }
                    }
                }
            }

            m_SimpleDictSeg.FilterStopWords = true;
            m_SimpleDictSeg.MatchName = true;
            // Segment the whole input up front; Next() then replays the buffer.
            ioBuffer = m_SimpleDictSeg.Segment(input.ReadToEnd());
        }

        // A DotLucene tokenizer implements Next(): each call wraps the next
        // segmented word in a Token (the basic unit of DotLucene analysis)
        // with its start/end character offsets, and returns null once the
        // buffer is exhausted.
        public override Token Next()
        {
            position++;
            if (position < ioBuffer.Count)
            {
                length = ioBuffer[position].ToString().Length;
                start = offSet;
                offSet += length;
                return new Token(ioBuffer[position].ToString(), start, start + length);
            }
            return null;
        }
    }
}