Many readers may never have used Lucene, but that's fine: for now you can think of it as just a "database", or, more accurately, a document store.
It is also how our site once kept running for a whole day after someone unplugged the database server's network cable, and nobody noticed. -_-!
Since Lucene follows a document-store model (if that is the right way to put it), let's first go over some of its basic building blocks.
The whole index can be thought of as one big table, and in practice it is; current versions of Lucene store it as an inverted index.
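To make "inverted" concrete: instead of mapping each document to its words, the index maps each term to the documents that contain it. A conceptual sketch in plain C# follows; the class and method names are purely illustrative, and Lucene's real on-disk format is far more compact than this.

// Conceptual model only: term -> IDs of the documents containing it.
class InvertedIndexSketch
{
    private Dictionary<string, List<int>> index = new Dictionary<string, List<int>>();

    public void Add(int docId, string[] terms)
    {
        foreach (string term in terms)
        {
            List<int> postings;
            if (!index.TryGetValue(term, out postings))
            {
                postings = new List<int>();
                index[term] = postings;
            }
            postings.Add(docId); // each term knows which documents contain it
        }
    }

    public List<int> Search(string term)
    {
        // A single-term query is just a dictionary lookup.
        List<int> postings;
        return index.TryGetValue(term, out postings) ? postings : new List<int>();
    }
}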
Hits: the set of documents that matched; think of it as a row set.
Document: a document, i.e. one row of data.
Field: a single unit of data, consisting of a name and its value.
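In relational terms, a Document is one row and each Field is one named column. With the Lucene 2.0 API, building a single "row" looks roughly like this (the field names and values here are illustrative):

Document doc = new Document();
// A stored, untokenized field: kept verbatim, usable as a key.
doc.Add(new Field("id", "1001", Field.Store.YES, Field.Index.UN_TOKENIZED));
// A stored, tokenized field: run through the analyzer for full-text search.
doc.Add(new Field("title", "hello lucene", Field.Store.YES, Field.Index.TOKENIZED));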
With that in place, let's see what this looks like in code.
(Note: all of the code below targets Lucene 2.0.)
public class Formater
{
    /// <summary>
    /// Formats a time as a sortable string.
    /// </summary>
    /// <param name="time"></param>
    /// <returns></returns>
    public static string FormatTime(DateTime time)
    {
        // "HH" (24-hour clock) keeps the string sortable across AM/PM.
        return time.ToString("yyyyMMddHHmmss");
    }

    /// <summary>
    /// Formats a time string as a sortable string.
    /// </summary>
    /// <param name="str"></param>
    /// <returns></returns>
    public static string FormatTime(string str)
    {
        DateTime time;
        if (!DateTime.TryParse(str, out time))
        {
            time = new DateTime(2000, 1, 1); // fallback for unparsable input
        }
        return FormatTime(time);
    }

    /// <summary>
    /// Formats numeric text as a sortable, zero-padded string.
    /// </summary>
    public static string FormatNum(string s)
    {
        if (s.Length > 9) return s;
        return s.PadLeft(9, '0');
    }

    /// <summary>
    /// Builds a Lucene Query from the search parameters.
    /// </summary>
    /// <param name="parm"></param>
    /// <returns></returns>
    public static Query FormatSearchItem(SearchParameter parm)
    {
        List<Query> list = new List<Query>();
        Query squery = null;
        foreach (QueryItem item in parm.QueryItems)
        {
            item.FieldName = item.FieldName.ToUpper();
            if (item.IsToLower)
            {
                item.Value = item.Value.ToLower();
                item.Value_1 = item.Value_1.ToLower();
            }
            QueryParser queryparser;
            Query query;
            switch (item.Type)
            {
                case QueryItemType.Or:
                {
                    queryparser = new QueryParser(item.FieldName, GetAnalyzer(parm.AnalyzerType));
                    queryparser.SetDefaultOperator(QueryParser.Operator.OR);
                    query = queryparser.Parse(item.Value);
                    break;
                }
                case QueryItemType.Range:
                {
                    queryparser = new QueryParser(item.FieldName, GetAnalyzer(parm.AnalyzerType));
                    queryparser.SetDefaultOperator(QueryParser.Operator.AND);
                    // Both bounds go through Format() so they compare as padded strings.
                    query = queryparser.Parse(string.Format("{0}:[{1} TO {2}]",
                        item.FieldName, Format(item.Value), Format(item.Value_1)));
                    break;
                }
                case QueryItemType.Fuzzy:
                {
                    queryparser = new QueryParser(item.FieldName, GetAnalyzer(parm.AnalyzerType));
                    queryparser.SetDefaultOperator(QueryParser.Operator.AND);
                    query = queryparser.Parse(item.Value + "~");
                    break;
                }
                case QueryItemType.Wildcard:
                {
                    queryparser = new QueryParser(item.FieldName, GetAnalyzer(parm.AnalyzerType));
                    queryparser.SetDefaultOperator(QueryParser.Operator.AND);
                    query = queryparser.Parse(item.Value + "*");
                    break;
                }
                case QueryItemType.TimeRange:
                {
                    queryparser = new QueryParser(item.FieldName, GetAnalyzer(parm.AnalyzerType));
                    queryparser.SetDefaultOperator(QueryParser.Operator.AND);
                    query = queryparser.Parse(string.Format("{0}:[{1} TO {2}]", item.FieldName, item.Value, item.Value_1));
                    break;
                }
                default:
                {
                    queryparser = new QueryParser(item.FieldName, GetAnalyzer(parm.AnalyzerType));
                    queryparser.SetDefaultOperator(QueryParser.Operator.AND);
                    query = queryparser.Parse(item.Value);
                    break;
                }
            }
            if (squery == null)
            {
                squery = query;
            }
            list.Add(query);
        }
        squery = squery.Combine(list.ToArray());
        return squery;
    }

    /// <summary>
    /// Gets the analyzer (word segmenter) for the given type.
    /// </summary>
    /// <param name="type"></param>
    /// <returns></returns>
    public static Analyzer GetAnalyzer(AnalyzerEnum type)
    {
        switch (type)
        {
            case AnalyzerEnum.ChineseAnalyzer:
            {
                return new ChineseAnalyzer();
            }
            case AnalyzerEnum.DoubleWordAnalyzer:
            {
                return new DoubleWordAnalyzer();
            }
            case AnalyzerEnum.ChineseWordAnalyzer:
            {
                return new ChineseWordAnalyzer();
            }
            case AnalyzerEnum.CustomAnalyzer:
            {
                return new CustomAnalyzer();
            }
            case AnalyzerEnum.SmartSegmentAnalyzer:
            {
                return new SmartSegmentAnalyzer();
            }
            default:
            {
                return new StandardAnalyzer();
            }
        }
    }

    /// <summary>
    /// Formats numeric text (scaled by 100) as a sortable, zero-padded string.
    /// </summary>
    public static string Format(string s)
    {
        int temp = 0;
        int.TryParse(s, out temp);
        temp = temp * 100;
        s = temp.ToString();
        if (s.Length > 8) return s;
        return s.PadLeft(8, '0');
    }
}
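SearchParameter, QueryItem, and QueryItemType are the author's own wrapper types and are not defined in this post; assuming they simply carry the properties the code above reads (QueryItems, AnalyzerType, FieldName, Value, Value_1, Type, IsToLower), wiring a search together might look roughly like this sketch:

// Hypothetical usage; the SearchParameter/QueryItem shapes are assumed, not shown here.
SearchParameter parm = new SearchParameter();
parm.AnalyzerType = AnalyzerEnum.StandardAnalyzer;

QueryItem item = new QueryItem();
item.FieldName = "sword";                  // upper-cased to "SWORD" by FormatSearchItem
item.Value = "lucene";
item.Type = QueryItemType.Wildcard;        // parsed as "lucene*"
parm.QueryItems.Add(item);

Query query = Formater.FormatSearchItem(parm);

IndexSearcher searcher = new IndexSearcher(@"c:\index"); // path of an existing index
Hits hits = searcher.Search(query);        // the Hits "row set" from the introduction
for (int i = 0; i < hits.Length(); i++)
{
    Document doc = hits.Doc(i);            // one matching "row"
    Console.WriteLine(doc.Get("SWORD"));
}
searcher.Close();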
Because we sometimes need to sort results, and Lucene can only sort by string, numbers and timestamps must be converted into strings that sort correctly.
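A quick illustration of why the padding matters: compared as raw strings, "9" sorts after "10", while the zero-padded forms compare in true numeric order.

string a = Formater.FormatNum("9");                       // "000000009"
string b = Formater.FormatNum("10");                      // "000000010"
Console.WriteLine(string.CompareOrdinal("9", "10") > 0);  // True: wrong order as raw strings
Console.WriteLine(string.CompareOrdinal(a, b) < 0);       // True: correct order once padded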
Index parameters:
public enum AnalyzerEnum : int
{
StandardAnalyzer,
ChineseAnalyzer,
DoubleWordAnalyzer,
ChineseWordAnalyzer,
CustomAnalyzer,
SmartSegmentAnalyzer
}
[Serializable]
public class IndexParameter
{
    private AnalyzerEnum _analyzertype;
    private string _indexdir;
    private bool _rebuildIndex;
    private int maxFieldLength = 999999; // maximum field length
    private int mergeFactor = 999999;
    private int minMergeDocs = 1000;
    private int maxMergeDocs = 99999999;

    public int MaxMergeDocs
    {
        get { return maxMergeDocs; }
        set { maxMergeDocs = value; }
    }

    public int MaxFieldLength
    {
        get { return maxFieldLength; }
        set { maxFieldLength = value; }
    }

    public int MergeFactor
    {
        get { return mergeFactor; }
        set { mergeFactor = value; }
    }

    public int MinMergeDocs
    {
        get { return minMergeDocs; }
        set { minMergeDocs = value; }
    }

    public AnalyzerEnum AnalyzerType
    {
        set { _analyzertype = value; }
        get { return _analyzertype; }
    }

    /// <summary>
    /// Index directory.
    /// </summary>
    public string IndexDir
    {
        get { return _indexdir; }
        set { _indexdir = value; }
    }

    /// <summary>
    /// Whether to rebuild the index from scratch or index incrementally.
    /// </summary>
    public bool RebuildIndex
    {
        get { return _rebuildIndex; }
        set { _rebuildIndex = value; }
    }
}
The indexer:
public class Index
{
    private IndexParameter param;

    public IndexParameter Param
    {
        get { return param; }
        set { param = value; }
    }

    public Index(IndexParameter mParam)
    {
        this.param = mParam;
    }

    /// <summary>
    /// Deletes documents by their internal document numbers.
    /// </summary>
    /// <param name="slist"></param>
    /// <returns></returns>
    public bool DeleteIndex(List<int> slist)
    {
        if (!IndexReader.IndexExists(this.param.IndexDir)) return false;
        IndexReader reader = IndexReader.Open(this.param.IndexDir);
        try
        {
            foreach (int item in slist)
            {
                reader.DeleteDocument(item);
            }
        }
        finally
        {
            reader.Close();
        }
        return true;
    }

    /// <summary>
    /// Deletes documents matching a field/value pair.
    /// </summary>
    /// <param name="sfield"></param>
    /// <param name="svalue"></param>
    /// <returns></returns>
    public bool DeleteIndex(string sfield, string svalue)
    {
        if (!IndexReader.IndexExists(this.param.IndexDir)) return false;
        IndexReader reader = IndexReader.Open(this.param.IndexDir);
        try
        {
            reader.DeleteDocuments(new Term(sfield, svalue));
        }
        catch
        {
            return true;
        }
        finally
        {
            reader.Close();
        }
        return true;
    }

    /// <summary>
    /// Optimizes the index so that pending deletions are physically removed.
    /// </summary>
    public void EnableChanged()
    {
        IndexWriter indexwriter = null;
        try
        {
            if (!IndexReader.IndexExists(this.param.IndexDir)) return;
            indexwriter = new IndexWriter(this.param.IndexDir, Formater.GetAnalyzer(this.param.AnalyzerType), false);
            indexwriter.Optimize();
        }
        finally
        {
            if (indexwriter != null)
            {
                indexwriter.Close();
            }
        }
    }

    /// <summary>
    /// Builds the index.
    /// </summary>
    /// <param name="items"></param>
    public void BuildIndex(List<List<IndexItem>> items)
    {
        lock (this)
        {
            IndexWriter indexwriter = null;
            try
            {
                if (this.param == null)
                {
                    throw new Exception("Missing index parameters: param is null!");
                }
                // Create a new index when none exists or a rebuild was requested;
                // otherwise open the existing one for incremental indexing.
                if (!IndexReader.IndexExists(this.param.IndexDir) || this.param.RebuildIndex)
                {
                    indexwriter = new IndexWriter(this.param.IndexDir, Formater.GetAnalyzer(this.param.AnalyzerType), true);
                }
                else
                {
                    indexwriter = new IndexWriter(this.param.IndexDir, Formater.GetAnalyzer(this.param.AnalyzerType), false);
                }

                // Index into a RAMDirectory first, then merge into the disk index.
                RAMDirectory _ramDir = new RAMDirectory();
                IndexWriter _ramWriter = new IndexWriter(_ramDir, Formater.GetAnalyzer(this.param.AnalyzerType), true);
                _ramWriter.SetMaxFieldLength(this.param.MaxFieldLength);
                _ramWriter.SetMergeFactor(this.param.MergeFactor);
                _ramWriter.SetMaxBufferedDocs(this.param.MinMergeDocs);
                _ramWriter.SetMaxMergeDocs(this.param.MaxMergeDocs);

                indexwriter.SetMaxFieldLength(this.param.MaxFieldLength);
                indexwriter.SetMergeFactor(this.param.MergeFactor);
                indexwriter.SetMaxBufferedDocs(this.param.MinMergeDocs);
                indexwriter.SetMaxMergeDocs(this.param.MaxMergeDocs);

                foreach (List<IndexItem> item in items)
                {
                    _ramWriter.AddDocument(Doc(item));
                }
                _ramWriter.Close();
                indexwriter.AddIndexes(new Directory[] { _ramDir });
            }
            catch (Exception err)
            {
                new Log().WriteLog(err.ToString());
            }
            finally
            {
                if (indexwriter != null)
                {
                    indexwriter.Optimize();
                    indexwriter.Close();
                }
            }
        }
    }

    private Document Doc(List<IndexItem> items)
    {
        Document doc = new Document();
        for (int i = 0; i < items.Count; i++)
        {
            items[i].FieldName = items[i].FieldName.ToUpper();
            if (items[i].IsToLower)
            {
                items[i].FieldValue = items[i].FieldValue.ToLower();
            }
            switch (items[i].Type)
            {
                case IndexType.KeyWord:
                {
                    doc.Add(new Field(items[i].FieldName, items[i].FieldValue, Field.Store.YES, Field.Index.UN_TOKENIZED));
                    break;
                }
                case IndexType.Text:
                {
                    doc.Add(new Field(items[i].FieldName, items[i].FieldValue, Field.Store.YES, Field.Index.TOKENIZED));
                    break;
                }
                case IndexType.UnIndexed:
                {
                    doc.Add(new Field(items[i].FieldName, items[i].FieldValue, Field.Store.NO, Field.Index.UN_TOKENIZED));
                    break;
                }
                case IndexType.UnStored:
                {
                    doc.Add(new Field(items[i].FieldName, items[i].FieldValue, Field.Store.NO, Field.Index.TOKENIZED));
                    break;
                }
                default:
                {
                    doc.Add(new Field(items[i].FieldName, items[i].FieldValue, Field.Store.YES, Field.Index.TOKENIZED));
                    break;
                }
            }
        }
        return doc;
    }
}
The index item wrapper types:
public enum IndexType
{
KeyWord = 0,
UnIndexed = 1,
UnStored = 2,
Text = 3
}
[Serializable]
public class IndexItem
{
    private string fieldName = string.Empty;
    private string fieldValue = string.Empty;
    private IndexType type;
    private bool isToLower = true;

    /// <summary>
    /// Field name.
    /// </summary>
    public string FieldName
    {
        get { return fieldName; }
        set { fieldName = value; }
    }

    /// <summary>
    /// Field value.
    /// </summary>
    public string FieldValue
    {
        get { return fieldValue; }
        set { fieldValue = value; }
    }

    /// <summary>
    /// Index type.
    /// </summary>
    public IndexType Type
    {
        get { return type; }
        set { type = value; }
    }

    /// <summary>
    /// Whether the value may be converted to lower case.
    /// </summary>
    public bool IsToLower
    {
        get { return isToLower; }
        set { isToLower = value; }
    }
}
Note: Lucene does not support updating a document in place, so to update you delete the old document first and then re-add it; also, deleted documents are only physically removed after an Optimize.
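So the update idiom with this wrapper boils down to: delete by a key field, re-index the new version, and let Optimize make the deletion physical. A minimal sketch under those assumptions (the "ID" and "content" fields, the path, and the values are illustrative, not part of the original code):

IndexParameter parm = new IndexParameter();
parm.AnalyzerType = AnalyzerEnum.StandardAnalyzer;
parm.IndexDir = @"c:\index";                 // illustrative path
parm.RebuildIndex = false;                   // keep the existing documents

Index idx = new Index(parm);

// 1. Remove the stale version by its key field (field names are
//    upper-cased at indexing time, hence "ID").
idx.DeleteIndex("ID", "1001");

// 2. Re-add the new version of the document.
IndexItem key = new IndexItem();
key.FieldName = "ID";
key.FieldValue = "1001";
key.Type = IndexType.KeyWord;

IndexItem body = new IndexItem();
body.FieldName = "content";
body.FieldValue = "the updated text";
body.Type = IndexType.Text;

List<IndexItem> row = new List<IndexItem>();
row.Add(key);
row.Add(body);

List<List<IndexItem>> batch = new List<List<IndexItem>>();
batch.Add(row);

// BuildIndex ends with Optimize, which also makes the deletion physical.
idx.BuildIndex(batch);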
The above indexes data through the BuildIndex method; Lucene supports incremental indexing, and that is exactly what the RebuildIndex flag controls.
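In other words, the first run creates the index and later runs append to it; a minimal sketch (firstBatch and newBatch are placeholder lists, built the same way as in the usage example further down):

parm.RebuildIndex = true;                    // first run: build the index from scratch
new Index(parm).BuildIndex(firstBatch);

parm.RebuildIndex = false;                   // later runs: append incrementally
new Index(parm).BuildIndex(newBatch);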
StandardAnalyzer,
ChineseAnalyzer,
DoubleWordAnalyzer,
ChineseWordAnalyzer,
CustomAnalyzer,
SmartSegmentAnalyzer
are the analyzers (word segmenters); they will be covered in a later post.
Usage example:

public void BuildIndex()
{
    List<List<IndexItem>> list = new List<List<IndexItem>>();
    DataTable mdt = GetKeywords();
    foreach (DataRow dr in mdt.Rows)
    {
        List<IndexItem> slist = new List<IndexItem>();
        IndexItem item;

        item = new IndexItem();
        item.FieldName = "sword";
        item.FieldValue = dr["SWord"].ToString();
        item.Type = IndexType.Text;
        slist.Add(item);

        item = new IndexItem();
        item.FieldName = "scount";
        item.FieldValue = dr["scount"].ToString();
        item.Type = IndexType.KeyWord;
        slist.Add(item);

        item = new IndexItem();
        item.FieldName = "rcount";
        item.FieldValue = dr["rcount"].ToString();
        item.Type = IndexType.KeyWord;
        slist.Add(item);

        list.Add(slist);
    }
    IndexParameter parm = new IndexParameter();
    parm.AnalyzerType = AnalyzerEnum.ChineseAnalyzer;
    parm.IndexDir = this.Item.TempDir;
    parm.RebuildIndex = true;
    new Index(parm).BuildIndex(list);
}
That is how documents are indexed with Lucene. None of it is new; similar code can be found in many places. I have simply repackaged it, with the emphasis on the points that deserve attention.
If anything is unclear, have a look at Lucene's documentation and its bundled demo, though the demo is written rather amateurishly, so bear with it.