一直在做搜索,用的是 ik 分词器。但 ik 只能按照它自带的字典做分词,不太满足自己的场景,而且没办法给每个分词打上原始属性的标签。于是就想自己写一个字典树,用最长匹配规则取分词,并封装自己的标签属性。我也不知道这样做对不对,反正是写了一个,也不确定自己写的算不算字典树(自己封装的,哈哈)。自己是个小白,不知道里面有没有 bug,欢迎指正。我自己往里面塞了 4000 万条数据(每条长度大概在 10 个字符以内),内存用了 16 GB 左右,没报错,识别也正常。
字典树(trie)是一种前缀搜索树,可以用于敏感词匹配,也可以做智能提示。但词条长度不要太长,否则容易内存占用过大;可以使用双数组字典树(DAT,double-array trie)解决内存占用问题,不过使用 DAT 需要提前排序。另外,它不是平衡树,可能出现某一条支链很庞大、而另一条支链只有几条数据的问题。
// Trie node data structure
package com.tire;
import java.util.List;
import java.util.Map;
/**
 * One node of the character trie built by {@code TireTree}.
 *
 * <p>A node holds a single character, a link to the node it hangs under
 * ({@link #pre}) and its children, stored either as a single successor
 * ({@link #next}) or as a map keyed by character ({@link #subTire}). Nodes that
 * terminate a stored word additionally carry three parallel payload lists.
 */
public class TireNode {
    /** Character represented by this node. */
    public char c;
    /** Bookkeeping flag written by {@code TireTree}; it is never read in this file. */
    public boolean hasNext;
    /** Preceding node on the path from the first character, or {@code null} for a first character. */
    public TireNode pre;
    /** Only child while the path has not branched; {@code null} otherwise. */
    public TireNode next;
    /** Position tags of the words ending here; {@code null} if no word ends here. */
    public List<Object> pos;
    /** Type tags of the words ending here; {@code null} if no word ends here. */
    public List<Object> type;
    /** City-code tags of the words ending here; {@code null} if no word ends here. */
    public List<Object> cityCode;
    /** Children keyed by character once the path has branched; {@code null} otherwise. */
    public Map<Character, TireNode> subTire;

    /** Creates an empty node; all fields keep their default values. */
    public TireNode() {
    }

    /**
     * Creates a node for the given character with explicit neighbours.
     *
     * @param c    character stored in the node
     * @param pre  preceding node, may be {@code null}
     * @param next single successor, may be {@code null}
     */
    public TireNode(char c, TireNode pre, TireNode next) {
        this.c = c;
        this.pre = pre;
        this.next = next;
    }

    /**
     * Creates a branching node that takes over the identity of {@code node} and
     * uses {@code subTire} as its child map.
     *
     * <p>The original implementation only assigned {@code node.subTire} and left
     * the newly constructed object completely blank. That side effect on
     * {@code node} is kept for compatibility; in addition the new object now
     * receives a shallow copy of the character, predecessor and payload lists
     * plus the child map. {@link #next} stays {@code null} because a node that
     * owns a child map is not expected to use the single-successor link.
     *
     * @param node    node to copy from; must not be {@code null}
     * @param subTire child map for the new node (shared with {@code node})
     * @throws NullPointerException if {@code node} is {@code null}
     */
    public TireNode(TireNode node, Map<Character, TireNode> subTire) {
        node.subTire = subTire;
        this.c = node.c;
        this.pre = node.pre;
        this.pos = node.pos;
        this.type = node.type;
        this.cityCode = node.cityCode;
        this.subTire = subTire;
    }

    /**
     * Creates a detached node for the given character.
     *
     * @param c character stored in the node
     */
    public TireNode(char c) {
        this(c, null, null);
    }
}
// Trie operations
package com.tire;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.store.FSDirectory;
import com.tianditu.util.Util;
/**
 * Character trie that stores words together with three tag values (type, city
 * code and position) on the node where each word ends.
 *
 * <p>Rule followed by the insert code: a node that uses {@code next} does not
 * use {@code subTire}. A node keeps its only child in {@code next}; when a
 * second, different child arrives, both children move into {@code subTire} and
 * {@code next} is cleared.
 *
 * @author wangnanhui
 */
public class TireTree {
    /** Initial capacity used for {@link #dic}. */
    public int DEFAULT_SIZE = 500;
    /** Starts as {@link #DEFAULT_SIZE}; replaced by the number of first characters after every insert. */
    public int size = DEFAULT_SIZE;
    /** First-character index: maps the first character of every stored word to its node. */
    public Map<Character, TireNode> dic = new HashMap<Character, TireNode>(size);

    /**
     * Inserts a word given as characters and refreshes {@link #size}.
     *
     * @param chs      characters of the word; {@code null} or empty input is ignored
     * @param type     data type tag
     * @param cityCode administrative division code tag
     * @param pos      position tag
     */
    private void putDateToDic(char[] chs, Object type, Object cityCode,
            Object pos) {
        if (chs == null || chs.length == 0) {
            return;
        }
        TireNode root = dic.get(chs[0]);
        if (root == null) {
            putNoExistToDic(chs, type, cityCode, pos);
        } else {
            // The first character is already consumed by the root lookup.
            putData(chs, 1, root, type, cityCode, pos);
        }
        setSize();
    }

    /**
     * Inserts the remainder of a word below a node that already exists.
     *
     * @param chs     characters that follow {@code subRoot}'s character
     * @param subRoot existing node the remainder is attached under
     */
    void putExistToDic(char[] chs, TireNode subRoot, Object type,
            Object cityCode, Object pos) {
        putData(chs, 0, subRoot, type, cityCode, pos);
    }

    /**
     * Walks down from {@code node} along {@code chs[i..]}, creating the missing
     * tail if the path ends early, and records the tags on the final node.
     *
     * <p>The walk is index based, so no intermediate arrays are allocated and
     * any start index {@code i} is handled (the original was only correct for
     * {@code i == 0}).
     *
     * @param chs  characters to insert
     * @param i    index of the first character that still has to be placed
     * @param node node matching the character before {@code chs[i]}
     */
    void putData(char[] chs, int i, TireNode node, Object type,
            Object cityCode, Object pos) {
        TireNode current = node;
        for (int idx = i; idx < chs.length; idx++) {
            current.hasNext = true;
            TireNode child = findChild(current, chs[idx]);
            if (child == null) {
                TireNode branch = buildChain(chs, idx, current, null, type,
                        cityCode, pos);
                attachBranch(current, branch);
                return;
            }
            current = child;
        }
        // Every character matched an existing node: the word ends here.
        addPayload(current, type, cityCode, pos);
    }

    /**
     * Inserts a word whose first character is not yet present in {@link #dic}.
     */
    void putNoExistToDic(char[] chs, Object type, Object cityCode, Object pos) {
        dic.put(chs[0], createNewTire(chs, null, null, type, cityCode, pos));
    }

    /**
     * Placeholder, not implemented: always returns {@code null}.
     *
     * <p>Original design note: if the dictionary contains "abc", the word to
     * store is "abcd" and "abc" already has other children such as "abce" and
     * "abcf", then "d" is simply put into the child map; alternatively take the
     * child map, discard "abc", attach the map to a new "abc" and put "d" in.
     *
     * @param ch      character to add
     * @param subNode child map to add it to
     * @return always {@code null}
     */
    TireNode put(char ch, Map<Character, TireNode> subNode) {
        return null;
    }

    /**
     * Placeholder for the regular insert case, not implemented: always returns
     * {@code null}.
     *
     * @param chs  characters to add
     * @param pre  preceding node
     * @param next following node
     * @return always {@code null}
     */
    TireNode put(char[] chs, TireNode pre, TireNode next) {
        return null;
    }

    /**
     * Looks up the node reached by following {@code matchWords}.
     *
     * @param matchWords text to follow through the trie
     * @param useFull    when {@code true}, return {@code null} unless every
     *                   character could be followed; when {@code false}, return
     *                   the deepest node reached
     * @return the node found, or {@code null}
     */
    public TireNode get(String matchWords, boolean useFull) {
        if (Util.nullValue(matchWords)) {
            return null;
        }
        return get(matchWords.toCharArray(), useFull);
    }

    /**
     * Character-array variant of {@link #get(String, boolean)}.
     *
     * @param chs     characters to follow; {@code null} or empty yields {@code null}
     * @param useFull see {@link #get(String, boolean)}
     * @return the node found, or {@code null} if the first character is unknown
     */
    TireNode get(char[] chs, boolean useFull) {
        if (chs == null || chs.length == 0) {
            return null;
        }
        TireNode root = dic.get(chs[0]);
        if (root == null) {
            return null;
        }
        return get(chs, root, 1, useFull);
    }

    /** Same lookup as {@link #get(String, boolean)}. */
    public TireNode getTire(String matchWords, boolean useFull) {
        return get(matchWords, useFull);
    }

    /** Same lookup as {@link #get(String, boolean)} with {@code useFull == false}. */
    public TireNode getTire(String matchWords) {
        return get(matchWords, false);
    }

    /**
     * Rebuilds the text that leads to {@code node} by following {@code pre}
     * links back to the first character.
     *
     * @param node node to spell out; {@code null} yields an empty string
     * @return characters from the first node down to {@code node}
     */
    public String getTireName(TireNode node) {
        int depth = 0;
        for (TireNode n = node; n != null; n = n.pre) {
            depth++;
        }
        char[] name = new char[depth];
        // Fill from the back because the walk goes from the last character to the first.
        for (TireNode n = node; n != null; n = n.pre) {
            name[--depth] = n.c;
        }
        return new String(name);
    }

    /**
     * Collects every stored word at or below {@code node} into {@code list}.
     * A node counts as the end of a stored word when its {@code cityCode} list
     * is non-null.
     *
     * @param node   start node; {@code null} leaves {@code list} unchanged
     * @param list   receives the words
     * @param prefix ignored; each word is rebuilt with {@link #getTireName}
     * @return {@code list}
     */
    private List<String> getAllTireName(TireNode node, List<String> list,
            String prefix) {
        if (node == null) {
            return list;
        }
        if (node.cityCode != null) {
            list.add(getTireName(node));
        }
        if (node.next != null) {
            getAllTireName(node.next, list, prefix);
        } else if (node.subTire != null) {
            for (TireNode child : node.subTire.values()) {
                getAllTireName(child, list, prefix);
            }
        }
        return list;
    }

    /**
     * Collects every stored word at or below {@code node} into {@code list}.
     *
     * @return {@code list}
     */
    public List<String> getAllTireName(TireNode node, List<String> list) {
        return getAllTireName(node, list, null);
    }

    /**
     * Follows {@code chs[pos..]} downwards starting at {@code node}.
     *
     * @param chs     characters to follow; {@code null} returns {@code node}
     * @param node    node matching the character before {@code chs[pos]}
     * @param pos     index of the first character still to be matched
     * @param useFull when {@code true}, a character that cannot be followed
     *                makes the result {@code null}; otherwise the deepest node
     *                reached is returned
     * @return the node reached, or {@code null}
     */
    TireNode get(char[] chs, TireNode node, int pos, boolean useFull) {
        if (chs == null || node == null) {
            return node;
        }
        TireNode current = node;
        for (int idx = pos; idx < chs.length; idx++) {
            TireNode child = findChild(current, chs[idx]);
            if (child == null) {
                return useFull ? null : current;
            }
            current = child;
        }
        return current;
    }

    /** Sets {@link #size} to the current number of entries in {@link #dic}. */
    public void setSize() {
        this.size = dic.size();
    }

    /**
     * Inserts a word with its tags.
     *
     * @param words    word to store; ignored when {@code Util.nullValue} reports it as empty
     * @param type     data type tag
     * @param cityCode administrative division code tag
     * @param pos      position tag
     */
    public void put(String words, Object type, Object cityCode, Object pos) {
        if (Util.nullValue(words)) {
            return;
        }
        putDateToDic(words.toCharArray(), type, cityCode, pos);
    }

    /**
     * Builds a chain of freshly created nodes, one per character, and stores the
     * tags on the last node.
     *
     * @param chs  characters of the chain; must contain at least one character
     * @param pre  predecessor assigned to the first node, may be {@code null}
     * @param next successor assigned to the first node; only kept when
     *             {@code chs} has exactly one character
     * @return the first node of the chain
     */
    public TireNode createNewTire(char[] chs, TireNode pre, TireNode next,
            Object type, Object cityCode, Object pos) {
        return buildChain(chs, 0, pre, next, type, cityCode, pos);
    }

    /** Builds the node chain for {@code chs[from..]}; see {@link #createNewTire}. */
    private TireNode buildChain(char[] chs, int from, TireNode pre,
            TireNode next, Object type, Object cityCode, Object pos) {
        TireNode head = new TireNode(chs[from], pre, next);
        TireNode tail = head;
        for (int idx = from + 1; idx < chs.length; idx++) {
            TireNode node = new TireNode(chs[idx]);
            node.pre = tail;
            tail.next = node;
            // Every node that gets a successor is flagged, including the head
            // (the original flagged only the nodes after the head).
            tail.hasNext = true;
            tail = node;
        }
        addPayload(tail, type, cityCode, pos);
        return head;
    }

    /** Returns the child of {@code node} for {@code ch}, or {@code null} if there is none. */
    private static TireNode findChild(TireNode node, char ch) {
        if (node.next != null && node.next.c == ch) {
            return node.next;
        }
        if (node.subTire != null) {
            return node.subTire.get(ch);
        }
        return null;
    }

    /** Hangs a new branch under {@code parent}, switching from {@code next} to a child map when needed. */
    private static void attachBranch(TireNode parent, TireNode branch) {
        if (parent.subTire != null) {
            parent.subTire.put(branch.c, branch);
            return;
        }
        if (parent.next == null) {
            parent.next = branch;
            parent.hasNext = true;
            return;
        }
        // Second distinct child: move the existing successor and the new branch
        // into a map and clear the single-successor link.
        Map<Character, TireNode> children = new HashMap<Character, TireNode>();
        children.put(branch.c, branch);
        children.put(parent.next.c, parent.next);
        parent.subTire = children;
        parent.next = null;
        parent.hasNext = false;
    }

    /** Appends one set of tags to {@code node}, creating the lists on first use. */
    private static void addPayload(TireNode node, Object type,
            Object cityCode, Object pos) {
        if (node.cityCode == null) {
            node.cityCode = new ArrayList<>();
        }
        if (node.type == null) {
            node.type = new ArrayList<>();
        }
        if (node.pos == null) {
            node.pos = new ArrayList<>();
        }
        node.pos.add(pos);
        node.cityCode.add(cityCode);
        node.type.add(type);
    }

    /**
     * Demo: loads names from a Lucene index into the trie and runs two lookups.
     *
     * @throws IOException if the index cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        TireTree t = new TireTree();
        FSDirectory directory = FSDirectory.open(new File(
                "D:/index/AdminIndex"));
        try {
            IndexReader reader = IndexReader.open(directory);
            try {
                int maxDoc = reader.maxDoc();
                for (int i = 0; i < maxDoc; i++) {
                    Document doc = reader.document(i);
                    String name = doc.get("name");
                    String type = doc.get("type");
                    String cityCode = doc.get("totalcity");
                    if (!Util.nullValue(name)) {
                        t.putDateToDic(name.toCharArray(), type, cityCode, i);
                    }
                }
            } finally {
                // The original never released the reader.
                reader.close();
            }
        } finally {
            directory.close();
        }
        t.putDateToDic("今天天气很好适合出去玩".toCharArray(), "123", "123", 1);
        t.putDateToDic("今天天气很不好不适合出去玩".toCharArray(), "123", "123", 1);
        TireNode node = t.get("覃塘镇".toCharArray(), true);
        System.out.println(t.getTireName(node));
        node = t.get("今天天气".toCharArray(), false);
        List<String> list = new ArrayList<>();
        t.getAllTireName(node, list, null);
        System.out.println(list);
        System.out.println();
    }
}