开心就好
Trie树的原理不讲了,直接上代码
ChatFilter.java 是核心的过滤器,它从 NoneWantToSee.list 文件中读取敏感词;该文件每行放一个敏感词,放在 src 目录下即可。
过滤器实现数据加载和提供过滤服务,过滤服务是把敏感词替换成**,可以自定义行为。
和一些例子不同,我在代码中处理了部分重叠状态的识别,比如“丝袜” “丝袜网” 都作为敏感词可以被识别出来并处理掉。
另外有一点,构造使用的Set是TreeSet,其中的元素长度从大到小排列,这样在构造sensitiveMap的时候,重叠匹配处理起来方便一些。
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Comparator;
import java.util.HashMap;
import java.util.TreeSet;
/**
 * Chat filter that masks sensitive words using a character trie (DFA-style matching).
 *
 * <p>The word list is read once, when the class is initialized, from the classpath
 * resource {@code /NoneWantToSee.list} (UTF-8, one word per line). Loading is
 * best-effort: if the resource is missing or unreadable the problem is reported on
 * {@code System.err} and the filter simply matches fewer (or no) words.
 *
 * <p>Words that are prefixes of other words (for example "ab" and "abc") are both
 * recognized; at any position the longest word starting there wins.
 *
 * @author yuantao
 */
public class ChatFilter {

    /** Classpath location of the word list. */
    private static final String WORD_LIST_RESOURCE = "/NoneWantToSee.list";

    /** Text written to the output in place of every matched word. */
    private static final String MASK = "**";

    /** Root level of the trie: first character of a word (as a String) to its node. */
    private static final HashMap<String, ChatFilterTreeNode> sensitiveMap = new HashMap<>();

    static {
        initFilter(loadWords());
    }

    /**
     * Does nothing by itself; calling it forces the class to be initialized so that
     * the word list is loaded eagerly.
     */
    public static void initChatFilter() {}

    /**
     * Reads the word list from the classpath.
     *
     * <p>Lines are trimmed and blank lines are skipped. If reading fails part-way,
     * the words read so far are still returned.
     *
     * @return the words, ordered longest first (ties broken by natural string order)
     */
    private static TreeSet<String> loadWords() {
        TreeSet<String> words = new TreeSet<>(new Comparator<String>() {
            @Override
            public int compare(String o1, String o2) {
                // Longest first. Falling back to natural order makes this a proper
                // total order, so only exact duplicates collapse in the set.
                int byLength = Integer.compare(o2.length(), o1.length());
                return byLength != 0 ? byLength : o1.compareTo(o2);
            }
        });

        // getResourceAsStream works for directories and jars alike and avoids the
        // URL-to-path conversion that breaks on percent-encoded characters.
        InputStream in = ChatFilter.class.getResourceAsStream(WORD_LIST_RESOURCE);
        if (in == null) {
            System.err.println("ChatFilter: word list " + WORD_LIST_RESOURCE
                    + " not found on the classpath; no words will be filtered");
            return words;
        }

        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String word = line.trim();
                if (!word.isEmpty()) {
                    words.add(word);
                }
            }
        } catch (IOException e) {
            System.err.println("ChatFilter: failed to read " + WORD_LIST_RESOURCE);
            e.printStackTrace();
        }
        return words;
    }

    /**
     * Inserts every word of {@code keySet} into the trie.
     *
     * <p>A node where a word ends is flagged {@code isEnd} while it has no children,
     * and {@code isOverLapEnd} once a longer word continues through it. The result
     * does not depend on the iteration order of {@code keySet}.
     *
     * @param keySet the sensitive words (typically sorted longest first)
     */
    private static void initFilter(TreeSet<String> keySet) {
        for (String oneKey : keySet) {
            HashMap<String, ChatFilterTreeNode> level = sensitiveMap;
            int lastIndex = oneKey.length() - 1;
            for (int index = 0; index <= lastIndex; ++index) {
                String keyChar = String.valueOf(oneKey.charAt(index));
                boolean wordEndsHere = index == lastIndex;
                ChatFilterTreeNode node = level.get(keyChar);
                if (node == null) {
                    node = new ChatFilterTreeNode();
                    node.setEnd(wordEndsHere);
                    level.put(keyChar, node);
                } else if (wordEndsHere) {
                    // Existing inner node: a longer word already runs through it,
                    // so this word ends "inside" that longer word.
                    if (!node.isEnd()) {
                        node.setOverLapEnd(true);
                    }
                } else if (node.isEnd()) {
                    // A shorter word ended here as a leaf and is now being extended;
                    // keep it recognizable by converting it to an overlap end.
                    node.setEnd(false);
                    node.setOverLapEnd(true);
                }
                level = node.getNextNodeMap();
            }
        }
    }

    /**
     * Replaces every sensitive word in {@code targetStr} with {@code "**"}.
     *
     * <p>The text is scanned left to right. At each position the longest word that
     * starts there is masked and scanning resumes right after it; if no word starts
     * there, the character is copied unchanged and scanning moves on by one.
     *
     * @param targetStr the text to filter; {@code null} is treated as empty
     * @return the filtered text, never {@code null}
     */
    public String filte(String targetStr) {
        if (targetStr == null || targetStr.isEmpty()) {
            return "";
        }
        int length = targetStr.length();
        StringBuilder sb = new StringBuilder(length);
        int index = 0;
        while (index < length) {
            int matchEnd = longestMatchEnd(targetStr, index);
            if (matchEnd > index) {
                sb.append(MASK);
                index = matchEnd;
            } else {
                sb.append(targetStr.charAt(index));
                index++;
            }
        }
        return sb.toString();
    }

    /**
     * Finds the longest sensitive word that starts exactly at {@code start}.
     *
     * @param text  the text being scanned
     * @param start index of the first character to match
     * @return the exclusive end index of the longest match, or {@code -1} if no word
     *         starts at {@code start}
     */
    private static int longestMatchEnd(String text, int start) {
        HashMap<String, ChatFilterTreeNode> level = sensitiveMap;
        int matchEnd = -1;
        for (int i = start; i < text.length(); i++) {
            ChatFilterTreeNode node = level.get(String.valueOf(text.charAt(i)));
            if (node == null) {
                break;
            }
            if (node.isEnd()) {
                // Leaf: nothing longer can follow, so stop without touching its
                // (lazily created) child map.
                return i + 1;
            }
            if (node.isOverLapEnd()) {
                // A word ends here, but a longer one may still match; remember
                // this end and keep going.
                matchEnd = i + 1;
            }
            level = node.getNextNodeMap();
        }
        return matchEnd;
    }
}
数据结构 ChatFilterTreeNode.java 有两个标记:isEnd 标记叶子节点,isOverLapEnd 标记被更长敏感词覆盖的叶子节点(例如“丝袜”之于“丝袜网”)。
import java.util.HashMap;
/**
 * Node of the chat filter's lookup tree.
 *
 * <p>Every key in the lookup tree maps to exactly one node. The node reached by the
 * last key of an entry has {@code isEnd == true} and an empty child map.
 *
 * @author yuantao
 */
public class ChatFilterTreeNode {

    /** End flag; a freshly created node starts out with it set. */
    private boolean end = true;

    /** Overlap-end flag; cleared on a freshly created node. */
    private boolean overLapEnd = false;

    /** Child nodes by key; stays {@code null} until first requested. */
    private HashMap<String, ChatFilterTreeNode> children = null;

    /**
     * @return the current end flag
     */
    public boolean isEnd() {
        return end;
    }

    /**
     * @param isEnd new value of the end flag
     */
    public void setEnd(boolean isEnd) {
        end = isEnd;
    }

    /**
     * @return the current overlap-end flag
     */
    public boolean isOverLapEnd() {
        return overLapEnd;
    }

    /**
     * @param isOverLapEnd new value of the overlap-end flag
     */
    public void setOverLapEnd(boolean isOverLapEnd) {
        overLapEnd = isOverLapEnd;
    }

    /**
     * Lazy getter: the child map is created on first access and the same instance is
     * returned afterwards.
     *
     * @return the child map, never {@code null}
     */
    public HashMap<String, ChatFilterTreeNode> getNextNodeMap() {
        if (children != null) {
            return children;
        }
        children = new HashMap<>();
        return children;
    }

    /**
     * Replaces the child map. Passing {@code null} makes the next
     * {@link #getNextNodeMap()} call create a fresh empty map.
     *
     * @param nextNodeMap the new child map, may be {@code null}
     */
    public void setNextNodeMap(HashMap<String, ChatFilterTreeNode> nextNodeMap) {
        children = nextNodeMap;
    }
}
用法很简单
// Create a filter (the word list is loaded when ChatFilter is first used),
// then print the text before and after filtering.
ChatFilter filter = new ChatFilter();
String testStr = "啊日本人丝袜敏网啊日本人敏网丝袜网我日本丝袜日本";
System.out.println(testStr);
String result = filter.filte(testStr);
System.out.println(result);
敏感词是[丝袜, 丝袜网]
啊日本人丝袜敏网啊日本人敏网丝袜网我日本丝袜日本
啊日本人**敏网啊日本人敏网**我日本**日本