As long as you're happy.

I won't go over how a Trie works; let's get straight to the code.

ChatFilter.java is the core filter. It reads the sensitive words from the file NoneWantToSee.list, one word per line; just put the file under the src directory so that it ends up on the classpath root.
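For example, with the two words used in the demo at the end of this post, NoneWantToSee.list would simply contain one word per line:

丝袜
丝袜网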

The filter handles both loading the data and providing the filtering service. The service replaces each sensitive word with "**"; you can customize that behavior (in this version, by changing the places where the code appends "**").

Unlike some other examples, the code also handles partially overlapping words: for instance, with both "丝袜" and "丝袜网" in the list, each of them is recognized and filtered out.

One more point: the words are loaded into a TreeSet whose elements are ordered from longest to shortest, which makes the overlap cases easier to handle while building sensitiveMap (see the sketch below).
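As a rough illustration of what the initFilter method below builds (assuming only the two words 丝袜 and 丝袜网 are loaded), sensitiveMap ends up looking like this: the longer word is inserted first, and when the shorter word then ends on a node that already has children, that node is marked as an overlap end.

sensitiveMap
  "丝" -> node(isEnd=false, isOverLapEnd=false)
    "袜" -> node(isEnd=false, isOverLapEnd=true)   // "丝袜" ends here, but "丝袜网" continues
      "网" -> node(isEnd=true,  isOverLapEnd=false) // "丝袜网" ends here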


import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Comparator;
import java.util.HashMap;
import java.util.TreeSet;

/**
 * Chat filter, DFA (Trie) based
 * @author yuantao
 *
 */
public class ChatFilter {
    private static HashMap<String, ChatFilterTreeNode> sensitiveMap = new HashMap<>();
    
    static {
        File file = new File(ChatFilter.class.getResource("/").getPath() + "NoneWantToSee.list");
        // Longest words first; fall back to natural order so that distinct words
        // of the same length are all kept and exact duplicates collapse.
        TreeSet<String> set = new TreeSet<>(new Comparator<String>() {
            @Override
            public int compare(String o1, String o2) {
                int byLength = Integer.compare(o2.length(), o1.length());
                return byLength != 0 ? byLength : o1.compareTo(o2);
            }
        });
        // The word list is assumed to be saved as UTF-8.
        try (BufferedReader bReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), StandardCharsets.UTF_8))) {
            String line = null;
            while ((line = bReader.readLine()) != null) {
                line = line.trim();
                if (!line.isEmpty()) { // skip blank lines
                    set.add(line);
                }
            }
            initFilter(set);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    
    /**
     * Exists only to trigger the static initializer (word-list loading).
     */
    public static void initChatFilter(){}
    /**
     * Builds the keyword lookup tree.
     * @param keySet a TreeSet ordered from the longest word to the shortest
     */
    private static void initFilter(TreeSet<String> keySet) { 
        for (String oneKey : keySet) {
            HashMap<String, ChatFilterTreeNode> iterMap = sensitiveMap;
            for (int index = 0; index < oneKey.length(); ++index) {
                char keyChar = oneKey.charAt(index);
                ChatFilterTreeNode node = iterMap.get(String.valueOf(keyChar)); // look up one character at a time
                if (node != null) { // the node already exists, descend into it
                    if (index < (oneKey.length()-1)) {
                        node.setEnd(false);
                        node.setOverLapEnd(false);
                    } else { // last character of this word: possible overlap with a longer word
                        if (!node.getNextNodeMap().isEmpty()) {
                            node.setEnd(false);
                            node.setOverLapEnd(true);
                        }
                    }
                    iterMap = node.getNextNodeMap();
                } else {
                    // the node does not exist yet, create it
                    ChatFilterTreeNode nextNewNode = new ChatFilterTreeNode();
                    if (index < (oneKey.length()-1)) {
                        nextNewNode.setEnd(false);
                    }
                    iterMap.put(String.valueOf(keyChar), nextNewNode);
                    iterMap = nextNewNode.getNextNodeMap();
                }
            }
        }
    }
    
    
    /**
     * Replaces every sensitive word in targetStr with "**".
     */
    public String filte(String targetStr) {
        HashMap<String, ChatFilterTreeNode> iterMap = sensitiveMap;
        StringBuilder sb = new StringBuilder();
        boolean needProcessOverlap = false;
        int sensitivityIndex = 0; // start of the current sensitive-word candidate
        int normalStartIndex = 0; // start of the current normal (non-sensitive) segment
        int normalEndIndex = 0; // end of the current normal segment
        for (int index = 0; index < targetStr.length(); index++) {
            char inputChar = targetStr.charAt(index);
            ChatFilterTreeNode node = iterMap.get(String.valueOf(inputChar));
            if (node != null) {
                iterMap = node.getNextNodeMap();
                if (node.isEnd()) {
                    // full match: replace the sensitive word, then reset the indexes
                    if (normalEndIndex > normalStartIndex) { // first copy the preceding non-sensitive part
                        sb.append(targetStr.substring(normalStartIndex, normalEndIndex));
                    }
                    sb.append("**");
                    
                    normalStartIndex = index + 1;
                    sensitivityIndex = index + 1;
                    normalEndIndex = index + 1;
                    iterMap = sensitiveMap;
                    needProcessOverlap = false;
                    
                } else if (node.isOverLapEnd()) {
                    needProcessOverlap = true;
                }
                
            } else { 
                if (needProcessOverlap) { // flush a pending overlapping match
                    if (normalEndIndex > normalStartIndex) {
                        sb.append(targetStr.substring(normalStartIndex, normalEndIndex));
                    }
                    sb.append("**");
                    needProcessOverlap = false;
                    normalStartIndex = index;
                    sensitivityIndex = index;
                    normalEndIndex = index;
                }
                // retry the match from the root: a longer word may have failed halfway on this character
                iterMap = sensitiveMap;
                node = iterMap.get(String.valueOf(inputChar));
                if (node != null) {
                    normalEndIndex = index;
                    sensitivityIndex = index;
                    iterMap = node.getNextNodeMap();
                    if (node.isEnd()) {
                        // full match: replace the sensitive word, then reset the indexes
                        if (normalEndIndex > normalStartIndex) { // first copy the preceding non-sensitive part
                            sb.append(targetStr.substring(normalStartIndex, normalEndIndex));
                        }
                        sb.append("**");
                        
                        normalStartIndex = index + 1;
                        sensitivityIndex = index + 1;
                        normalEndIndex = index + 1;

                        iterMap = sensitiveMap;
                    }
                    
                } else {
                    // case: normalStartIndex == normalEndIndex == sensitivityIndex; extend the normal segment to this index
                    if (normalEndIndex == normalStartIndex 
                            && normalEndIndex == sensitivityIndex) {
                        iterMap = sensitiveMap;
                        sensitivityIndex = normalStartIndex;
                    }
                    normalEndIndex = index + 1;
                }
                
            }
        }
        if (needProcessOverlap) {
            if (normalStartIndex < normalEndIndex) {
                sb.append(targetStr.substring(normalStartIndex, normalEndIndex));
            }
            sb.append("**");
            normalStartIndex = targetStr.length();
            sensitivityIndex = targetStr.length();
            normalEndIndex = targetStr.length();
        }
        if (normalStartIndex < targetStr.length()) {
            sb.append(targetStr.substring(normalStartIndex));
        }

        return sb.toString();
    }
}



The node data structure, ChatFilterTreeNode.java, carries two flags: isEnd marks an end-of-word (leaf) node, and isOverLapEnd marks an end-of-word node that is covered by a longer word.


import java.util.HashMap;
/**
 * A node in the chat filter's lookup tree.
 * Every character (key) of a word corresponds to exactly one node in the tree.
 * For a word that is not a prefix of another word, the node of its last
 * character has isEnd == true and an empty nextNodeMap.
 * @author yuantao
 *
 */
public class ChatFilterTreeNode {
        private boolean isEnd = true;
        private HashMap<String, ChatFilterTreeNode> nextNodeMap = null;
        private boolean isOverLapEnd = false;
        /**
         * Lazy getter: the child map is created on first access.
         * @return the map of child nodes, keyed by single character
         */
        public HashMap<String, ChatFilterTreeNode> getNextNodeMap() {
            if (nextNodeMap==null) {
                nextNodeMap = new HashMap<String, ChatFilterTreeNode>();
            }
            return nextNodeMap;
        }
        
        public void setNextNodeMap(HashMap<String, ChatFilterTreeNode> nextNodeMap) {
            this.nextNodeMap = nextNodeMap;
        }
        
        public boolean isEnd() {
            return isEnd;
        }
        
        public void setEnd(boolean isEnd) {
            this.isEnd = isEnd;
        }

        public boolean isOverLapEnd() {
            return isOverLapEnd;
        }

        public void setOverLapEnd(boolean isOverLapEnd) {
            this.isOverLapEnd = isOverLapEnd;
        }
}



Usage is simple:


    ChatFilter filter = new ChatFilter();
    String testStr = "啊日本人丝袜敏网啊日本人敏网丝袜网我日本丝袜日本";
    System.out.println(testStr);
    String result = filter.filte(testStr);
    System.out.println(result);

With the sensitive words [丝袜, 丝袜网], the output is:

啊日本人丝袜敏网啊日本人敏网丝袜网我日本丝袜日本

啊日本人**敏网啊日本人敏网**我日本**日本
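One last note: the word list is loaded in a static initializer, so whichever call touches ChatFilter first pays the loading cost. If you prefer to load it up front (say, when the server starts), the empty initChatFilter() method exists precisely to trigger that static block:

    // Optional: force the word list to load at startup instead of on first use.
    ChatFilter.initChatFilter();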