java hanlp 过滤停用词 java关键字过滤

转载

架构思维大师 2023-11-29 14:21:07

文章标签 java hanlp 过滤停用词敏感词 java System 文章分类 Java 后端开发

Java Spring Boot 敏感词过滤工具类

1. 功能描述
利用前缀树这种数据结构，设计并开发出敏感词过滤工具。

2. 构建敏感词表
resource/sensitive-words.txt

java hanlp 过滤停用词 java关键字过滤_java hanlp 过滤停用词

3. 敏感词过滤器
util/SensitiveUtil.java

构建前缀树
定义过滤方法

package com.wlnl.lanaer.service.api.util;

import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang.CharUtils;
import org.apache.commons.lang.StringUtils;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

@Slf4j
public class SensitiveUtil {

private static final String FILE_NAME = "sensitive-words.txt";

// 默认替换符
private static final String REPLACEMENT = "***";

// 根节点
private static TrieNode rootNode = new TrieNode();

    static {
try (
                InputStream is = SensitiveUtil.class.getClassLoader().getResourceAsStream(FILE_NAME);
BufferedReader reader = new BufferedReader(new InputStreamReader(is));
) {
            String keyword;
            while ((keyword = reader.readLine()) != null) {
// 添加到前缀树
addKeyword(keyword);
}
        } catch (IOException e) {
log.error("加载敏感词文件失败: " + e.getMessage());
}
    }

// 将一个敏感词添加到前缀树中
private static void addKeyword(String keyword) {
        TrieNode tempNode = rootNode;
        for (int i = 0; i < keyword.length(); i++) {
char c = keyword.charAt(i);
TrieNode subNode = tempNode.getSubNode(c);

            if (subNode == null) {
// 初始化子节点
subNode = new TrieNode();
tempNode.addSubNode(c, subNode);
}

// 指向子节点,进入下一轮循环
tempNode = subNode;

// 设置结束标识
if (i == keyword.length() - 1) {
                tempNode.setKeywordEnd(true);
}
        }
    }

/**
     * 判断是否有关键字
* @param text
* @return
*/
public static boolean hasKeyword(String text) {
if (StringUtils.isBlank(text)) {
return false;
}

// 指针1
TrieNode tempNode = rootNode;
// 指针2
int begin = 0;
// 指针3
int position = 0;

        while (position < text.length()) {
char c = text.charAt(position);

// 跳过符号
if (isSymbol(c)) {
// 若指针1处于根节点,将此符号计入结果,让指针2向下走一步
if (tempNode == rootNode) {
                    begin++;
}
// 无论符号在开头或中间,指针3都向下走一步
position++;
                continue;
}

// 检查下级节点
tempNode = tempNode.getSubNode(c);
            if (tempNode == null) {
// 进入下一个位置
position = ++begin;
// 重新指向根节点
tempNode = rootNode;
} else if (tempNode.isKeywordEnd()) {
// 发现敏感词
return true;
} else {
// 检查下一个字符
position++;
}
        }
return false;
}

/**
     * 过滤敏感词
*
     * @param text 待过滤的文本
* @return 过滤后的文本
*/
public static String filter(String text) {
return filter(text, REPLACEMENT);
}

/**
     * 过滤敏感词
*
     * @param text 待过滤的文本
* @param substitute 敏感词替换字符串
* @return 过滤后的文本
*/
public static String filter(String text, String substitute) {
if (StringUtils.isBlank(text)) {
return null;
}
if (null == substitute) {
            substitute = REPLACEMENT;
}

// 指针1
TrieNode tempNode = rootNode;
// 指针2
int begin = 0;
// 指针3
int position = 0;
// 结果
StringBuilder sb = new StringBuilder();

        while (position < text.length()) {
char c = text.charAt(position);

// 跳过符号
if (isSymbol(c)) {
// 若指针1处于根节点,将此符号计入结果,让指针2向下走一步
if (tempNode == rootNode) {
                    sb.append(c);
begin++;
}
// 无论符号在开头或中间,指针3都向下走一步
position++;
                continue;
}

// 检查下级节点
tempNode = tempNode.getSubNode(c);
            if (tempNode == null) {
// 以begin开头的字符串不是敏感词
sb.append(text.charAt(begin));
// 进入下一个位置
position = ++begin;
// 重新指向根节点
tempNode = rootNode;
} else if (tempNode.isKeywordEnd()) {
// 发现敏感词,将begin~position字符串替换掉
sb.append(substitute);
// 进入下一个位置
begin = ++position;
// 重新指向根节点
tempNode = rootNode;
} else {
// 检查下一个字符
position++;
}
        }

// 将最后一批字符计入结果
sb.append(text.substring(begin));

        return sb.toString();
}

// 判断是否为符号
private static boolean isSymbol(Character c) {
// 0x2E80~0x9FFF 是东亚文字范围
return !CharUtils.isAsciiAlphanumeric(c) && (c < 0x2E80 || c > 0x9FFF);
}

// 前缀树
private static class TrieNode {

// 关键词结束标识
private boolean isKeywordEnd = false;

// 子节点(key是下级字符,value是下级节点)
private Map<Character, TrieNode> subNodes = new HashMap<>();

        public boolean isKeywordEnd() {
return isKeywordEnd;
}

public void setKeywordEnd(boolean keywordEnd) {
isKeywordEnd = keywordEnd;
}

// 添加子节点
public void addSubNode(Character c, TrieNode node) {
subNodes.put(c, node);
}

// 获取子节点
public TrieNode getSubNode(Character c) {
return subNodes.get(c);
}

    }

}

4. 测试过滤敏感词功能

package com.wlnl.lanaer.service.api.mq;

import com.wlnl.lanaer.service.api.KxkdApiServiceApplication;
import com.wlnl.lanaer.service.api.util.BlogKeywordUtil;
import org.junit.jupiter.api.extension.ExtendWith;
import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
import org.springframework.boot.test.context.SpringBootTest;
import org.springframework.test.context.ActiveProfiles;
import org.springframework.test.context.junit.jupiter.SpringExtension;

@SpringBootTest(webEnvironment = SpringBootTest.WebEnvironment.RANDOM_PORT, classes = {KxkdApiServiceApplication.class})
@AutoConfigureMockMvc
@ExtendWith(SpringExtension.class)
@ActiveProfiles(profiles = "dev")
public class TestSensitiveWords {

    /**
     * Runs three sample sentences through the keyword utility and prints, for each sentence,
     * whether a keyword was detected followed by the filtered text.
     *
     * NOTE(review): BlogKeywordUtil is not defined in the visible source; presumably it exposes
     * the same hasKeyword/filter API as SensitiveUtil - confirm.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // Plain words, words split by symbols, and a sentence led by non-text symbols.
        String[] samples = {
                "这里可以赌博,可以嫖娼,可以吸毒,哈哈哈!",
                "这里可以-赌-博-,可以，嫖，娼，,可以=吸=毒=,哈哈哈!",
                "✔✔✔正规正规"
        };

        for (String sample : samples) {
            boolean found = BlogKeywordUtil.hasKeyword(sample);
            System.out.println("是否有关键词：" + found);
            System.out.println("过滤后的文本：" + BlogKeywordUtil.filter(sample));
        }
    }

}

执行结果

java hanlp 过滤停用词 java关键字过滤_java_02