java 关键词过滤关键字过滤

转载

bingfeng 2023-07-21 18:01:13

文章标签 java 关键词过滤关键字过滤算法字符串散列表 文章分类 Java 后端开发

因为过滤关键字机制到处可见，于是聪明的网友就会想到各种各样的方法突破，例如：
1、中文会用繁体字的方法避开关键字扫描
2、在关键字中间插入无意义的特殊字符，例如 * & # @ 等，而且个数可变
3、使用谐音或拆字法变换关键字
在实现自己的算法时也有些问题：
4、随着时间推移，关键字列表会越来越大，有些论坛常用的正则表达式N次扫描的方法显得效率很低。
5、关键字有不同的严重级别，有些需要禁止，有些只需要替换，还有一些可能记录一下即可。
针对这些问题，可采用的应对方法：
1、加载关键字列表时，将所有的关键字转换成繁体字一份，以扫描繁体版的关键字；
这个转换工作只需一句就可以实现了：

• s=Microsoft.VisualBasic.Strings.StrConv(word, Microsoft.VisualBasic.VbStrConv.TraditionalChinese, 0);

2、在扫描原文本时，如果遇到关键字的首个文字，忽略其后的特殊字符，直到下一个有意义的文字为止，当然这里需要在定义关键字列表时指定哪些才需要这样扫描，并不是所有关键字都采用这种方式；
例如关键字“你好”经常会被人输入成“你x好”或者“你xxxxx好”，那么在关键字列表里就需要定义成“你*好”；在匹配关键字时，如果遇到星号，就跳过原文本中紧随其后的所有特殊字符，直到下一个有意义的文字为止。
3、遇到谐音和拆字时，没什么好办法了，只好将这些谐音词和拆分词也加入到关键字列表。
4、不用正则表达式或者 String.IndexOf 方法，可以将所有首字相同的关键字组成一个个小组，然后再将首字放到一个散列表（Hashtable/Dictionary<TKey, TValue>）里；在扫描原文本时先在散列表里查找，如果碰到了首字再扫描同组的关键字，这样简单处理一下效率可以提高很多。
还有一个比用散列表更好的方法：将散列表改成一个大小为 char.MaxValue + 1 的数组（这样每个 char 值都有对应的下标），然后将首个文字转成int，即char->int，再将关键词集合放到相应下标里。这样在扫描原文本时，将被扫描的字符转成int，然后试探数组相应下标的元素是否不为NULL。这样比用散列表会更快一些。
5、在定义关键字时，同时给一个“级别”属性，例如使用 E,R,B分别表示只记录、替换、禁止等情况。
于是关键字的列表如下所示：
你滚 E
他niang的 R
这里贴一下关键的部分代码：

• Code    
/// <summary>
/// Keyword groups indexed by the normalized code (see <c>fastToLower</c>) of the
/// first character of each keyword. A null slot means no keyword starts with that character.
/// NOTE(review): the table is presumably allocated with char.MaxValue + 1 slots by the
/// loader, which is not visible here; Filter tolerates a shorter table.
/// </summary>
private WordGroup[] _wordTable;

/// <summary>
/// Scans <paramref name="source"/> for every registered keyword and masks the ones
/// whose handling type is Replace or Banned.
/// </summary>
/// <param name="source">
/// Text to scan. When at least one Replace/Banned keyword is found, it is overwritten
/// with a copy in which every matched character is <paramref name="replaceChar"/>;
/// otherwise it is left untouched.
/// </param>
/// <param name="replaceChar">Character written over each matched character.</param>
/// <returns>
/// The highest handling type among all matched keywords, or FilterResult.Pass when
/// nothing matched (or the text is null/empty, or no keyword table is loaded).
/// </returns>
public FilterResult Filter(ref string source, char replaceChar)
{
    if (String.IsNullOrEmpty(source) || _wordTable == null)
    {
        return FilterResult.Pass;
    }

    FilterResult result = FilterResult.Pass;
    char[] masked = null; // allocated lazily, only when something must be replaced

    for (int start = 0; start < source.Length; start++)
    {
        int key = fastToLower(source[start]);
        if (key >= _wordTable.Length)
        {
            continue; // table has no slot for this character, so no keyword starts with it
        }

        WordGroup group = _wordTable[key];
        if (group == null)
        {
            continue;
        }

        for (int idx = 0; idx < group.Count; idx++)
        {
            WordEntity entry = group.GetItem(idx);

            // entry.Word is matched from the character AFTER the first one; an empty
            // Word means the single first character alone is the keyword.
            int matchLength = 0;
            if (entry.Word.Length != 0 && !checkString(source, entry.Word, start + 1, out matchLength))
            {
                continue;
            }

            FilterResult handling = entry.HandleType;
            if (handling > result)
            {
                result = handling; // remember the most severe handling seen so far
            }

            if (handling == FilterResult.Replace || handling == FilterResult.Banned)
            {
                if (masked == null)
                {
                    masked = source.ToCharArray();
                }

                // Mask the first character plus the matchLength characters that follow it.
                // Matching keeps reading the untouched source, so overlapping keywords
                // are all masked.
                for (int pos = 0; pos <= matchLength; pos++)
                {
                    masked[start + pos] = replaceChar;
                }
            }
        }
    }

    // Write back only when a buffer was actually produced; new string((char[])null)
    // would silently turn the caller's text into an empty string.
    if (masked != null)
    {
        source = new string(masked);
    }

    return result;
}
•   
/// <summary>
/// Tests whether <paramref name="keyword"/> occurs in <paramref name="source"/>
/// beginning at <paramref name="sourceStart"/>.
/// </summary>
/// <param name="source">Text being scanned.</param>
/// <param name="keyword">
/// Pattern to match. Each non-'*' character is compared against the normalized
/// (fastToLower) source character, so the pattern itself must already be in that
/// normalized form. A '*' skips any run (possibly empty) of ignorable source characters.
/// NOTE(review): a pattern whose last character is '*' can never report a match,
/// because success is only signalled on a literal final character.
/// </param>
/// <param name="sourceStart">Index in <paramref name="source"/> where matching begins.</param>
/// <param name="matchLength">
/// On success, the number of source characters consumed starting at
/// <paramref name="sourceStart"/> (skipped characters included); 0 when there is no match.
/// </param>
/// <returns>true when the whole keyword matched; otherwise false.</returns>
private bool checkString(string source, string keyword, int sourceStart, out int matchLength)
{
    bool found = false;
    int sourceOffset = 0;

    for (int keyIndex = 0; keyIndex < keyword.Length; keyIndex++)
    {
        int sourceIndex = sourceStart + sourceOffset;
        if (sourceIndex >= source.Length)
        {
            break; // ran out of source text before the keyword was complete
        }

        if (keyword[keyIndex] == '*')
        {
            // Wildcard: step over every ignorable character in the source.
            while (sourceStart + sourceOffset < source.Length
                   && isIgnorableCharacter_CN(source[sourceStart + sourceOffset]))
            {
                sourceOffset++;
            }
            continue;
        }

        if (fastToLower(source[sourceIndex]) != (int)keyword[keyIndex])
        {
            break; // literal mismatch
        }

        if (keyIndex == keyword.Length - 1)
        {
            // Last keyword character matched; sourceOffset still points at it.
            found = true;
            break;
        }

        sourceOffset++;
    }

    // Honor the documented contract: length of the matched text on success, 0 otherwise.
    matchLength = found ? sourceOffset + 1 : 0;
    return found;
}
•   
/// <summary>
/// Maps ASCII upper-case letters and full-width Latin letters (both cases) to the
/// code of the corresponding ASCII lower-case letter. Every other character is
/// returned as its own code, unchanged.
/// </summary>
/// <param name="character">Character to normalize.</param>
/// <returns>The normalized character code.</returns>
private int fastToLower(char character)
{
    int code = character;

    // ASCII 'A'..'Z'
    if (code >= 'A' && code <= 'Z')
    {
        return code - 'A' + 'a';
    }

    // Full-width upper-case U+FF21..U+FF3A
    if (code >= 0xFF21 && code <= 0xFF3A)
    {
        return code - 0xFF21 + 'a';
    }

    // Full-width lower-case U+FF41..U+FF5A
    if (code >= 0xFF41 && code <= 0xFF5A)
    {
        return code - 0xFF41 + 'a';
    }

    return code;
}
•   
/// <summary>
/// Reports whether a character may be skipped when a keyword wildcard is being matched.
/// Anything outside the range U+4E00..U+9FA5 (the CJK ideograph range this filter
/// recognizes) counts as ignorable.
/// </summary>
/// <param name="character">Character to classify.</param>
/// <returns>true when the character lies outside U+4E00..U+9FA5; otherwise false.</returns>
private bool isIgnorableCharacter_CN(char character)
{
    const int FirstIdeograph = 0x4e00;
    const int LastIdeograph = 0x9fa5;

    int code = character;
    return code < FirstIdeograph || code > LastIdeograph;
}
•   
/// <summary>
/// A single filter-word entry: the text to match and how a match is handled.
/// </summary>
class WordEntity
{
    /// <summary>Keyword text used for matching.</summary>
    public string Word { get; set; }

    /// <summary>Handling applied when this keyword matches.</summary>
    public FilterResult HandleType { get; set; }
}
•   
/// <summary>
/// A group of filter words; used to hold the keywords that start with the same character.
/// </summary>
class WordGroup
{
    // Entries in insertion order.
    private List<WordEntity> _words = new List<WordEntity>();

    /// <summary>Number of entries currently in the group.</summary>
    public int Count
    {
        get { return _words.Count; }
    }

    /// <summary>Returns the entry stored at the given zero-based position.</summary>
    /// <param name="index">Position of the entry in insertion order.</param>
    public WordEntity GetItem(int index)
    {
        return _words[index];
    }

    /// <summary>Wraps the given text and handling type in a new entry and appends it.</summary>
    /// <param name="word">Keyword text.</param>
    /// <param name="handleType">Handling applied when the keyword matches.</param>
    public void AppendWord(string word, FilterResult handleType)
    {
        WordEntity entry = new WordEntity();
        entry.Word = word;
        entry.HandleType = handleType;
        AppendWord(entry);
    }

    /// <summary>Appends an existing entry to the end of the group.</summary>
    /// <param name="word">Entry to add.</param>
    public void AppendWord(WordEntity word)
    {
        _words.Add(word);
    }
}

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。