Last active
November 9, 2022 11:19
-
-
Save zpf124/507811a9c4bdd91bf19c3916f11db1dd to your computer and use it in GitHub Desktop.
敏感词过滤代码(DFA算法)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package me.zfly.demo.sensitiveword; | |
import java.io.*; | |
import java.util.ArrayList; | |
import java.util.HashMap; | |
import java.util.List; | |
import java.util.Map; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
/**
 * Sensitive-word filter backed by a character trie.
 *
 * <p>Words are added with {@link #insert(String)}. Text can then be checked with
 * {@link #matches(String)} or scanned with {@link #search(String)}. While scanning,
 * punctuation, separators, symbols, marks and control characters are skipped, so an
 * obfuscated spelling such as {@code b*a*d} still hits the word {@code bad}.
 *
 * <p>This class performs no synchronization.
 */
public class Filter {

    /** A single trie node: the character it stands for, its children, and an end-of-word flag. */
    private static class TrieNode {
        char content;
        final Map<Character, TrieNode> childNodes;
        boolean end;

        TrieNode() {
            this.childNodes = new HashMap<>();
            this.end = false;
        }

        TrieNode(char content) {
            this();
            this.content = content;
        }
    }

    private final TrieNode root;
    private int size;
    private final Pattern pattern;

    /** Creates an empty filter. */
    public Filter() {
        // The regular expression recognises characters that are ignored while scanning
        // (e.g. "b*a*d"); it has some impact on performance.
        String regex = "[\\pP\\pZ\\pS\\pM\\pC]";
        this.pattern = Pattern.compile(regex);
        this.root = new TrieNode();
        this.size = 0;
    }

    /**
     * Returns the number of distinct words currently stored.
     *
     * @return the number of distinct words
     */
    public int size() {
        return size;
    }

    /**
     * Adds a word to the filter. {@code null}, empty, and already-present words are ignored,
     * so {@link #size()} always reflects the number of distinct words.
     *
     * @param word the word to add
     */
    public void insert(String word) {
        if (word == null || word.isEmpty()) {
            return;
        }
        TrieNode node = this.root;
        for (char c : word.toCharArray()) {
            TrieNode childNode = node.childNodes.get(c);
            if (childNode == null) {
                childNode = new TrieNode(c);
                node.childNodes.put(c, childNode);
            }
            node = childNode;
        }
        // Count the word only the first time it is seen.
        if (!node.end) {
            node.end = true;
            this.size++;
        }
    }

    /**
     * Adds every word of the list; see {@link #insert(String)}.
     *
     * @param words the words to add; {@code null} is treated as an empty list
     */
    public void insert(List<String> words) {
        if (words == null) {
            return;
        }
        for (String word : words) {
            insert(word);
        }
    }

    /** Removes all words from the filter. */
    public void clear() {
        this.root.childNodes.clear();
        this.size = 0;
    }

    /**
     * Tells whether the text, once ignorable characters have been stripped, starts with a
     * stored word. Only a word at the very beginning of the stripped text is detected; use
     * {@link #search(String)} to find words anywhere in the text.
     *
     * @param string the text to check; {@code null} yields {@code false}
     * @return {@code true} if the stripped text begins with a stored word
     */
    public boolean matches(String string) {
        if (string == null) {
            return false;
        }
        Matcher matcher = pattern.matcher(string);
        String str = matcher.replaceAll("");
        TrieNode node = root;
        for (char c : str.toCharArray()) {
            node = node.childNodes.get(c);
            if (node == null) {
                return false;
            }
            if (node.end) {
                return true;
            }
        }
        return false;
    }

    /** Returns whether the character is one that scanning skips over. */
    private boolean isIgnorable(char c) {
        return pattern.matcher(String.valueOf(c)).matches();
    }

    /**
     * Tries to match a stored word starting exactly at {@code fromIndex}.
     *
     * <p>Ignorable characters inside the match are skipped but counted in the returned
     * length. The scan stops at the first (shortest) stored word reached.
     *
     * @return the number of characters of {@code string} covered by the match, or 0 if there
     *         is no match or the match covers only a single character
     */
    private int search(String string, int fromIndex) {
        int length = string.length();
        // A match must start on a real character; otherwise the reported word would carry
        // leading punctuation or whitespace.
        if (fromIndex >= length || isIgnorable(string.charAt(fromIndex))) {
            return 0;
        }
        TrieNode node = root;
        int matchLength = 0;
        for (int i = fromIndex; i < length; i++) {
            char c = string.charAt(i);
            if (isIgnorable(c)) {
                matchLength++;
                continue;
            }
            node = node.childNodes.get(c);
            if (node == null) {
                return 0;
            }
            matchLength++;
            if (node.end) {
                // Single-character hits are deliberately not reported.
                return matchLength > 1 ? matchLength : 0;
            }
        }
        return 0;
    }

    /**
     * Scans the text from left to right and collects every occurrence of a stored word.
     * Each reported occurrence is the exact substring of the input, including any ignorable
     * characters embedded in it. Occurrences do not overlap.
     *
     * @param string the text to scan; {@code null} yields an empty list
     * @return the occurrences found, in order of appearance
     */
    public List<String> search(String string) {
        List<String> sensitiveWords = new ArrayList<>();
        if (string == null) {
            return sensitiveWords;
        }
        int length = string.length();
        int i = 0;
        while (i < length) {
            int matchLength = search(string, i);
            if (matchLength > 0) {
                sensitiveWords.add(string.substring(i, i + matchLength));
                i += matchLength;
            } else {
                i++;
            }
        }
        return sensitiveWords;
    }

    /**
     * Reads the word list from the file {@code words.dic} (UTF-8, one word per line) in the
     * current working directory.
     *
     * @return the lines of the file
     * @throws FileNotFoundException if {@code words.dic} does not exist or is not a regular file
     * @throws IOException if reading the file fails
     */
    private List<String> readSensitivewordFile() throws Exception {
        File file = new File("words.dic");
        // Check before opening: opening a missing file throws on its own, which would
        // otherwise hide this explicit error.
        if (!file.isFile()) {
            throw new FileNotFoundException("敏感词库文件不存在");
        }
        List<String> list = new ArrayList<>();
        try (BufferedReader bufferedReader = new BufferedReader(
                new InputStreamReader(new FileInputStream(file), "UTF-8"))) {
            String txt;
            while ((txt = bufferedReader.readLine()) != null) {
                list.add(txt);
            }
        }
        return list;
    }

    /** Demo: loads {@code words.dic}, scans a sample sentence and prints the hits and the elapsed time. */
    public static void main(String[] args) throws Exception {
        Filter filter = new Filter();
        List<String> list = filter.readSensitivewordFile();
        filter.insert(list);
        System.out.println("敏感词的数量:" + filter.size());
        String string = "太多的伤感情怀也许只局限于饲养基地 荧幕中的情节,主人公尝试着去用某种方式渐渐的很潇洒地释自杀指南怀那些自己经历的伤感。"
                + "然后法轮功 我们的扮演的角色就是跟随着主人公的喜怒哀乐而过于牵强的把自己的情感也附加于 成* 人* 小* 说银幕情节中,然后感动就流泪,"
                + "难过就躺在某一个人的怀里尽情的阐述心扉或者一个人一杯红酒一部电影在夜三级片 深人静的晚上,关上电话静静的发呆着。";
        System.out.println("待检测语句字数:" + string.length());
        long beginTime = System.currentTimeMillis();
        List<String> words = filter.search(string);
        long endTime = System.currentTimeMillis();
        System.out.println("语句中包含敏感词的个数为:" + words.size() + "。包含:" + words);
        System.out.println("总共消耗时间为:" + (endTime - beginTime));
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
更新增加一个敏感词为空判断以及记录敏感词数。