基于DFA算法的敏感词查找工具类

/**
* Java实现DFA算法进行敏感词过滤
*/
public class SensitiveWordUtil {
private final TrieNode root;

public SensitiveWordUtil() {
root = new TrieNode();
}

/**
* 添加敏感词
*
* @param word 词
*/
public void addWord(String word) {
TrieNode node = root;
for (char ch : word.toCharArray()) {
node = node.getChildren().computeIfAbsent(ch, c -> new TrieNode());
}
node.setEndOfWord(true);
}

/**
* 是否在敏感词库中
*
* @param word 词
*/
public boolean containsWord(String word) {
TrieNode node = root;
for (char ch : word.toCharArray()) {
node = node.getChildren().get(ch);
if (node == null) {
return false;
}
}
return node.isEndOfWord();
}

/**
* 是否存在敏感词
*
* @param text 文本
*/
public boolean containsSensitiveWord(String text) {
for (int i = 0; i < text.length(); i++) {
if (containsWordStartingFromIndex(text, i)) {
return true;
}
}
return false;
}

private boolean containsWordStartingFromIndex(String text, int startIndex) {
TrieNode node = root;
for (int i = startIndex; i < text.length(); i++) {
char ch = text.charAt(i);
node = node.getChildren().get(ch);
if (node == null) {
return false;
}
if (node.isEndOfWord()) {
return true;
}
}
return false;
}

/**
* 将敏感词替换为*
*
* @param text 文本
*/
public String filterWords(String text) {
StringBuilder filteredText = new StringBuilder();
StringBuilder word = new StringBuilder();
TrieNode node = root;

for (int i = 0; i < text.length(); i++) {
char ch = text.charAt(i);
node = node.getChildren().get(ch);

if (node != null) {
word.append(ch);
if (node.isEndOfWord()) {
char[] chars = new char[word.length()];
Arrays.fill(chars, '*');
filteredText.append(new String(chars));
word.setLength(0);
node = root;
}
} else {
filteredText.append(text.charAt(i));
node = root;
word.setLength(0);
}
}

filteredText.append(word);
return filteredText.toString();
}

public List<String> findSensitiveWords(String text) {
List<String> sensitiveWords = new ArrayList<>();
for (int i = 0; i < text.length(); i++) {
int wordEndIndex = findWordEndIndex(text, i);
if (wordEndIndex != -1) {
String word = text.substring(i, wordEndIndex + 1);
if (containsWord(word)) {
sensitiveWords.add(word);
}
i = wordEndIndex;
}
}
return sensitiveWords;
}

private int findWordEndIndex(String text, int startIndex) {
TrieNode node = root;
int endIndex = -1;
for (int i = startIndex; i < text.length(); i++) {
char ch = text.charAt(i);
node = node.getChildren().get(ch);
if (node == null) {
break;
}
if (node.isEndOfWord()) {
endIndex = i;
}
}
return endIndex;
}

private static class TrieNode {
private final Map<Character, TrieNode> children;
private boolean endOfWord;

public TrieNode() {
children = new HashMap<>();
}

public Map<Character, TrieNode> getChildren() {
return children;
}

public boolean isEndOfWord() {
return endOfWord;
}

public void setEndOfWord(boolean endOfWord) {
this.endOfWord = endOfWord;
}
}

public static void main(String[] args) throws IOException {
SensitiveWordUtil filter = new SensitiveWordUtil();
filter.addWord("敏感词1");
filter.addWord("敏感词2");
filter.addWord("敏感词3");

String text = "这是一个包含敏感词1的测试文本,敏感词2也在其中。";

String filteredText = filter.filterWords(text);
System.out.println("修正后的内容:" + filteredText);

List<String> words = filter.findSensitiveWords(text);
System.out.println("包含以下敏感词:" + words);

double asDouble = Arrays.stream(new int[10000]).mapToLong(o -> {
long dfaStartTime = System.nanoTime();
filter.containsSensitiveWord(text);
return (System.nanoTime() - dfaStartTime);
}).average().getAsDouble();
System.out.println("DFA算法耗时:" + asDouble + "纳秒");

// 使用正则表达式查找敏感词
double asDouble1 = Arrays.stream(new int[10000]).mapToLong(o -> {
long regexStartTime = System.nanoTime();
boolean regexResult = text.matches(".*(敏感词1|敏感词2|敏感词3).*");
return (System.nanoTime() - regexStartTime);
}).average().getAsDouble();
System.out.println("正则表达式耗时:" + asDouble1 + "纳秒");

System.out.println(asDouble1 / asDouble);

}
}