public class SensitiveWordUtil { private final TrieNode root;
public SensitiveWordUtil() { root = new TrieNode(); }
public void addWord(String word) { TrieNode node = root; for (char ch : word.toCharArray()) { node = node.getChildren().computeIfAbsent(ch, c -> new TrieNode()); } node.setEndOfWord(true); }
public boolean containsWord(String word) { TrieNode node = root; for (char ch : word.toCharArray()) { node = node.getChildren().get(ch); if (node == null) { return false; } } return node.isEndOfWord(); }
public boolean containsSensitiveWord(String text) { for (int i = 0; i < text.length(); i++) { if (containsWordStartingFromIndex(text, i)) { return true; } } return false; }
private boolean containsWordStartingFromIndex(String text, int startIndex) { TrieNode node = root; for (int i = startIndex; i < text.length(); i++) { char ch = text.charAt(i); node = node.getChildren().get(ch); if (node == null) { return false; } if (node.isEndOfWord()) { return true; } } return false; }
public String filterWords(String text) { StringBuilder filteredText = new StringBuilder(); StringBuilder word = new StringBuilder(); TrieNode node = root;
for (int i = 0; i < text.length(); i++) { char ch = text.charAt(i); node = node.getChildren().get(ch);
if (node != null) { word.append(ch); if (node.isEndOfWord()) { char[] chars = new char[word.length()]; Arrays.fill(chars, '*'); filteredText.append(new String(chars)); word.setLength(0); node = root; } } else { filteredText.append(text.charAt(i)); node = root; word.setLength(0); } }
filteredText.append(word); return filteredText.toString(); }
public List<String> findSensitiveWords(String text) { List<String> sensitiveWords = new ArrayList<>(); for (int i = 0; i < text.length(); i++) { int wordEndIndex = findWordEndIndex(text, i); if (wordEndIndex != -1) { String word = text.substring(i, wordEndIndex + 1); if (containsWord(word)) { sensitiveWords.add(word); } i = wordEndIndex; } } return sensitiveWords; }
private int findWordEndIndex(String text, int startIndex) { TrieNode node = root; int endIndex = -1; for (int i = startIndex; i < text.length(); i++) { char ch = text.charAt(i); node = node.getChildren().get(ch); if (node == null) { break; } if (node.isEndOfWord()) { endIndex = i; } } return endIndex; }
private static class TrieNode { private final Map<Character, TrieNode> children; private boolean endOfWord;
public TrieNode() { children = new HashMap<>(); }
public Map<Character, TrieNode> getChildren() { return children; }
public boolean isEndOfWord() { return endOfWord; }
public void setEndOfWord(boolean endOfWord) { this.endOfWord = endOfWord; } }
public static void main(String[] args) throws IOException { SensitiveWordUtil filter = new SensitiveWordUtil(); filter.addWord("敏感词1"); filter.addWord("敏感词2"); filter.addWord("敏感词3");
String text = "这是一个包含敏感词1的测试文本,敏感词2也在其中。";
String filteredText = filter.filterWords(text); System.out.println("修正后的内容:" + filteredText);
List<String> words = filter.findSensitiveWords(text); System.out.println("包含以下敏感词:" + words);
double asDouble = Arrays.stream(new int[10000]).mapToLong(o -> { long dfaStartTime = System.nanoTime(); filter.containsSensitiveWord(text); return (System.nanoTime() - dfaStartTime); }).average().getAsDouble(); System.out.println("DFA算法耗时:" + asDouble + "纳秒");
double asDouble1 = Arrays.stream(new int[10000]).mapToLong(o -> { long regexStartTime = System.nanoTime(); boolean regexResult = text.matches(".*(敏感词1|敏感词2|敏感词3).*"); return (System.nanoTime() - regexStartTime); }).average().getAsDouble(); System.out.println("正则表达式耗时:" + asDouble1 + "纳秒");
System.out.println(asDouble1 / asDouble);
} }
|