敏感词校验
場景:敏感詞存在數據庫中,也緩存到redis中,判斷評論信息是否包含敏感詞。
方案一:用字符串charAt對比。
方案二:DFA算法
由于需求只是判斷評論中是否存在敏感詞,一旦命中即可直接返回結果,不需要找出全部敏感詞,因此用charAt逐字比對的方法速度快、占用內存小,方案一比較適合。
方案二適合需要找出文本中包含的所有敏感詞、并將其輸出或替換的場景,查找效率高;但是詞庫所占內存與敏感詞數量成正比。
方案一代碼:
思路:其實就是判斷一段文字字符串中是否包含某個敏感詞。
/*** * Desc:(敏感詞校驗)container是否包含child* @author wangdeqiu* @date 2018年10月18日 下午4:28:41* @param text :要判斷的文字信息* @return childWord:敏感詞*/public static boolean str1ContainsStr2(String text, String childWord) {if (text.length() < childWord.length()) {return false;}int maxLength = text.length();char first = childWord.charAt(0);int i = -1;while (++i < maxLength) {// 第一個是否能找到if (first == text.charAt(i)) {// 如果找到匹配之后的元素if (mathStr(text.substring(i), childWord)) {return true;}}}return false;}private static boolean mathStr(String max, String min) {if (max.length() < min.length()) {return false;}int i = -1;int minLength = min.length();while (++i < minLength) {if (min.charAt(i) != max.charAt(i)) {return false;}}return true;}方案二代碼:
1.敏感詞庫初始化(將敏感詞用DFA算法的原理封裝到敏感詞庫中,敏感詞庫采用HashMap保存),代碼如下:
package com.haidaipuhui.service.SensitiveWord;import java.util.*;/*** @author wangdeqiu* @date 2018/11/6* @Dsecription: 敏感詞庫初始化*/ @SuppressWarnings("rawtypes") public class SensitiveWordInit {//敏感詞庫public Map sensitiveWordMap;/***@describe: 初始化敏感詞*@author wangdeqiu*@date 2018/11/6 下午1:46*@param sensitiveWords:redis/數據庫中獲取的敏感詞**/public Map initKeyWord(List<String> sensitiveWords){try{// 從敏感詞集合對象中取出敏感詞并封裝到Set集合中Set<String> keyWordSet = new HashSet<String>();for(String s:sensitiveWords){keyWordSet.add(s.trim());}//將詞庫放到hashmap中addSensitiveWordToHashMap(keyWordSet);}catch(Exception e){e.printStackTrace();}return sensitiveWordMap;}/***@describe: 封裝敏感詞庫*@author wangdeqiu*@date 2018/11/6 下午1:47*@param keyWordSet :初始后的敏感詞**/@SuppressWarnings("unchecked")public void addSensitiveWordToHashMap(Set<String> keyWordSet){// 初始化HashMap對象并控制容器的大小sensitiveWordMap = new HashMap(keyWordSet.size());// 敏感詞String key = null;// 用來按照相應的格式保存敏感詞庫數據Map nowMap = null;// 用來輔助構建敏感詞庫Map<String,String> newWorMap = null;// 使用一個迭代器來循環敏感詞集合Iterator<String> iterator = keyWordSet.iterator();while (iterator.hasNext()){key = iterator.next();// 等于敏感詞庫,HashMap對象在內存中占用的是同一個地址,所以此nowMap對象的變化,sensitiveWordMap對象也會跟著改變nowMap = sensitiveWordMap;for(int i=0;i<key.length();i++){// 截取敏感詞當中的字,在敏感詞庫中字為HashMap對象的Key鍵值char keyChar = key.charAt(i);// 判斷這個字是否存在于敏感詞庫中Object wordMap = nowMap.get(keyChar);if (wordMap != null){nowMap = (Map) wordMap;}else{newWorMap = new HashMap<String, String>();newWorMap.put("isEnd", "0");nowMap.put(keyChar, newWorMap);nowMap = newWorMap;}// 如果該字是當前敏感詞的最后一個字,則標識為結尾字if (i == key.length() - 1){nowMap.put("isEnd", "1");}System.out.println("封裝敏感詞庫過程:"+sensitiveWordMap);}System.out.println("查看敏感詞庫數據:" + sensitiveWordMap);}}}2.寫一個敏感詞過濾工具類,里面可以寫上自己需要的方法,代碼如下
package com.haidaipuhui.service.SensitiveWord;import java.util.*;import org.springframework.stereotype.Component;import com.haidaipuhui.util.SystemUtil;/*** @author wangdeqiu* @date 2018/11/6* @Dsecription: 敏感詞過濾類*/ @SuppressWarnings("rawtypes") @Component public class SensitivewordTool {//敏感詞庫public static Map sensitiveWordMap;//只過濾最小敏感詞 //最小匹配規則,如:敏感詞庫["中國","中國人"],語句:"我是中國人",匹配結果:我是[中國]人public static int minMatchTYpe = 1;//過濾所有敏感詞 //最大匹配規則,如:敏感詞庫["中國","中國人"],語句:"我是中國人",匹配結果:我是[中國人]public static int maxMatchTYpe = 2;/***@describe: 敏感詞庫敏感詞的數量*@author wangdeqiu*@date 2018/11/6 下午2:02*@param**/public static int getWordSize(){if(SensitivewordTool.sensitiveWordMap==null){return 0;}return SensitivewordTool.sensitiveWordMap.size();}/***@describe: 是否包含敏感詞*@author wangdeqiu*@date 2018/11/6 下午2:04*@param * */ public static boolean isContaintSensitiveWord(String txt,int matchType){boolean flag = false;for(int i=0;i<txt.length();i++){int matchFlag = checkSensitiveWord(txt, i, matchType);if(matchFlag>0){return true;}}return flag;}/***@describe: 獲取敏感詞內容*@author wangdeqiu*@date 2018/11/6 下午2:07*@param**/public static Set<String> getSensitiveWord(String txt,int matchType){Set<String> sensitiveWordList = new HashSet<String>();for (int i = 0; i < txt.length(); i++){int length = checkSensitiveWord(txt, i, matchType);if (length > 0){// 將檢測出的敏感詞保存到集合中sensitiveWordList.add(txt.substring(i, i + length));i = i + length - 1;}}return sensitiveWordList;}/***@describe: 替換敏感詞*@author wangdeqiu*@date 2018/11/6 下午2:37*@param**/public static String replaceSensitiveWord(String txt, int matchType, String replaceChar){String resultTxt = txt;Set<String> set = getSensitiveWord(txt, matchType);Iterator<String> iterator = set.iterator();String word = null;String replaceString = null;while (iterator.hasNext()){word = iterator.next();replaceString = getReplaceChars(replaceChar, word.length());resultTxt = resultTxt.replaceAll(word, replaceString);}return resultTxt;}/***@describe: 替換敏感詞內容*@author 
wangdeqiu*@date 2018/11/6 下午2:11*@param**/private static String getReplaceChars(String replaceChar, int length){String resultReplace = replaceChar;for (int i = 1; i < length; i++){resultReplace += replaceChar;}return resultReplace;}/***@describe: 檢查敏感詞數量*@author wangdeqiu*@date 2018/11/6 下午2:11*@param**/public static int checkSensitiveWord(String txt,int beginIndex,int matchType){boolean flag = false;// 記錄敏感詞數量int matchFlag = 0;char word = 0;Map nowMap = SensitivewordTool.sensitiveWordMap;for (int i = beginIndex; i < txt.length(); i++){word = txt.charAt(i);// 判斷該字是否存在于敏感詞庫中nowMap = (Map) nowMap.get(word);if (nowMap != null){matchFlag++;// 判斷是否是敏感詞的結尾字,如果是結尾字則判斷是否繼續檢測if ("1".equals(nowMap.get("isEnd"))){flag = true;// 判斷過濾類型,如果是小過濾則跳出循環,否則繼續循環if (SensitivewordTool.minMatchTYpe == matchType){break;}}}else{break;}}if (!flag){matchFlag = 0;}return matchFlag;}/*** * Desc:敏感詞過濾* @author wangdeqiu* @date 2018年11月7日 下午2:24:25* @param* @return*/public static Set<String> sensitiveWordFiltering(String text){// 初始化敏感詞庫對象SensitiveWordInit sensitiveWordInit = new SensitiveWordInit();// 從數據庫中獲取敏感詞對象集合(調用的方法來自Dao層,此方法是service層的實現類)List<String> sensitiveWords =SystemUtil.getWordToRedis();// 構建敏感詞庫Map sensitiveWordMap = sensitiveWordInit.initKeyWord(sensitiveWords);// 傳入SensitivewordEngine類中的敏感詞庫SensitivewordTool.sensitiveWordMap = sensitiveWordMap;// 得到敏感詞有哪些,傳入2表示獲取所有敏感詞Set<String> set = SensitivewordTool.getSensitiveWord(text, 2);return set;}/*** * Desc:判斷是否存在敏感詞* @author wangdeqiu* @date 2018年11月7日 下午2:24:38* @param* @return*/public static boolean isExitSensitiveWord(String text){// 初始化敏感詞庫對象SensitiveWordInit sensitiveWordInit = new SensitiveWordInit();// 從數據庫中獲取敏感詞對象集合(調用的方法來自Dao層,此方法是service層的實現類)List<String> sensitiveWords =SystemUtil.getWordToRedis();// 構建敏感詞庫Map sensitiveWordMap = sensitiveWordInit.initKeyWord(sensitiveWords);// 傳入SensitivewordEngine類中的敏感詞庫SensitivewordTool.sensitiveWordMap = sensitiveWordMap;// 得到敏感詞有哪些,傳入1表示獲取所有敏感詞boolean isExit= 
SensitivewordTool.isContaintSensitiveWord(text, 1);return isExit;}}參考:https://www.jb51.net/article/116752.htm
總結
- 上一篇: SGU495 Kids and Priz
- 下一篇: 面向对象之:继承