网站关键词过滤屏蔽策略

蝈蝈大侠

发布于 2021-11-20 02:08:18

背景：
网站接受官方内容审查时，需要对页面中的敏感、暴恐、广告、涉黄等不良关键词进行检索与过滤。

代码（优化版本）：

/**
 * Keyword filter that masks blocked keywords inside the text nodes of a DOM subtree.
 *
 * The match keywords are compiled once into a single alternation regular
 * expression (longest keyword first). A match is left untouched when any
 * exclude keyword occurs in a small window of text around it.
 *
 * @param {Array} matchKeywords Keywords to mask. Non-string and blank entries are ignored.
 * @param {Array} [excludeKeywords] Keywords that protect a nearby match from masking
 *     (typically longer words that contain a match keyword).
 * @param {Object} [options] Configuration.
 * @param {boolean} [options.caseSensitive=false] Match keywords case-sensitively.
 * @param {boolean} [options.useRegExp=true] Stored on `this.options`; no method of this class reads it.
 * @param {string} [options.replacement='[已屏蔽]'] Text written in place of a masked keyword.
 * @param {number} [options.contextRadius=2] Number of characters on each side of a match
 *     that are searched for exclude keywords.
 * @throws {Error} If `matchKeywords` is not an array.
 **/
class KeywordFilter {
    constructor(matchKeywords, excludeKeywords, options = {}) {
        // Argument validation
        if (!Array.isArray(matchKeywords)) {
            throw new Error('matchKeywords must be an array');
        }

        this.matchKeywords = this.prepareKeywords(matchKeywords);
        this.excludeKeywords = this.prepareKeywords(excludeKeywords || []);
        this.options = {
            caseSensitive: false,
            useRegExp: true,
            replacement: '[已屏蔽]',
            contextRadius: 2,
            ...options
        };

        // Compile the regular expressions once, up front
        this.prepareRegExps();

        // Counters and timestamps of the most recent filter() run
        this.performance = {
            startTime: 0,
            endTime: 0,
            nodesProcessed: 0,
            replacements: 0
        };
    }

    /**
     * Normalizes a keyword list: removes duplicates, drops non-string and
     * blank entries, and sorts longest first so that longer keywords win in
     * the regex alternation.
     *
     * @param {Iterable} keywords Raw keyword list.
     * @returns {string[]} Cleaned keywords, longest first.
     */
    prepareKeywords(keywords) {
        return [...new Set(keywords)]
            .filter(kw => typeof kw === 'string' && kw.trim() !== '')
            .sort((a, b) => b.length - a.length);
    }

    /**
     * Builds `matchRegExp` and, when exclude keywords exist, `excludeRegExp`.
     * Both stay `null` when there is nothing to match.
     */
    prepareRegExps() {
        this.matchRegExp = null;
        this.excludeRegExp = null;

        if (this.matchKeywords.length === 0) return;

        const toPattern = keywords => keywords
            .map(kw => this.escapeRegExp(kw))
            .join('|');

        // Global flag: processTextNode() iterates over matches with exec()
        this.matchRegExp = new RegExp(
            toPattern(this.matchKeywords),
            this.options.caseSensitive ? 'g' : 'gi'
        );

        if (this.excludeKeywords.length > 0) {
            // No global flag: this regex is only used with test(), and a global
            // regex would carry lastIndex from one test() call into the next.
            this.excludeRegExp = new RegExp(
                toPattern(this.excludeKeywords),
                this.options.caseSensitive ? '' : 'i'
            );
        }
    }

    /**
     * Escapes regular-expression metacharacters so a keyword matches literally.
     *
     * @param {string} string Keyword text.
     * @returns {string} Text safe to embed in a RegExp source.
     */
    escapeRegExp(string) {
        return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    }

    /**
     * Runs the filter over a DOM subtree and reports statistics.
     *
     * @param {Node} [rootNode] Subtree root to filter; `document.body` when omitted.
     * @returns {{success: boolean, nodesProcessed?: number, replacements?: number, time?: number, error?: string}}
     *     Statistics on success, or `{ success: false, error }` when processing threw.
     */
    filter(rootNode) {
        this.performance.startTime = performance.now();
        this.performance.nodesProcessed = 0;
        this.performance.replacements = 0;

        try {
            this.processNode(rootNode || document.body);

            this.performance.endTime = performance.now();
            this.logPerformance();

            return {
                success: true,
                nodesProcessed: this.performance.nodesProcessed,
                replacements: this.performance.replacements,
                time: this.performance.endTime - this.performance.startTime
            };
        } catch (error) {
            console.error('Keyword filtering error:', error);
            return {
                success: false,
                error: error.message
            };
        }
    }

    /**
     * Collects the candidate text nodes below `rootNode` with a TreeWalker and
     * masks keywords in each of them.
     *
     * @param {Node} rootNode Subtree root.
     */
    processNode(rootNode) {
        // Nothing to match: skip the traversal entirely
        if (!this.matchRegExp) return;

        // Keywords are sorted longest first, so the last one is the shortest
        const shortestKeyword = this.matchKeywords[this.matchKeywords.length - 1].length;
        const skippedParents = ['SCRIPT', 'STYLE', 'IFRAME'];

        const treeWalker = document.createTreeWalker(
            rootNode,
            NodeFilter.SHOW_TEXT,
            {
                acceptNode: node => {
                    // Skip text that lives directly inside script/style/iframe
                    // elements (tag name compared case-insensitively)
                    const parent = node.parentNode;
                    if (parent && skippedParents.includes(String(parent.nodeName).toUpperCase())) {
                        return NodeFilter.FILTER_REJECT;
                    }

                    // Skip text that cannot contain a keyword: empty, blank,
                    // or shorter than the shortest keyword
                    const text = node.textContent;
                    if (!text || text.length < shortestKeyword || text.trim() === '') {
                        return NodeFilter.FILTER_REJECT;
                    }

                    return NodeFilter.FILTER_ACCEPT;
                }
            }
        );

        // Collect first, then modify, so the walker never runs over nodes
        // whose content is being rewritten
        const nodesToProcess = [];
        let currentNode;
        while ((currentNode = treeWalker.nextNode())) {
            nodesToProcess.push(currentNode);
        }

        nodesToProcess.forEach(node => {
            this.processTextNode(node);
            this.performance.nodesProcessed++;
        });
    }

    /**
     * Masks every non-excluded keyword occurrence in one text node and
     * increments `performance.replacements` once per masked occurrence.
     *
     * @param {{textContent: string}} textNode Text node to rewrite in place.
     */
    processTextNode(textNode) {
        const pattern = this.matchRegExp;
        if (!pattern) return;

        const replacement = String(this.options.replacement);
        let text = textNode.textContent || '';
        let modified = false;

        // The regex is shared between nodes; always start from the beginning
        pattern.lastIndex = 0;

        let match;
        while ((match = pattern.exec(text)) !== null) {
            const matchedText = match[0];

            // Leave the match alone when an exclude keyword is nearby
            if (this.shouldExclude(matchedText, text, match.index)) {
                continue;
            }

            text = text.substring(0, match.index) +
                   replacement +
                   text.substring(match.index + matchedText.length);

            // Resume right after the inserted replacement so its own
            // characters are never scanned for keywords again
            pattern.lastIndex = match.index + replacement.length;

            modified = true;
            this.performance.replacements++;
        }

        if (modified) {
            textNode.textContent = text;
        }
    }

    /**
     * Decides whether a match must be kept because an exclude keyword occurs
     * in the text window around it (the match itself plus `contextRadius`
     * characters on each side).
     *
     * @param {string} matchedText The matched keyword text.
     * @param {string} fullText Text the match was found in.
     * @param {number} matchIndex Start index of the match in `fullText`.
     * @returns {boolean} `true` when the match should not be masked.
     */
    shouldExclude(matchedText, fullText, matchIndex) {
        if (!this.excludeRegExp) return false;

        const configured = this.options.contextRadius;
        const radius = Number.isInteger(configured) && configured >= 0 ? configured : 2;

        const contextStart = Math.max(0, matchIndex - radius);
        const contextEnd = Math.min(fullText.length, matchIndex + matchedText.length + radius);
        const context = fullText.substring(contextStart, contextEnd);

        // Defensive reset in case a stateful (global/sticky) regex was assigned
        this.excludeRegExp.lastIndex = 0;
        return this.excludeRegExp.test(context);
    }

    /** Logs the statistics of the most recent run to the console. */
    logPerformance() {
        const time = this.performance.endTime - this.performance.startTime;
        console.log(`关键词过滤完成:
          - 处理节点: ${this.performance.nodesProcessed}
          - 替换次数: ${this.performance.replacements}
          - 耗时: ${time.toFixed(2)}ms`);
    }
}

示例用法：

// Example usage: filter the page once the DOM is ready, and wire up an
// optional "runTest" button that runs a small performance test.
document.addEventListener('DOMContentLoaded', function() {
            const matchKeywords = ["首", "NO.1", "100%", "百度"];
            // NOTE(review): the exclude check also searches the matched text itself,
            // so with "%" excluded a "100%" match is never masked — confirm this is intended.
            const excludeKeywords = ["首页", "%"];

            // Create the filter and run it once over the page
            const filter = new KeywordFilter(matchKeywords, excludeKeywords);
            filter.filter();

            // The test button is optional: without this guard a page that has no
            // #runTest element would throw on null.addEventListener
            const runTestButton = document.getElementById('runTest');
            if (!runTestButton) {
                return;
            }

            runTestButton.addEventListener('click', function() {
                // Build a larger amount of text for the performance test
                const testContent = document.createElement('div');
                testContent.innerHTML = `
                    <p>欢迎来到首页，我们提供NO.1的服务，100%的满意度保证。通过百度搜索可以找到我们。</p>
                    <p>这是首段测试文本，包含多个关键词如NO.1和100%等。</p>
                    <p>百度是中国最大的搜索引擎，首页设计简洁大方。</p>
                    <p>我们的产品质量达到100%合格率，服务排名NO.1。</p>
                `.repeat(50); // repeated 50 times to increase the amount of test data

                document.body.appendChild(testContent);

                try {
                    // Run the performance test
                    const startTime = performance.now();
                    const testFilter = new KeywordFilter(matchKeywords, excludeKeywords);
                    const testResult = testFilter.filter();
                    const endTime = performance.now();

                    // A failed run has no counters to report
                    if (!testResult.success) {
                        alert(`性能测试失败: ${testResult.error}`);
                        return;
                    }

                    alert(`性能测试完成!\n处理节点: ${testResult.nodesProcessed}\n替换次数: ${testResult.replacements}\n耗时: ${(endTime - startTime).toFixed(2)}ms`);
                } finally {
                    // Always remove the temporary test content, even if the test threw
                    document.body.removeChild(testContent);
                }
            });
        });

非特殊说明，本站 idealfrog.cn 上的文章均由本站作者原创，原作品版权归属原作者，转载请联系 @文章作者授权。转载时请在文首注明，来源 idealfrog.cn 及教程作者，并附本文链接。谢谢各位编辑同仁配合。

下一篇>

没有下一篇咯~

发表评论~~点击登录~~ ，秀出你的神评！

暂无更多评论咯，快留下你的真知灼见吧！去参与评论

・蜀ICP备19018981号

查询 5 次，耗时 0.3348 秒