文章目录
-
- 在当今信息爆炸的互联网环境中,内容创作已成为网站运营的核心。然而,随着内容数量的急剧增加,抄袭和内容重复问题也日益严重。对于WordPress网站管理员和内容创作者而言,保护原创内容不仅是维护品牌声誉的需要,更是提升搜索引擎排名、吸引忠实读者的关键。 传统的防抄袭方法主要依赖人工检查或基础文本比对,效率低下且难以应对海量内容。本教程将指导您通过WordPress代码二次开发,集成智能化的内容相似度检测与防抄袭系统,将您的网站提升到一个新的智能化水平。
-
- 在开始开发之前,我们需要明确系统应具备的核心功能: 实时内容检测:在文章发布时自动检测内容相似度 批量历史内容扫描:对网站现有内容进行全面检查 智能相似度算法:采用先进的文本相似度计算方法 外部资源比对:能够与互联网上的公开内容进行比对 可视化报告系统:直观展示检测结果和相似度分析 自动化处理机制:根据预设规则自动处理疑似抄袭内容
- 我们将采用以下技术方案: WordPress钩子机制:利用save_post、publish_post等动作钩子实现自动化检测 PHP文本处理库:使用PHP内置函数和扩展进行文本预处理 相似度算法:实现余弦相似度、Jaccard相似系数和编辑距离算法 外部API集成:通过第三方原创检测API增强检测能力 数据库优化:合理设计数据表结构,确保系统性能 前端展示:使用AJAX和Chart.js实现交互式报告界面
- 用户发布内容 → WordPress钩子触发 → 文本预处理 → 特征提取 → 相似度计算 → 结果评估 → 数据库存储 → 报告生成 → 用户通知
-
- 首先,确保您的开发环境满足以下要求: WordPress 5.0或更高版本 PHP 7.3或更高版本(支持mbstring、curl扩展) MySQL 5.6或更高版本 至少100MB的可用磁盘空间(用于存储文本指纹和缓存)
- 我们将创建一个独立的WordPress插件来实现所有功能: 在wp-content/plugins/目录下创建新文件夹smart-content-checker 创建主插件文件smart-content-checker.php: <?php /** * Plugin Name: 智能内容相似度检测与防抄袭系统 * Plugin URI: https://yourwebsite.com/ * Description: 为WordPress网站提供智能化的内容相似度检测与防抄袭功能 * Version: 1.0.0 * Author: 您的名称 * License: GPL v2 or later * Text Domain: smart-content-checker */ // 防止直接访问 if (!defined('ABSPATH')) { exit; } // 定义插件常量 define('SCC_VERSION', '1.0.0'); define('SCC_PLUGIN_DIR', plugin_dir_path(__FILE__)); define('SCC_PLUGIN_URL', plugin_dir_url(__FILE__)); define('SCC_CACHE_TIME', 3600); // 缓存时间1小时 // 初始化插件 require_once SCC_PLUGIN_DIR . 'includes/class-core.php'; require_once SCC_PLUGIN_DIR . 'includes/class-text-processor.php'; require_once SCC_PLUGIN_DIR . 'includes/class-similarity-checker.php'; require_once SCC_PLUGIN_DIR . 'includes/class-database.php'; require_once SCC_PLUGIN_DIR . 'includes/class-admin-interface.php'; // 启动插件 function scc_init_plugin() { $core = new SCC_Core(); $core->init(); } add_action('plugins_loaded', 'scc_init_plugin');
-
- 创建includes/class-text-processor.php文件: <?php class SCC_Text_Processor { /** * 文本清洗和标准化 */ public function clean_text($text) { // 移除HTML标签 $text = strip_tags($text); // 转换所有字符为小写 $text = mb_strtolower($text, 'UTF-8'); // 移除特殊字符和标点符号,保留中文、英文和数字 $text = preg_replace('/[^p{L}p{N}s]/u', ' ', $text); // 移除多余空格 $text = preg_replace('/s+/', ' ', $text); return trim($text); } /** * 中文文本分词 * 注意:需要服务器安装中文分词扩展,这里提供简单实现 */ public function chinese_segmentation($text) { // 如果服务器安装了scws或jieba分词,可以调用相关函数 // 这里提供一个简单的按字符分割的方法(适用于基础需求) if (function_exists('scws_new')) { // 使用scws分词 $so = scws_new(); $so->set_charset('utf8'); $so->send_text($text); $words = array(); while ($tmp = $so->get_result()) { foreach ($tmp as $word) { if (strlen($word['word']) > 1) { $words[] = $word['word']; } } } $so->close(); return $words; } else { // 简单分词:按空格和标点分割 return preg_split('/s+/', $text); } } /** * 提取文本特征(词频向量) */ public function extract_features($text, $max_features = 100) { $cleaned_text = $this->clean_text($text); // 分词 $words = $this->chinese_segmentation($cleaned_text); // 计算词频 $word_freq = array_count_values($words); // 移除停用词 $word_freq = $this->remove_stopwords($word_freq); // 按词频排序并取前N个特征 arsort($word_freq); $features = array_slice($word_freq, 0, $max_features, true); return $features; } /** * 移除停用词 */ private function remove_stopwords($word_freq) { // 中文停用词列表(部分示例) $stopwords = array( '的', '了', '在', '是', '我', '有', '和', '就', '不', '人', '都', '一', '一个', '上', '也', '很', '到', '说', '要', '去', '你', '会', '着', '没有', '看', '好', '自己', '这', '那', '他', '她', '它' ); // 英文停用词 $english_stopwords = array( 'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing' ); $all_stopwords = array_merge($stopwords, $english_stopwords); foreach ($all_stopwords as $stopword) { if (isset($word_freq[$stopword])) { unset($word_freq[$stopword]); } } return $word_freq; } /** * 
生成文本指纹(Simhash算法简化版) */ public function generate_simhash($text) { $features = $this->extract_features($text, 64); $vector = array_fill(0, 64, 0); foreach ($features as $word => $weight) { $hash = crc32($word); for ($i = 0; $i < 64; $i++) { $bit = ($hash >> $i) & 1; if ($bit == 1) { $vector[$i] += $weight; } else { $vector[$i] -= $weight; } } } // 生成64位指纹 $fingerprint = 0; for ($i = 0; $i < 64; $i++) { if ($vector[$i] > 0) { $fingerprint |= (1 << $i); } } return $fingerprint; } }
- 创建includes/class-similarity-checker.php文件: <?php class SCC_Similarity_Checker { private $text_processor; public function __construct() { $this->text_processor = new SCC_Text_Processor(); } /** * 计算余弦相似度 */ public function cosine_similarity($text1, $text2) { $features1 = $this->text_processor->extract_features($text1); $features2 = $this->text_processor->extract_features($text2); // 获取所有特征的并集 $all_features = array_unique(array_merge( array_keys($features1), array_keys($features2) )); // 创建向量 $vector1 = array(); $vector2 = array(); foreach ($all_features as $feature) { $vector1[] = isset($features1[$feature]) ? $features1[$feature] : 0; $vector2[] = isset($features2[$feature]) ? $features2[$feature] : 0; } // 计算点积 $dot_product = 0; for ($i = 0; $i < count($vector1); $i++) { $dot_product += $vector1[$i] * $vector2[$i]; } // 计算模长 $magnitude1 = sqrt(array_sum(array_map(function($x) { return $x * $x; }, $vector1))); $magnitude2 = sqrt(array_sum(array_map(function($x) { return $x * $x; }, $vector2))); // 避免除以零 if ($magnitude1 == 0 || $magnitude2 == 0) { return 0; } return $dot_product / ($magnitude1 * $magnitude2); } /** * 计算Jaccard相似系数 */ public function jaccard_similarity($text1, $text2) { $features1 = $this->text_processor->extract_features($text1); $features2 = $this->text_processor->extract_features($text2); $set1 = array_keys($features1); $set2 = array_keys($features2); $intersection = array_intersect($set1, $set2); $union = array_unique(array_merge($set1, $set2)); if (count($union) == 0) { return 0; } return count($intersection) / count($union); } /** * 计算Simhash海明距离 */ public function simhash_distance($text1, $text2) { $hash1 = $this->text_processor->generate_simhash($text1); $hash2 = $this->text_processor->generate_simhash($text2); // 计算海明距离 $xor = $hash1 ^ $hash2; $distance = 0; while ($xor) { $distance += $xor & 1; $xor >>= 1; } return $distance; } /** * 综合相似度评估 */ public function comprehensive_similarity($text1, $text2) { $cosine = 
$this->cosine_similarity($text1, $text2); $jaccard = $this->jaccard_similarity($text1, $text2); $simhash_distance = $this->simhash_distance($text1, $text2); // 将Simhash距离转换为相似度(距离越小,相似度越高) $simhash_similarity = max(0, 1 - ($simhash_distance / 64)); // 加权平均 $weights = array( 'cosine' => 0.5, 'jaccard' => 0.3, 'simhash' => 0.2 ); $similarity = ($cosine * $weights['cosine']) + ($jaccard * $weights['jaccard']) + ($simhash_similarity * $weights['simhash']); return round($similarity, 4); } /** * 与外部API集成进行深度检测 */ public function external_api_check($text, $api_type = 'copyscape') { // 这里以Copyscape API为例 $api_key = get_option('scc_copyscape_api_key', ''); if (empty($api_key)) { return array( 'success' => false, 'message' => 'API密钥未配置' ); } $encoded_text = urlencode($text); $url = "https://www.copyscape.com/api/?o=search&k={$api_key}&t={$encoded_text}&f=xml"; $response = wp_remote_get($url, array( 'timeout' => 30, 'sslverify' => false )); if (is_wp_error($response)) { return array( 'success' => false, 'message' => $response->get_error_message() ); } $body = wp_remote_retrieve_body($response); // 解析XML响应 $xml = simplexml_load_string($body); if (!$xml) { return array( 'success' => false, 'message' => 'API响应解析失败' ); } $results = array(); if (isset($xml->result)) { foreach ($xml->result as $result) { $results[] = array( 'url' => (string)$result->url, 'title' => (string)$result->title, 'similarity' => (float)$result->minwordsmatched / 100 ); } } return array( 'success' => true, 'results' => $results ); } }
-
- 创建includes/class-core.php文件: <?php class SCC_Core { private $db; private $text_processor; private $similarity_checker; public function __construct() { $this->db = new SCC_Database(); $this->text_processor = new SCC_Text_Processor(); $this->similarity_checker = new SCC_Similarity_Checker(); } /** * 初始化插件 */ public function init() { // 创建数据库表 register_activation_hook(__FILE__, array($this->db, 'create_tables')); // 添加WordPress钩子 add_action('save_post', array($this, 'on_post_save'), 10, 3); add_action('publish_post', array($this, 'on_post_publish'), 10, 2); // 添加管理菜单 add_action('admin_menu', array($this, 'add_admin_menu')); // 添加AJAX处理 add_action('wp_ajax_scc_manual_scan', array($this, 'ajax_manual_scan')); add_action('wp_ajax_scc_bulk_scan', array($this, 'ajax_bulk_scan')); // 添加文章列表列 add_filter('manage_posts_columns', array($this, 'add_post_columns')); add_action('manage_posts_custom_column', array($this, 'render_post_columns'), 10, 2); // 添加文章编辑页面元框 add_action('add_meta_boxes', array($this, 'add_meta_boxes')); } /** * 文章保存时触发 */ public function on_post_save($post_id, $post, $update) { // 跳过自动保存和修订 if (defined('DOING_AUTOSAVE') && DOING_AUTOSAVE) { return; } if (wp_is_post_revision($post_id)) { return; } // 只处理特定文章类型 $allowed_types = array('post', 'page'); if (!in_array($post->post_type, $allowed_types)) { return; } // 获取文章内容 $content = $post->post_content; // 生成内容哈希 $content_hash = md5($content); // 生成指纹 $fingerprint_64 = $this->text_processor->generate_simhash($content); // 保存指纹 $this->db->save_fingerprint($post_id, $fingerprint_64, $content_hash); // 如果是更新操作,检查与历史版本的相似度 if ($update) { $this->check_self_similarity($post_id, $content); } } /** * 文章发布时触发 */ public function on_post_publish($post_id, $post) { // 执行相似度检测 $this->perform_similarity_check($post_id, $post->post_content); } /** * 执行相似度检测 */ private function perform_similarity_check($post_id, $content) { // 1. 
内部相似度检测 $fingerprint_64 = $this->text_processor->generate_simhash($content); $similar_posts = $this->db->find_similar_content($fingerprint_64, 5, $post_id); $internal_similarity = 0; $matched_posts = array(); if (!empty($similar_posts)) { foreach ($similar_posts as $similar_post) { $similar_post_content = get_post_field('post_content', $similar_post['post_id']); $similarity = $this->similarity_checker->comprehensive_similarity($content, $similar_post_content); if ($similarity > $internal_similarity) { $internal_similarity = $similarity; } if ($similarity > 0.3) { $matched_posts[] = array( 'post_id' => $similar_post['post_id'], 'title' => $similar_post['post_title'], 'similarity' => $similarity, 'url' => get_permalink($similar_post['post_id']) ); } } } // 2. 外部API检测(可选) $external_results = array(); if (get_option('scc_enable_external_check', false)) { $external_check = $this->similarity_checker->external_api_check($content); if ($external_check['success']) { $external_results = $external_check['results']; } } // 计算综合相似度 $total_similarity = $internal_similarity; if (!empty($external_results)) { $external_similarity = max(array_column($external_results, 'similarity')); $total_similarity = max($total_similarity, $external_similarity); } // 保存结果 $this->db->save_scan_result( $post_id, 'auto', $total_similarity, array_merge($matched_posts, $external_results), array( 'internal_similarity' => $internal_similarity, 'matched_posts' => $matched_posts, 'external_results' => $external_results ) ); // 发送通知 if ($total_similarity > get_option('scc_notification_threshold', 0.7)) { $this->send_notification($post_id, $total_similarity); } return $total_similarity; } /** * 检查与历史版本的相似度 */ private function check_self_similarity($post_id, $current_content) { $revisions = wp_get_post_revisions($post_id); if (empty($revisions)) { return; } // 获取最新修订版 $latest_revision = reset($revisions); $revision_content = $latest_revision->post_content; $similarity = 
$this->similarity_checker->comprehensive_similarity($current_content, $revision_content); // 如果相似度低于阈值,记录重大修改 if ($similarity < 0.5) { $this->db->save_scan_result( $post_id, 'revision_check', $similarity, array(), array( 'message' => '检测到文章内容发生重大修改', 'revision_id' => $latest_revision->ID, 'similarity_with_revision' => $similarity ) ); } } /** * 发送通知 */ private function send_notification($post_id, $similarity) { $post = get_post($post_id); $author = get_userdata($post->post_author); $admin_email = get_option('admin_email'); $subject = sprintf('【内容相似度警报】文章 "%s" 检测到高相似度内容', $post->post_title); $message = sprintf( "文章标题:%sn" . "文章ID:%dn" . "作者:%sn" . "检测相似度:%.2f%%n" . "文章链接:%sn" . "编辑链接:%snn" . "请及时审核该文章内容。", $post->post_title, $post_id, $author->display_name, $similarity * 100, get_permalink($post_id), admin_url('post.php?post=' . $post_id . '&action=edit') ); // 发送给管理员 wp_mail($admin_email, $subject, $message); // 如果设置了作者通知,也发送给作者 if (get_option('scc_notify_author', false)) { wp_mail($author->user_email, $subject, $message); } } }
- 创建includes/class-admin-interface.php文件: <?php class SCC_Admin_Interface { private $db; public function __construct() { $this->db = new SCC_Database(); } /** * 添加管理菜单 */ public function add_admin_menu() { // 主菜单 add_menu_page( '内容相似度检测', '内容检测', 'manage_options', 'scc-dashboard', array($this, 'render_dashboard_page'), 'dashicons-search', 30 ); // 子菜单 add_submenu_page( 'scc-dashboard', '批量检测', '批量检测', 'manage_options', 'scc-bulk-scan', array($this, 'render_bulk_scan_page') ); add_submenu_page( 'scc-dashboard', '检测设置', '设置', 'manage_options', 'scc-settings', array($this, 'render_settings_page') ); add_submenu_page( 'scc-dashboard', '检测报告', '报告统计', 'manage_options', 'scc-reports', array($this, 'render_reports_page') ); } /** * 渲染仪表盘页面 */ public function render_dashboard_page() { ?> <div class="wrap scc-dashboard"> <h1><?php echo esc_html(get_admin_page_title()); ?></h1> <div class="scc-stats-container"> <div class="scc-stat-card"> <h3>今日检测</h3> <div class="stat-number"><?php echo $this->get_today_scan_count(); ?></div> </div> <div class="scc-stat-card"> <h3>高风险内容</h3> <div class="stat-number" style="color: #dc3232;"><?php echo $this->get_high_risk_count(); ?></div> </div> <div class="scc-stat-card"> <h3>平均相似度</h3> <div class="stat-number"><?php echo $this->get_average_similarity(); ?>%</div> </div> <div class="scc-stat-card"> <h3>已保护文章</h3> <div class="stat-number"><?php echo $this->get_protected_post_count(); ?></div> </div> </div> <div class="scc-quick-actions"> <h2>快速操作</h2> <button class="button button-primary" onclick="sccQuickScan()">快速扫描最新文章</button> <button class="button" onclick="window.location.href='?page=scc-bulk-scan'">批量检测</button> <button class="button" onclick="window.location.href='?page=scc-reports'">查看完整报告</button> </div> <div class="scc-recent-scans"> <h2>最近检测记录</h2> <?php $this->render_recent_scans_table(); ?> </div> <script> function sccQuickScan() { jQuery.post(ajaxurl, { action: 'scc_manual_scan', post_ids: 'recent', nonce: '<?php echo 
wp_create_nonce('scc_manual_scan'); ?>' }, function(response) { if (response.success) { alert('扫描完成!检测到 ' + response.data.high_risk + ' 篇高风险文章'); location.reload(); } else { alert('扫描失败:' + response.data); } }); } </script> <style> .scc-stats-container { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; margin: 20px 0; } .scc-stat-card { background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); text-align: center; } .scc-stat-card h3 { margin-top: 0; color: #666; } .scc-stat-card .stat-number { font-size: 2.5em; font-weight: bold; color: #0073aa; } .scc-quick-actions { background: white; padding: 20px; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); margin: 20px 0; } .scc-quick-actions button { margin-right: 10px; margin-bottom: 10px; } </style> </div> <?php } /** * 渲染批量检测页面 */ public function render_bulk_scan_page() { ?> <div class="wrap"> <h1>批量内容检测</h1> <div class="card"> <h2>选择检测范围</h2> <form id="scc-bulk-scan-form"> <table class="form-table"> <tr> <th scope="row">文章类型</th> <td> <select name="post_type" id="post_type"> <option value="post">文章</option> <option value="page">页面</option> <option value="all">所有内容</option>
在当今信息爆炸的互联网环境中,内容创作已成为网站运营的核心。然而,随着内容数量的急剧增加,抄袭和内容重复问题也日益严重。对于WordPress网站管理员和内容创作者而言,保护原创内容不仅是维护品牌声誉的需要,更是提升搜索引擎排名、吸引忠实读者的关键。
传统的防抄袭方法主要依赖人工检查或基础文本比对,效率低下且难以应对海量内容。本教程将指导您通过WordPress代码二次开发,集成智能化的内容相似度检测与防抄袭系统,将您的网站提升到一个新的智能化水平。
在开始开发之前,我们需要明确系统应具备的核心功能:
- 实时内容检测:在文章发布时自动检测内容相似度
- 批量历史内容扫描:对网站现有内容进行全面检查
- 智能相似度算法:采用先进的文本相似度计算方法
- 外部资源比对:能够与互联网上的公开内容进行比对
- 可视化报告系统:直观展示检测结果和相似度分析
- 自动化处理机制:根据预设规则自动处理疑似抄袭内容
我们将采用以下技术方案:
- WordPress钩子机制:利用`save_post`、`publish_post`等动作钩子实现自动化检测
- PHP文本处理库:使用PHP内置函数和扩展进行文本预处理
- 相似度算法:实现余弦相似度、Jaccard相似系数和Simhash海明距离算法
- 外部API集成:通过第三方原创检测API增强检测能力
- 数据库优化:合理设计数据表结构,确保系统性能
- 前端展示:使用AJAX和Chart.js实现交互式报告界面
用户发布内容 → WordPress钩子触发 → 文本预处理 → 特征提取 →
相似度计算 → 结果评估 → 数据库存储 → 报告生成 → 用户通知
用户发布内容 → WordPress钩子触发 → 文本预处理 → 特征提取 →
相似度计算 → 结果评估 → 数据库存储 → 报告生成 → 用户通知
首先,确保您的开发环境满足以下要求:
- WordPress 5.0或更高版本
- PHP 7.3或更高版本(支持mbstring、curl扩展)
- MySQL 5.6或更高版本
- 至少100MB的可用磁盘空间(用于存储文本指纹和缓存)
我们将创建一个独立的WordPress插件来实现所有功能:
- 在`wp-content/plugins/`目录下创建新文件夹`smart-content-checker`
- 创建主插件文件`smart-content-checker.php`:
<?php
/**
* Plugin Name: 智能内容相似度检测与防抄袭系统
* Plugin URI: https://yourwebsite.com/
* Description: 为WordPress网站提供智能化的内容相似度检测与防抄袭功能
* Version: 1.0.0
* Author: 您的名称
* License: GPL v2 or later
* Text Domain: smart-content-checker
*/
// Abort when this file is requested directly instead of being loaded by WordPress.
if (!defined('ABSPATH')) {
    exit;
}

// Plugin-wide constants.
define('SCC_VERSION', '1.0.0');
define('SCC_PLUGIN_DIR', plugin_dir_path(__FILE__));
define('SCC_PLUGIN_URL', plugin_dir_url(__FILE__));
define('SCC_CACHE_TIME', 3600); // Cache lifetime in seconds (1 hour).

// Load the plugin classes.
require_once SCC_PLUGIN_DIR . 'includes/class-core.php';
require_once SCC_PLUGIN_DIR . 'includes/class-text-processor.php';
require_once SCC_PLUGIN_DIR . 'includes/class-similarity-checker.php';
require_once SCC_PLUGIN_DIR . 'includes/class-database.php';
require_once SCC_PLUGIN_DIR . 'includes/class-admin-interface.php';

/**
 * Activation callback: creates the plugin's database tables.
 *
 * The hook has to be registered here, at load time of the main plugin file,
 * because register_activation_hook() derives the hook name from the file path
 * it is given; registering it from an include file (or only later, on
 * `plugins_loaded`) means the callback never runs on activation.
 */
function scc_activate_plugin() {
    $database = new SCC_Database();
    $database->create_tables();
}
register_activation_hook(__FILE__, 'scc_activate_plugin');

/**
 * Boots the plugin once all plugins have been loaded.
 */
function scc_init_plugin() {
    $core = new SCC_Core();
    $core->init();
}
add_action('plugins_loaded', 'scc_init_plugin');
创建includes/class-text-processor.php文件:
<?php
class SCC_Text_Processor {

    /**
     * Cleans and normalises raw content before tokenisation.
     *
     * Strips HTML tags, lower-cases the text, replaces every character that
     * is not a Unicode letter, a Unicode digit or whitespace with a space,
     * collapses whitespace runs and trims the result.
     *
     * @param string $text Raw (possibly HTML) content.
     * @return string Normalised text; '' when the input is empty or is not
     *                valid UTF-8 (PCRE returns null in that case).
     */
    public function clean_text($text) {
        $text = strip_tags((string) $text);
        $text = mb_strtolower($text, 'UTF-8');

        // \p{L} = any letter (covers CJK), \p{N} = any number. The escapes are
        // essential: without the backslashes the class matches the literal
        // characters "p{L}Ns" and everything else is wiped out.
        $text = preg_replace('/[^\p{L}\p{N}\s]/u', ' ', $text);
        if ($text === null) {
            return '';
        }

        $text = preg_replace('/\s+/u', ' ', $text);
        if ($text === null) {
            return '';
        }

        return trim($text);
    }

    /**
     * Splits cleaned text into tokens.
     *
     * Uses the SCWS extension when `scws_new()` is available; otherwise
     * falls back to splitting on whitespace.
     *
     * NOTE(review): the whitespace fallback does not segment Chinese text
     * that contains no spaces - each space-delimited run becomes one token.
     * NOTE(review): `strlen()` counts bytes, so the "> 1" filter only drops
     * single-byte tokens; multi-byte single characters pass - confirm intent.
     *
     * @param string $text Text already normalised by clean_text().
     * @return array List of tokens (never contains empty strings in the
     *               fallback branch).
     */
    public function chinese_segmentation($text) {
        if (function_exists('scws_new')) {
            $so = scws_new();
            $so->set_charset('utf8');
            $so->send_text($text);

            $words = array();
            while ($tmp = $so->get_result()) {
                foreach ($tmp as $word) {
                    if (strlen($word['word']) > 1) {
                        $words[] = $word['word'];
                    }
                }
            }
            $so->close();

            return $words;
        }

        // PREG_SPLIT_NO_EMPTY keeps '' out of the token list, so empty input
        // yields no features instead of a bogus '' feature.
        $tokens = preg_split('/\s+/u', (string) $text, -1, PREG_SPLIT_NO_EMPTY);

        return $tokens === false ? array() : $tokens;
    }

    /**
     * Builds a term-frequency map for the text.
     *
     * @param string $text         Raw content.
     * @param int    $max_features Maximum number of terms to keep.
     * @return array Map of term => occurrence count, stop words removed,
     *               sorted by count descending and truncated to
     *               $max_features entries.
     */
    public function extract_features($text, $max_features = 100) {
        $cleaned_text = $this->clean_text($text);
        $words = $this->chinese_segmentation($cleaned_text);

        $word_freq = array_count_values($words);
        $word_freq = $this->remove_stopwords($word_freq);

        arsort($word_freq);

        return array_slice($word_freq, 0, $max_features, true);
    }

    /**
     * Removes Chinese and English stop words from a term-frequency map.
     *
     * @param array $word_freq Map of term => count.
     * @return array The map without stop-word keys.
     */
    private function remove_stopwords($word_freq) {
        // Chinese stop words (partial sample list).
        $stopwords = array(
            '的', '了', '在', '是', '我', '有', '和', '就',
            '不', '人', '都', '一', '一个', '上', '也', '很',
            '到', '说', '要', '去', '你', '会', '着', '没有',
            '看', '好', '自己', '这', '那', '他', '她', '它'
        );

        // English stop words.
        $english_stopwords = array(
            'a', 'an', 'the', 'and', 'or', 'but', 'in', 'on',
            'at', 'to', 'for', 'of', 'with', 'by', 'is', 'are',
            'was', 'were', 'be', 'been', 'being', 'have', 'has',
            'had', 'having', 'do', 'does', 'did', 'doing'
        );

        $all_stopwords = array_merge($stopwords, $english_stopwords);

        // One key-based difference instead of an isset/unset loop.
        return array_diff_key($word_freq, array_flip($all_stopwords));
    }

    /**
     * Generates a simplified Simhash fingerprint for the text.
     *
     * Each of the top 64 features votes on every bit position with its
     * frequency as weight; a bit is set when the weighted vote is positive.
     *
     * NOTE(review): the per-word hash is crc32(), which provides 32 bits, so
     * on 64-bit PHP bit positions 32-63 always receive negative votes and
     * stay 0 - the fingerprint effectively carries 32 bits of information.
     *
     * @param string $text Raw content.
     * @return int Fingerprint; 0 when the text yields no features.
     */
    public function generate_simhash($text) {
        $features = $this->extract_features($text, 64);
        $vector = array_fill(0, 64, 0);

        foreach ($features as $word => $weight) {
            // Array keys that look numeric come back as ints; crc32 wants a string.
            $hash = crc32((string) $word);
            for ($i = 0; $i < 64; $i++) {
                $bit = ($hash >> $i) & 1;
                if ($bit == 1) {
                    $vector[$i] += $weight;
                } else {
                    $vector[$i] -= $weight;
                }
            }
        }

        $fingerprint = 0;
        for ($i = 0; $i < 64; $i++) {
            if ($vector[$i] > 0) {
                $fingerprint |= (1 << $i);
            }
        }

        return $fingerprint;
    }
}
创建includes/class-similarity-checker.php文件:
<?php
class SCC_Similarity_Checker {

    /** @var SCC_Text_Processor Tokeniser / feature extractor used by every metric. */
    private $text_processor;

    public function __construct() {
        $this->text_processor = new SCC_Text_Processor();
    }

    /**
     * Cosine similarity between the term-frequency vectors of two texts.
     *
     * @param string $text1
     * @param string $text2
     * @return float|int Value in [0, 1]; 0 when either text has no features.
     */
    public function cosine_similarity($text1, $text2) {
        $features1 = $this->text_processor->extract_features($text1);
        $features2 = $this->text_processor->extract_features($text2);

        // Only terms present in both maps contribute to the dot product, so
        // there is no need to materialise the union vectors.
        $dot_product = 0;
        foreach ($features1 as $feature => $weight) {
            if (isset($features2[$feature])) {
                $dot_product += $weight * $features2[$feature];
            }
        }

        $magnitude1 = sqrt($this->sum_of_squares($features1));
        $magnitude2 = sqrt($this->sum_of_squares($features2));

        // Avoid division by zero for texts without features.
        if ($magnitude1 == 0 || $magnitude2 == 0) {
            return 0;
        }

        return $dot_product / ($magnitude1 * $magnitude2);
    }

    /**
     * Jaccard coefficient of the two texts' feature sets.
     *
     * @param string $text1
     * @param string $text2
     * @return float|int |A ∩ B| / |A ∪ B|; 0 when both sets are empty.
     */
    public function jaccard_similarity($text1, $text2) {
        $features1 = $this->text_processor->extract_features($text1);
        $features2 = $this->text_processor->extract_features($text2);

        // The features are already keyed by term, so key-based set
        // operations give intersection and union directly.
        $union_size = count($features1 + $features2);
        if ($union_size == 0) {
            return 0;
        }

        return count(array_intersect_key($features1, $features2)) / $union_size;
    }

    /**
     * Hamming distance between the Simhash fingerprints of two texts.
     *
     * @param string $text1
     * @param string $text2
     * @return int Number of differing bits.
     */
    public function simhash_distance($text1, $text2) {
        $hash1 = $this->text_processor->generate_simhash($text1);
        $hash2 = $this->text_processor->generate_simhash($text2);

        $xor = $hash1 ^ $hash2;
        $distance = 0;
        while ($xor !== 0) {
            $distance += $xor & 1;
            // PHP's >> is an arithmetic shift: a negative value would shift
            // towards -1 and never reach 0. Masking off the sign bit turns it
            // into a logical shift so the loop always terminates.
            $xor = ($xor >> 1) & PHP_INT_MAX;
        }

        return $distance;
    }

    /**
     * Weighted blend of cosine (0.5), Jaccard (0.3) and Simhash (0.2) similarity.
     *
     * @param string $text1
     * @param string $text2
     * @return float Score rounded to 4 decimals.
     */
    public function comprehensive_similarity($text1, $text2) {
        $cosine = $this->cosine_similarity($text1, $text2);
        $jaccard = $this->jaccard_similarity($text1, $text2);
        $simhash_distance = $this->simhash_distance($text1, $text2);

        // Convert the distance into a similarity (smaller distance = more similar).
        $simhash_similarity = max(0, 1 - ($simhash_distance / 64));

        $weights = array(
            'cosine' => 0.5,
            'jaccard' => 0.3,
            'simhash' => 0.2
        );

        $similarity = ($cosine * $weights['cosine']) +
            ($jaccard * $weights['jaccard']) +
            ($simhash_similarity * $weights['simhash']);

        return round($similarity, 4);
    }

    /**
     * Checks the text against an external originality API (Copyscape).
     *
     * NOTE(review): the request parameters and the `minwordsmatched / 100`
     * scaling are carried over unchanged; verify them against the provider's
     * current API documentation. The whole text travels in the query string,
     * which may exceed URL length limits for long posts.
     *
     * @param string $text     Content to check.
     * @param string $api_type Currently unused; only Copyscape is implemented.
     * @return array On failure: ['success' => false, 'message' => string].
     *               On success: ['success' => true, 'results' => list of
     *               ['url', 'title', 'similarity']].
     */
    public function external_api_check($text, $api_type = 'copyscape') {
        $api_key = get_option('scc_copyscape_api_key', '');
        if (empty($api_key)) {
            return array(
                'success' => false,
                'message' => 'API密钥未配置'
            );
        }

        // Encode the key as well as the text so reserved characters in either
        // cannot corrupt the query string.
        $encoded_key = rawurlencode($api_key);
        $encoded_text = urlencode($text);
        $url = "https://www.copyscape.com/api/?o=search&k={$encoded_key}&t={$encoded_text}&f=xml";

        // TLS certificate verification stays at its default (enabled): the
        // request carries the API key and must not be open to interception.
        $response = wp_remote_get($url, array(
            'timeout' => 30
        ));

        if (is_wp_error($response)) {
            return array(
                'success' => false,
                'message' => $response->get_error_message()
            );
        }

        $body = wp_remote_retrieve_body($response);

        // Collect libxml errors internally so a malformed or empty body does
        // not emit PHP warnings; the previous setting is restored afterwards.
        $previous_setting = libxml_use_internal_errors(true);
        $xml = simplexml_load_string($body);
        libxml_clear_errors();
        libxml_use_internal_errors($previous_setting);

        // Compare against false explicitly: a successfully parsed but empty
        // element is also falsy and must not be reported as a parse failure.
        if ($xml === false) {
            return array(
                'success' => false,
                'message' => 'API响应解析失败'
            );
        }

        $results = array();
        if (isset($xml->result)) {
            foreach ($xml->result as $result) {
                $results[] = array(
                    'url' => (string)$result->url,
                    'title' => (string)$result->title,
                    'similarity' => (float)$result->minwordsmatched / 100
                );
            }
        }

        return array(
            'success' => true,
            'results' => $results
        );
    }

    /**
     * Sum of squared weights of a feature map (squared vector magnitude).
     *
     * @param array $features Map of term => weight.
     * @return int|float
     */
    private function sum_of_squares($features) {
        $total = 0;
        foreach ($features as $weight) {
            $total += $weight * $weight;
        }

        return $total;
    }
}
创建includes/class-database.php文件:
<?php
class SCC_Database {

    /**
     * Creates (or upgrades) the plugin's two tables via dbDelta().
     *
     * The statements follow dbDelta's formatting rules: plain `CREATE TABLE`
     * (no `IF NOT EXISTS`, which keeps dbDelta from recognising the table
     * name and diffing an existing table), one column per line, and two
     * spaces after `PRIMARY KEY`.
     */
    public function create_tables() {
        global $wpdb;

        $charset_collate = $wpdb->get_charset_collate();
        $table_name = $wpdb->prefix . 'scc_content_fingerprints';
        $results_table = $wpdb->prefix . 'scc_scan_results';

        // Content fingerprint table.
        $sql1 = "CREATE TABLE $table_name (
            id bigint(20) NOT NULL AUTO_INCREMENT,
            post_id bigint(20) NOT NULL,
            fingerprint_64 bigint(20) unsigned NOT NULL,
            fingerprint_128 varchar(255) DEFAULT NULL,
            content_hash varchar(64) NOT NULL,
            created_at datetime DEFAULT CURRENT_TIMESTAMP,
            updated_at datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP,
            PRIMARY KEY  (id),
            KEY post_id (post_id),
            KEY fingerprint_64 (fingerprint_64),
            KEY content_hash (content_hash)
        ) $charset_collate;";

        // Scan result table.
        $sql2 = "CREATE TABLE $results_table (
            id bigint(20) NOT NULL AUTO_INCREMENT,
            post_id bigint(20) NOT NULL,
            scan_type varchar(50) NOT NULL,
            similarity_score float NOT NULL,
            matched_urls text,
            details text,
            scan_date datetime DEFAULT CURRENT_TIMESTAMP,
            status varchar(20) DEFAULT 'pending',
            PRIMARY KEY  (id),
            KEY post_id (post_id),
            KEY scan_date (scan_date),
            KEY status (status)
        ) $charset_collate;";

        require_once(ABSPATH . 'wp-admin/includes/upgrade.php');
        dbDelta($sql1);
        dbDelta($sql2);
    }

    /**
     * Inserts or updates the fingerprint row of a post.
     *
     * @param int         $post_id
     * @param int         $fingerprint_64  64-bit Simhash fingerprint.
     * @param string      $content_hash    Hash of the raw content.
     * @param string|null $fingerprint_128 Optional extended fingerprint.
     * @return int Row id: the existing row's id on update, the new
     *             auto-increment id on insert.
     */
    public function save_fingerprint($post_id, $fingerprint_64, $content_hash, $fingerprint_128 = null) {
        global $wpdb;

        $table_name = $wpdb->prefix . 'scc_content_fingerprints';

        // Is there already a row for this post?
        $existing = $wpdb->get_var($wpdb->prepare(
            "SELECT id FROM $table_name WHERE post_id = %d",
            $post_id
        ));

        if ($existing) {
            $wpdb->update(
                $table_name,
                array(
                    'fingerprint_64' => $fingerprint_64,
                    'fingerprint_128' => $fingerprint_128,
                    'content_hash' => $content_hash,
                    'updated_at' => current_time('mysql')
                ),
                // WHERE clause: the row belonging to this post.
                array('post_id' => $post_id),
                array('%d', '%s', '%s', '%s'),
                array('%d')
            );

            // $wpdb->insert_id is only meaningful after an INSERT; report the
            // id of the row that was updated instead.
            return (int) $existing;
        }

        $wpdb->insert(
            $table_name,
            array(
                'post_id' => $post_id,
                'fingerprint_64' => $fingerprint_64,
                'fingerprint_128' => $fingerprint_128,
                'content_hash' => $content_hash
            ),
            array('%d', '%d', '%s', '%s')
        );

        return $wpdb->insert_id;
    }

    /**
     * Finds published posts whose fingerprint is within a Hamming distance
     * of the given fingerprint.
     *
     * @param int $fingerprint_64  Fingerprint to compare against.
     * @param int $threshold       Maximum Hamming distance (inclusive).
     * @param int $exclude_post_id Post id to leave out (usually the post itself).
     * @return array Up to 10 rows (ARRAY_A) with post_id, hamming_distance,
     *               post_title and post_date, closest first.
     */
    public function find_similar_content($fingerprint_64, $threshold = 5, $exclude_post_id = 0) {
        global $wpdb;

        $table_name = $wpdb->prefix . 'scc_content_fingerprints';

        // BIT_COUNT(a ^ b) is the Hamming distance between two fingerprints.
        $query = $wpdb->prepare(
            "SELECT p1.post_id,
                BIT_COUNT(p1.fingerprint_64 ^ %d) as hamming_distance,
                p.post_title,
                p.post_date
            FROM $table_name p1
            INNER JOIN {$wpdb->posts} p ON p1.post_id = p.ID
            WHERE BIT_COUNT(p1.fingerprint_64 ^ %d) <= %d
            AND p1.post_id != %d
            AND p.post_status = 'publish'
            ORDER BY hamming_distance ASC
            LIMIT 10",
            $fingerprint_64,
            $fingerprint_64,
            $threshold,
            $exclude_post_id
        );

        return $wpdb->get_results($query, ARRAY_A);
    }

    /**
     * Stores one scan result.
     *
     * The status is derived from the score: > 0.7 'high_risk',
     * > 0.3 'medium_risk', otherwise 'low_risk'.
     *
     * @param int          $post_id
     * @param string       $scan_type        e.g. 'auto' or 'revision_check'.
     * @param float        $similarity_score
     * @param array|string $matched_urls     Arrays are stored as JSON.
     * @param array|string $details          Arrays are stored as JSON.
     * @return int Id of the inserted row.
     */
    public function save_scan_result($post_id, $scan_type, $similarity_score, $matched_urls = '', $details = '') {
        global $wpdb;

        $table_name = $wpdb->prefix . 'scc_scan_results';

        if ($similarity_score > 0.7) {
            $status = 'high_risk';
        } elseif ($similarity_score > 0.3) {
            $status = 'medium_risk';
        } else {
            $status = 'low_risk';
        }

        $wpdb->insert(
            $table_name,
            array(
                'post_id' => $post_id,
                'scan_type' => $scan_type,
                'similarity_score' => $similarity_score,
                'matched_urls' => is_array($matched_urls) ? json_encode($matched_urls) : $matched_urls,
                'details' => is_array($details) ? json_encode($details) : $details,
                'status' => $status
            ),
            array('%d', '%s', '%f', '%s', '%s', '%s')
        );

        return $wpdb->insert_id;
    }

    /**
     * Returns the most recent scan results of a post.
     *
     * @param int $post_id
     * @param int $limit Maximum number of rows.
     * @return array Rows (ARRAY_A), newest first.
     */
    public function get_scan_history($post_id, $limit = 10) {
        global $wpdb;

        $table_name = $wpdb->prefix . 'scc_scan_results';

        return $wpdb->get_results($wpdb->prepare(
            "SELECT * FROM $table_name
            WHERE post_id = %d
            ORDER BY scan_date DESC
            LIMIT %d",
            $post_id,
            $limit
        ), ARRAY_A);
    }

    /**
     * Returns per-day counts of high/medium/low risk scans.
     *
     * @param int $days Size of the look-back window in days.
     * @return array Rows (ARRAY_A) with high_risk_count, medium_risk_count,
     *               low_risk_count and scan_date, newest day first.
     */
    public function get_risk_statistics($days = 30) {
        global $wpdb;

        $table_name = $wpdb->prefix . 'scc_scan_results';

        $query = $wpdb->prepare(
            "SELECT
                COUNT(CASE WHEN status = 'high_risk' THEN 1 END) as high_risk_count,
                COUNT(CASE WHEN status = 'medium_risk' THEN 1 END) as medium_risk_count,
                COUNT(CASE WHEN status = 'low_risk' THEN 1 END) as low_risk_count,
                DATE(scan_date) as scan_date
            FROM $table_name
            WHERE scan_date >= DATE_SUB(NOW(), INTERVAL %d DAY)
            GROUP BY DATE(scan_date)
            ORDER BY scan_date DESC",
            $days
        );

        return $wpdb->get_results($query, ARRAY_A);
    }
}
创建includes/class-core.php文件:
<?php
class SCC_Core {
private $db;
private $text_processor;
private $similarity_checker;
public function __construct() {
$this->db = new SCC_Database();
$this->text_processor = new SCC_Text_Processor();
$this->similarity_checker = new SCC_Similarity_Checker();
}
/**
* 初始化插件
*/
public function init() {
// 创建数据库表
register_activation_hook(__FILE__, array($this->db, 'create_tables'));
// 添加WordPress钩子
add_action('save_post', array($this, 'on_post_save'), 10, 3);
add_action('publish_post', array($this, 'on_post_publish'), 10, 2);
// 添加管理菜单
add_action('admin_menu', array($this, 'add_admin_menu'));
// 添加AJAX处理
add_action('wp_ajax_scc_manual_scan', array($this, 'ajax_manual_scan'));
add_action('wp_ajax_scc_bulk_scan', array($this, 'ajax_bulk_scan'));
// 添加文章列表列
add_filter('manage_posts_columns', array($this, 'add_post_columns'));
add_action('manage_posts_custom_column', array($this, 'render_post_columns'), 10, 2);
// 添加文章编辑页面元框
add_action('add_meta_boxes', array($this, 'add_meta_boxes'));
}
/**
* 文章保存时触发
*/
public function on_post_save($post_id, $post, $update) {
// 跳过自动保存和修订
if (defined('DOING_AUTOSAVE') && DOING_AUTOSAVE) {
return;
}
if (wp_is_post_revision($post_id)) {
return;
}
// 只处理特定文章类型
$allowed_types = array('post', 'page');
if (!in_array($post->post_type, $allowed_types)) {
return;
}
// 获取文章内容
$content = $post->post_content;
// 生成内容哈希
$content_hash = md5($content);
// 生成指纹
$fingerprint_64 = $this->text_processor->generate_simhash($content);
// 保存指纹
$this->db->save_fingerprint($post_id, $fingerprint_64, $content_hash);
// 如果是更新操作,检查与历史版本的相似度
if ($update) {
$this->check_self_similarity($post_id, $content);
}
}
/**
* 文章发布时触发
*/
public function on_post_publish($post_id, $post) {
// 执行相似度检测
$this->perform_similarity_check($post_id, $post->post_content);
}
/**
* 执行相似度检测
*/
private function perform_similarity_check($post_id, $content) {
// 1. 内部相似度检测
$fingerprint_64 = $this->text_processor->generate_simhash($content);
$similar_posts = $this->db->find_similar_content($fingerprint_64, 5, $post_id);
$internal_similarity = 0;
$matched_posts = array();
if (!empty($similar_posts)) {
foreach ($similar_posts as $similar_post) {
$similar_post_content = get_post_field('post_content', $similar_post['post_id']);
$similarity = $this->similarity_checker->comprehensive_similarity($content, $similar_post_content);
if ($similarity > $internal_similarity) {
$internal_similarity = $similarity;
}
if ($similarity > 0.3) {
$matched_posts[] = array(
'post_id' => $similar_post['post_id'],
'title' => $similar_post['post_title'],
'similarity' => $similarity,
'url' => get_permalink($similar_post['post_id'])
);
}
}
}
// 2. 外部API检测(可选)
$external_results = array();
if (get_option('scc_enable_external_check', false)) {
$external_check = $this->similarity_checker->external_api_check($content);
if ($external_check['success']) {
$external_results = $external_check['results'];
}
}
// 计算综合相似度
$total_similarity = $internal_similarity;
if (!empty($external_results)) {
$external_similarity = max(array_column($external_results, 'similarity'));
$total_similarity = max($total_similarity, $external_similarity);
}
// 保存结果
$this->db->save_scan_result(
$post_id,
'auto',
$total_similarity,
array_merge($matched_posts, $external_results),
array(
'internal_similarity' => $internal_similarity,
'matched_posts' => $matched_posts,
'external_results' => $external_results
)
);
// 发送通知
if ($total_similarity > get_option('scc_notification_threshold', 0.7)) {
$this->send_notification($post_id, $total_similarity);
}
return $total_similarity;
}
/**
 * Compare the current content with a stored revision of the same post.
 *
 * When the similarity score is below 0.5, a 'revision_check' scan result
 * is saved so the major rewrite is on record. Nothing is saved when the
 * post has no revisions or the score is not below 0.5.
 *
 * @param int    $post_id         ID of the post being checked.
 * @param string $current_content Content to compare against the revision.
 * @return void
 */
private function check_self_similarity($post_id, $current_content) {
    $history = wp_get_post_revisions($post_id);
    if (!$history) {
        return;
    }

    // First entry of the returned list; presumably the newest revision
    // under wp_get_post_revisions() default ordering — TODO confirm.
    $newest = reset($history);
    $score = $this->similarity_checker->comprehensive_similarity(
        $current_content,
        $newest->post_content
    );

    // Only a score below the 0.5 threshold counts as a major rewrite.
    if (!($score < 0.5)) {
        return;
    }

    $details = [
        'message' => '检测到文章内容发生重大修改',
        'revision_id' => $newest->ID,
        'similarity_with_revision' => $score,
    ];

    $this->db->save_scan_result($post_id, 'revision_check', $score, [], $details);
}
/**
 * Email a high-similarity alert for a post.
 *
 * The alert always goes to the site admin address (the 'admin_email'
 * option). It is also sent to the post author when the
 * 'scc_notify_author' option is enabled and the author account exists.
 *
 * @param int   $post_id    ID of the post that triggered the alert.
 * @param float $similarity Similarity score as a fraction; it is multiplied
 *                          by 100 for display as a percentage.
 * @return void
 */
private function send_notification($post_id, $similarity) {
    $post = get_post($post_id);
    if (!$post) {
        // The post no longer exists (or the ID is invalid); nothing to report.
        return;
    }

    // get_userdata() returns false for a missing user, so guard before use.
    $author = get_userdata($post->post_author);
    $author_name = $author ? $author->display_name : '未知';

    $admin_email = get_option('admin_email');
    $subject = sprintf('【内容相似度警报】文章 "%s" 检测到高相似度内容', $post->post_title);

    // Double-quoted "\n" escapes produce real line breaks in the plain-text body.
    $message = sprintf(
        "文章标题:%s\n" .
        "文章ID:%d\n" .
        "作者:%s\n" .
        "检测相似度:%.2f%%\n" .
        "文章链接:%s\n" .
        "编辑链接:%s\n\n" .
        "请及时审核该文章内容。",
        $post->post_title,
        $post_id,
        $author_name,
        $similarity * 100,
        get_permalink($post_id),
        admin_url('post.php?post=' . $post_id . '&action=edit')
    );

    // Always notify the administrator.
    wp_mail($admin_email, $subject, $message);

    // Optionally notify the author as well, if the account still exists.
    if ($author && get_option('scc_notify_author', false)) {
        wp_mail($author->user_email, $subject, $message);
    }
}
}
创建includes/class-admin-interface.php文件:
<?php
class SCC_Admin_Interface {
private $db;
/**
 * Create the admin interface together with the database helper it uses.
 */
public function __construct() {
    $database = new SCC_Database();
    $this->db = $database;
}
/**
 * Register the plugin's top-level admin menu and its three sub-pages.
 *
 * Every page requires the 'manage_options' capability and is rendered by
 * the matching render_*_page method of this class.
 *
 * @return void
 */
public function add_admin_menu() {
    $parent_slug = 'scc-dashboard';
    $capability = 'manage_options';

    // Top-level entry (dashboard).
    add_menu_page(
        '内容相似度检测',
        '内容检测',
        $capability,
        $parent_slug,
        [$this, 'render_dashboard_page'],
        'dashicons-search',
        30
    );

    // Sub-pages: page title, menu title, slug, renderer method.
    $submenus = [
        ['批量检测', '批量检测', 'scc-bulk-scan', 'render_bulk_scan_page'],
        ['检测设置', '设置', 'scc-settings', 'render_settings_page'],
        ['检测报告', '报告统计', 'scc-reports', 'render_reports_page'],
    ];

    foreach ($submenus as [$page_title, $menu_title, $menu_slug, $renderer]) {
        add_submenu_page(
            $parent_slug,
            $page_title,
            $menu_title,
            $capability,
            $menu_slug,
            [$this, $renderer]
        );
    }
}
/**
 * Render the plugin dashboard page.
 *
 * Outputs four statistic cards, a row of quick-action buttons, the recent
 * scans table, and the inline script and styles the page needs. The
 * quick-scan button posts the 'scc_manual_scan' AJAX action with
 * post_ids set to 'recent' and a nonce for that action.
 *
 * All dynamic values are escaped at the point of output.
 *
 * @return void
 */
public function render_dashboard_page() {
    ?>
    <div class="wrap scc-dashboard">
        <h1><?php echo esc_html(get_admin_page_title()); ?></h1>
        <div class="scc-stats-container">
            <div class="scc-stat-card">
                <h3>今日检测</h3>
                <div class="stat-number"><?php echo esc_html($this->get_today_scan_count()); ?></div>
            </div>
            <div class="scc-stat-card">
                <h3>高风险内容</h3>
                <div class="stat-number" style="color: #dc3232;"><?php echo esc_html($this->get_high_risk_count()); ?></div>
            </div>
            <div class="scc-stat-card">
                <h3>平均相似度</h3>
                <div class="stat-number"><?php echo esc_html($this->get_average_similarity()); ?>%</div>
            </div>
            <div class="scc-stat-card">
                <h3>已保护文章</h3>
                <div class="stat-number"><?php echo esc_html($this->get_protected_post_count()); ?></div>
            </div>
        </div>
        <div class="scc-quick-actions">
            <h2>快速操作</h2>
            <button class="button button-primary" onclick="sccQuickScan()">快速扫描最新文章</button>
            <button class="button" onclick="window.location.href='?page=scc-bulk-scan'">批量检测</button>
            <button class="button" onclick="window.location.href='?page=scc-reports'">查看完整报告</button>
        </div>
        <div class="scc-recent-scans">
            <h2>最近检测记录</h2>
            <?php $this->render_recent_scans_table(); ?>
        </div>
        <script>
        function sccQuickScan() {
            jQuery.post(ajaxurl, {
                action: 'scc_manual_scan',
                post_ids: 'recent',
                nonce: '<?php echo esc_js(wp_create_nonce('scc_manual_scan')); ?>'
            }, function(response) {
                if (response.success) {
                    alert('扫描完成!检测到 ' + response.data.high_risk + ' 篇高风险文章');
                    location.reload();
                } else {
                    alert('扫描失败:' + response.data);
                }
            }).fail(function() {
                // Transport-level failure (network error, non-2xx status, bad response).
                alert('扫描请求失败,请稍后重试');
            });
        }
        </script>
        <style>
        .scc-stats-container {
            display: grid;
            grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
            gap: 20px;
            margin: 20px 0;
        }
        .scc-stat-card {
            background: white;
            padding: 20px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            text-align: center;
        }
        .scc-stat-card h3 {
            margin-top: 0;
            color: #666;
        }
        .scc-stat-card .stat-number {
            font-size: 2.5em;
            font-weight: bold;
            color: #0073aa;
        }
        .scc-quick-actions {
            background: white;
            padding: 20px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            margin: 20px 0;
        }
        .scc-quick-actions button {
            margin-right: 10px;
            margin-bottom: 10px;
        }
        </style>
    </div>
    <?php
}
/**
* 渲染批量检测页面
*/
public function render_bulk_scan_page() {
?>
<div class="wrap">
<h1>批量内容检测</h1>
<div class="card">
<h2>选择检测范围</h2>
<form id="scc-bulk-scan-form">
<table class="form-table">
<tr>
<th scope="row">文章类型</th>
<td>
<select name="post_type" id="post_type">
<option value="post">文章</option>
<option value="page">页面</option>
<option value="all">所有内容</option>


