-- ============================================
-- sese-engine MySQL initialization script
-- Creates the databases (if they do not exist) and the table schema.
--
-- NOTE(review): this script issues no `USE <database>` statement and the
-- table names below are unqualified, so every CREATE TABLE targets the
-- connection's current default database. Confirm that the caller selects
-- the intended database before running this script.
-- ============================================

-- Create the databases (if they do not exist).
CREATE DATABASE IF NOT EXISTS sese_engine
    DEFAULT CHARSET=utf8mb4
    COLLATE=utf8mb4_unicode_ci;

CREATE DATABASE IF NOT EXISTS sese_test2
    DEFAULT CHARSET=utf8mb4
    COLLATE=utf8mb4_unicode_ci;

-- ============================================
-- Inverted index table
-- Maps each keyword to the URLs it points to, with a weight per pair.
-- ============================================
CREATE TABLE IF NOT EXISTS index_entries (
    keyword    VARCHAR(255)  NOT NULL COMMENT '关键词',
    url        VARCHAR(2048) NOT NULL COMMENT 'URL地址',
    weight     FLOAT         NOT NULL COMMENT '权重分数',
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    -- Only the first 255 characters of url take part in the key, so two URLs
    -- that share that prefix collide for the same keyword.
    PRIMARY KEY (keyword(255), url(255)),
    -- NOTE(review): keyword is already the leading column of the primary key;
    -- this index looks redundant -- confirm no query hints depend on it
    -- before dropping it.
    INDEX idx_keyword (keyword),
    INDEX idx_weight (weight),
    INDEX idx_url (url(255))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='倒排索引表';

-- ============================================
-- URL snippet table
-- Stores the title, description, body text excerpt, etc. of each URL.
-- ============================================
CREATE TABLE IF NOT EXISTS url_snippets (
    id           BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    url          VARCHAR(2048) NOT NULL COMMENT 'URL地址(唯一)',
    url_hash     VARCHAR(64)   NOT NULL COMMENT 'URL的MD5哈希(用于快速查询)',
    title        VARCHAR(512) COMMENT '页面标题',
    description  TEXT COMMENT 'meta description或自动生成的描述',
    -- `text` and `timestamp` are MySQL keywords; they are quoted so they are
    -- always parsed as column names.
    `text`       MEDIUMTEXT COMMENT '正文文本片段',
    `timestamp`  BIGINT COMMENT '抓取时间戳',
    content_hash VARCHAR(64) COMMENT '内容哈希(用于增量检测)',
    created_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    -- Uniqueness is enforced on the first 255 characters of url only.
    -- NOTE(review): longer URLs sharing that prefix are rejected as
    -- duplicates, while url_hash is indexed but not unique -- confirm this
    -- is intended.
    UNIQUE KEY uk_url (url(255)),
    INDEX idx_url_hash (url_hash),
    INDEX idx_timestamp (`timestamp`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='URL摘要缓存表';

-- ============================================
-- Site information table
-- Stores per-host metadata, one row per host.
-- ============================================
CREATE TABLE IF NOT EXISTS site_info (
    host            VARCHAR(255) PRIMARY KEY COMMENT '域名/主机名',
    visit_count     INT UNSIGNED DEFAULT 0 COMMENT '访问次数',
    last_visit_time BIGINT COMMENT '最后访问时间戳',
    fingerprint     TEXT COMMENT '网站指纹',
    success_rate    FLOAT COMMENT '成功率',
    html_structure  TEXT COMMENT 'HTML结构特征',
    ips             JSON COMMENT 'IP地址列表',
    quality         FLOAT COMMENT '质量评分',
    https_available TINYINT COMMENT '是否支持HTTPS(1=是,0=否)',
    keywords        JSON COMMENT '高频关键词列表',
    out_links       JSON COMMENT '出站链接列表',
    languages       JSON COMMENT '语种分布',
    redirects       JSON COMMENT '重定向链',
    server_types    JSON COMMENT 'Server类型',
    created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间'
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='网站元信息表';

-- ============================================
-- Flush marker table
-- Records flush progress so that an interrupted flush can be resumed.
-- ============================================
CREATE TABLE IF NOT EXISTS flush_marker (
    id              BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    flush_type      VARCHAR(50) NOT NULL COMMENT '刷盘类型:index, gate, site',
    last_key        VARCHAR(255) COMMENT '最后处理的key',
    last_cursor     BIGINT DEFAULT 0 COMMENT 'Redis SCAN游标',
    processed_count INT UNSIGNED DEFAULT 0 COMMENT '本批次处理数量',
    flush_time      TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '刷盘时间',
    INDEX idx_type (flush_type),
    INDEX idx_time (flush_time)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='刷盘进度记录表';

-- ============================================
-- Priority URL table
-- Stores the URLs that should be crawled with priority.
-- ============================================
CREATE TABLE IF NOT EXISTS priority_urls (
    url        VARCHAR(2048) NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- The key covers the first 255 characters of url only, so URLs sharing
    -- that prefix cannot both be stored.
    PRIMARY KEY (url(255)),
    INDEX idx_created (created_at)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci COMMENT='优先爬取URL表';