-- ============================================
-- sese-engine MySQL initialization script
-- Automatically creates the databases (if they do not exist) and the table structure
-- ============================================

-- Create the databases (if they do not exist).
-- Both statements are no-ops when the schema is already present.
-- NOTE(review): no USE statement follows and the table names in this script
-- are unqualified, so the tables are created in whichever schema the
-- executing connection has selected -- confirm the caller selects one.
CREATE DATABASE IF NOT EXISTS `sese_engine`
    DEFAULT CHARACTER SET = utf8mb4
    DEFAULT COLLATE = utf8mb4_unicode_ci;

CREATE DATABASE IF NOT EXISTS `sese_test2`
    DEFAULT CHARACTER SET = utf8mb4
    DEFAULT COLLATE = utf8mb4_unicode_ci;

-- ============================================
-- Inverted index table
-- Stores the keyword -> URL mapping together with its weight.
-- ============================================
CREATE TABLE IF NOT EXISTS `index_entries` (
    -- keyword / URL address / weight score
    `keyword`    VARCHAR(255)  NOT NULL COMMENT '关键词',
    `url`        VARCHAR(2048) NOT NULL COMMENT 'URL地址',
    `weight`     FLOAT         NOT NULL COMMENT '权重分数',
    -- creation time / last update time (maintained by the server defaults)
    `created_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    `updated_at` TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    -- url is indexed by a 255-character prefix, so row identity is
    -- (keyword, first 255 characters of url).
    -- NOTE(review): two URLs that share their first 255 characters collide
    -- under the same keyword -- confirm this is acceptable for long URLs.
    PRIMARY KEY (`keyword`(255), `url`(255)),
    KEY `idx_keyword` (`keyword`),
    KEY `idx_weight` (`weight`),
    KEY `idx_url` (`url`(255))
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci
    COMMENT = '倒排索引表';

-- ============================================
-- URL snippet table
-- Stores the title, description, body-text snippet, etc. of each URL.
-- ============================================
CREATE TABLE IF NOT EXISTS `url_snippets` (
    `id`           BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    -- URL address (unique) / MD5 hash of the URL (for fast lookup)
    `url`          VARCHAR(2048)   NOT NULL COMMENT 'URL地址(唯一)',
    `url_hash`     VARCHAR(64)     NOT NULL COMMENT 'URL的MD5哈希(用于快速查询)',
    -- page title / meta description or auto-generated description /
    -- body text snippet
    `title`        VARCHAR(512)    COMMENT '页面标题',
    `description`  TEXT            COMMENT 'meta description或自动生成的描述',
    `text`         MEDIUMTEXT      COMMENT '正文文本片段',
    -- crawl timestamp / content hash (for incremental change detection)
    `timestamp`    BIGINT          COMMENT '抓取时间戳',
    `content_hash` VARCHAR(64)     COMMENT '内容哈希(用于增量检测)',
    -- creation time / last update time (maintained by the server defaults)
    `created_at`   TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    `updated_at`   TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    PRIMARY KEY (`id`),
    -- Uniqueness is enforced on the first 255 characters of url only;
    -- the url_hash index below is not unique.
    -- NOTE(review): long URLs sharing a 255-character prefix cannot coexist --
    -- confirm whether url_hash was meant to carry the uniqueness instead.
    UNIQUE KEY `uk_url` (`url`(255)),
    KEY `idx_url_hash` (`url_hash`),
    KEY `idx_timestamp` (`timestamp`)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci
    COMMENT = 'URL摘要缓存表';

-- ============================================
-- Site information table
-- Stores the meta-information of each domain; one row per host.
-- ============================================
CREATE TABLE IF NOT EXISTS `site_info` (
    -- domain / host name
    `host`            VARCHAR(255) NOT NULL COMMENT '域名/主机名',
    -- visit count / last visit timestamp
    `visit_count`     INT UNSIGNED DEFAULT 0 COMMENT '访问次数',
    `last_visit_time` BIGINT       COMMENT '最后访问时间戳',
    -- site fingerprint / success rate / HTML structure features
    `fingerprint`     TEXT         COMMENT '网站指纹',
    `success_rate`    FLOAT        COMMENT '成功率',
    `html_structure`  TEXT         COMMENT 'HTML结构特征',
    -- IP address list / quality score / HTTPS support flag (1 = yes, 0 = no)
    `ips`             JSON         COMMENT 'IP地址列表',
    `quality`         FLOAT        COMMENT '质量评分',
    `https_available` TINYINT      COMMENT '是否支持HTTPS(1=是,0=否)',
    -- high-frequency keyword list / outbound link list / language
    -- distribution / redirect chain / Server type
    `keywords`        JSON         COMMENT '高频关键词列表',
    `out_links`       JSON         COMMENT '出站链接列表',
    `languages`       JSON         COMMENT '语种分布',
    `redirects`       JSON         COMMENT '重定向链',
    `server_types`    JSON         COMMENT 'Server类型',
    -- creation time / last update time (maintained by the server defaults)
    `created_at`      TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    `updated_at`      TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    PRIMARY KEY (`host`)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci
    COMMENT = '网站元信息表';

-- ============================================
-- Flush record table
-- Used for resuming from a checkpoint; records flush progress.
-- ============================================
CREATE TABLE IF NOT EXISTS `flush_marker` (
    `id`              BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    -- flush type: index, gate, site
    `flush_type`      VARCHAR(50)     NOT NULL COMMENT '刷盘类型:index, gate, site',
    -- last processed key / Redis SCAN cursor
    `last_key`        VARCHAR(255)    COMMENT '最后处理的key',
    `last_cursor`     BIGINT          DEFAULT 0 COMMENT 'Redis SCAN游标',
    -- number of items processed in this batch / time of the flush
    `processed_count` INT UNSIGNED    DEFAULT 0 COMMENT '本批次处理数量',
    `flush_time`      TIMESTAMP       DEFAULT CURRENT_TIMESTAMP COMMENT '刷盘时间',
    PRIMARY KEY (`id`),
    KEY `idx_type` (`flush_type`),
    KEY `idx_time` (`flush_time`)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci
    COMMENT = '刷盘进度记录表';

-- ============================================
-- Priority crawl URL table
-- Stores the URLs that need to be crawled with priority.
-- ============================================
CREATE TABLE IF NOT EXISTS `priority_urls` (
    `url`        VARCHAR(2048) NOT NULL,
    `created_at` TIMESTAMP     DEFAULT CURRENT_TIMESTAMP,
    -- The key covers only the first 255 characters of url.
    -- NOTE(review): URLs sharing a 255-character prefix are treated as the
    -- same row -- confirm this is acceptable for long URLs.
    PRIMARY KEY (`url`(255)),
    KEY `idx_created` (`created_at`)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci
    COMMENT = '优先爬取URL表';