修改成redis
This commit is contained in:
@@ -0,0 +1,98 @@
|
||||
-- ============================================
-- sese-engine MySQL initialization script
-- Creates the databases (if they do not exist) and the table schema.
-- ============================================
|
||||
|
||||
-- Create the databases when they are missing. Both default to the utf8mb4
-- character set with the utf8mb4_unicode_ci collation.
-- NOTE(review): no USE statement follows, so the CREATE TABLE statements in
-- this script run against whatever database the client session has already
-- selected -- confirm that this is intended.
CREATE DATABASE IF NOT EXISTS sese_engine
    DEFAULT CHARACTER SET utf8mb4
    DEFAULT COLLATE utf8mb4_unicode_ci;

CREATE DATABASE IF NOT EXISTS sese_test2
    DEFAULT CHARACTER SET utf8mb4
    DEFAULT COLLATE utf8mb4_unicode_ci;
|
||||
|
||||
-- ============================================
-- Inverted index table
-- Maps each keyword to a URL together with a weight score.
-- ============================================
CREATE TABLE IF NOT EXISTS index_entries (
    keyword     VARCHAR(255)  NOT NULL COMMENT '关键词',
    url         VARCHAR(2048) NOT NULL COMMENT 'URL地址',
    weight      FLOAT         NOT NULL COMMENT '权重分数',
    created_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    -- Uniqueness is enforced on the indexed prefix only, so two different URLs
    -- that share their leading characters collide under the same keyword.
    -- The url prefix is therefore as long as InnoDB's 3072-byte key limit
    -- allows next to keyword: 255 * 4 + 512 * 4 = 3068 bytes with utf8mb4.
    -- URLs that differ only after the first 512 characters still collide.
    PRIMARY KEY (keyword, url(512)),
    -- NOTE(review): idx_keyword overlaps the leading column of the primary
    -- key; it is kept so existing index hints keep working.
    INDEX idx_keyword (keyword),
    INDEX idx_weight (weight),
    INDEX idx_url (url(255))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
COMMENT='倒排索引表';
|
||||
|
||||
-- ============================================
-- URL snippet table
-- Stores the title, description and a body-text excerpt for each URL.
-- ============================================
CREATE TABLE IF NOT EXISTS url_snippets (
    id            BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    url           VARCHAR(2048) NOT NULL COMMENT 'URL地址(唯一)',
    url_hash      VARCHAR(64)   NOT NULL COMMENT 'URL的MD5哈希(用于快速查询)',
    title         VARCHAR(512)  COMMENT '页面标题',
    description   TEXT          COMMENT 'meta description或自动生成的描述',
    -- `text` and `timestamp` are MySQL keywords; quoting them keeps the
    -- column names unambiguous.
    `text`        MEDIUMTEXT    COMMENT '正文文本片段',
    `timestamp`   BIGINT        COMMENT '抓取时间戳',
    content_hash  VARCHAR(64)   COMMENT '内容哈希(用于增量检测)',
    created_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at    TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    -- A unique index on a column prefix only compares that prefix, so
    -- distinct URLs sharing their leading characters are rejected as
    -- duplicates. 768 characters (3072 bytes with utf8mb4) is the longest
    -- prefix InnoDB accepts; URLs that differ only beyond it still collide.
    -- NOTE(review): if the application always fills url_hash from url, a
    -- unique index on url_hash would remove the limitation entirely.
    UNIQUE KEY uk_url (url(768)),
    INDEX idx_url_hash (url_hash),
    INDEX idx_timestamp (`timestamp`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
COMMENT='URL摘要缓存表';
|
||||
|
||||
-- ============================================
-- Site information table
-- Holds the per-host metadata; one row per host name.
-- ============================================
CREATE TABLE IF NOT EXISTS site_info (
    -- Domain / host name; primary key of the table.
    host             VARCHAR(255) NOT NULL COMMENT '域名/主机名',
    -- Visit counter, starts at 0.
    visit_count      INT UNSIGNED DEFAULT 0 COMMENT '访问次数',
    -- Timestamp of the most recent visit (unit not stated here).
    last_visit_time  BIGINT COMMENT '最后访问时间戳',
    -- Site fingerprint.
    fingerprint      TEXT COMMENT '网站指纹',
    -- Success rate.
    success_rate     FLOAT COMMENT '成功率',
    -- HTML structure features.
    html_structure   TEXT COMMENT 'HTML结构特征',
    -- List of IP addresses.
    ips              JSON COMMENT 'IP地址列表',
    -- Quality score.
    quality          FLOAT COMMENT '质量评分',
    -- HTTPS support flag: 1 = yes, 0 = no.
    https_available  TINYINT COMMENT '是否支持HTTPS(1=是,0=否)',
    -- List of high-frequency keywords.
    keywords         JSON COMMENT '高频关键词列表',
    -- List of outbound links.
    out_links        JSON COMMENT '出站链接列表',
    -- Language distribution.
    languages        JSON COMMENT '语种分布',
    -- Redirect chain.
    redirects        JSON COMMENT '重定向链',
    -- Server types.
    server_types     JSON COMMENT 'Server类型',
    -- Row bookkeeping; updated_at is refreshed on every UPDATE.
    created_at       TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at       TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    PRIMARY KEY (host)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci
    COMMENT = '网站元信息表';
|
||||
|
||||
-- ============================================
-- Flush marker table
-- Records flush progress so an interrupted flush can be resumed.
-- ============================================
CREATE TABLE IF NOT EXISTS flush_marker (
    id               BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    -- Flush type; the column comment lists index, gate and site.
    flush_type       VARCHAR(50) NOT NULL COMMENT '刷盘类型:index, gate, site',
    -- Last key that was processed.
    last_key         VARCHAR(255) COMMENT '最后处理的key',
    -- Redis SCAN cursor, starts at 0.
    last_cursor      BIGINT DEFAULT 0 COMMENT 'Redis SCAN游标',
    -- Number of items processed in this batch.
    processed_count  INT UNSIGNED DEFAULT 0 COMMENT '本批次处理数量',
    -- Time of the flush; defaults to the insert time.
    flush_time       TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '刷盘时间',
    PRIMARY KEY (id),
    KEY idx_type (flush_type),
    KEY idx_time (flush_time)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci
    COMMENT = '刷盘进度记录表';
|
||||
|
||||
-- ============================================
-- Priority URL table
-- Holds the URLs that should be crawled with priority.
-- ============================================
CREATE TABLE IF NOT EXISTS priority_urls (
    url         VARCHAR(2048) NOT NULL,
    created_at  TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- The primary key compares only the indexed prefix, so distinct URLs that
    -- share their leading characters cannot both be stored. 768 characters
    -- (3072 bytes with utf8mb4) is the longest prefix InnoDB accepts; URLs
    -- that differ only beyond that point still collide.
    PRIMARY KEY (url(768)),
    INDEX idx_created (created_at)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
COMMENT='优先爬取URL表';
|
||||
Reference in New Issue
Block a user