-- File: sese-engine-go/mysql/init_db.sql
-- Last modified: 2026-04-20 18:26:54 +08:00 (99 lines, 4.6 KiB, SQL)
--
-- Note from the hosting UI: this file contains Unicode characters that might be
-- confused with other characters. If that is intentional, the warning can be ignored.
-- ============================================
-- sese-engine MySQL 初始化脚本
-- 自动创建数据库(如果不存在)并创建表结构
-- ============================================
-- Create the databases when they are missing. IF NOT EXISTS makes re-running a no-op.
-- NOTE(review): no USE statement appears in the visible part of this script, so the
-- CREATE TABLE statements run against whichever database the client session has
-- selected - confirm this is intended.
CREATE DATABASE IF NOT EXISTS sese_engine
    DEFAULT CHARACTER SET = utf8mb4
    DEFAULT COLLATE = utf8mb4_unicode_ci;
CREATE DATABASE IF NOT EXISTS sese_test2
    DEFAULT CHARACTER SET = utf8mb4
    DEFAULT COLLATE = utf8mb4_unicode_ci;
-- ============================================
-- Inverted index table
-- Stores the keyword-to-URL mapping together with its weight.
-- ============================================
CREATE TABLE IF NOT EXISTS index_entries (
    -- Search keyword
    keyword VARCHAR(255) NOT NULL COMMENT '关键词',
    -- URL the keyword maps to
    url VARCHAR(2048) NOT NULL COMMENT 'URL地址',
    -- Weight score of this keyword/URL pair
    weight FLOAT NOT NULL COMMENT '权重分数',
    -- Row creation time
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    -- Row last-update time, refreshed automatically on UPDATE
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    -- One row per (keyword, url) pair.
    -- NOTE(review): url is indexed by its first 255 characters only, so two URLs that
    -- share a 255-character prefix collide under the same keyword - confirm acceptable.
    PRIMARY KEY (keyword(255), url(255)),
    -- No standalone index on keyword: it is the leftmost column of the primary key,
    -- which already serves keyword lookups, so a second index would only add write
    -- and storage overhead.
    INDEX idx_weight (weight),
    INDEX idx_url (url(255))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
COMMENT='倒排索引表';
-- ============================================
-- URL snippet table
-- Stores the title, description, body-text fragment, etc. of each URL.
-- ============================================
CREATE TABLE IF NOT EXISTS url_snippets (
    -- Surrogate key
    id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    -- URL (unique)
    url VARCHAR(2048) NOT NULL COMMENT 'URL地址(唯一)',
    -- MD5 hash of the URL, kept for fast lookups
    url_hash VARCHAR(64) NOT NULL COMMENT 'URL的MD5哈希(用于快速查询)',
    -- Page title
    title VARCHAR(512) COMMENT '页面标题',
    -- Meta description, or an automatically generated description
    description TEXT COMMENT 'meta description或自动生成的描述',
    -- Fragment of the page body text
    `text` MEDIUMTEXT COMMENT '正文文本片段',
    -- Crawl timestamp
    `timestamp` BIGINT COMMENT '抓取时间戳',
    -- Content hash, used for incremental change detection
    content_hash VARCHAR(64) COMMENT '内容哈希(用于增量检测)',
    -- Row creation time
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    -- Row last-update time, refreshed automatically on UPDATE
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    PRIMARY KEY (id),
    -- NOTE(review): uniqueness is enforced on the first 255 characters of url only,
    -- so longer URLs that share that prefix conflict - confirm acceptable.
    UNIQUE KEY uk_url (url(255)),
    KEY idx_url_hash (url_hash),
    KEY idx_timestamp (`timestamp`)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci
    COMMENT = 'URL摘要缓存表';
-- ============================================
-- Site information table
-- Stores the meta information of each domain.
-- ============================================
CREATE TABLE IF NOT EXISTS site_info (
    -- Domain / host name
    host VARCHAR(255) PRIMARY KEY COMMENT '域名/主机名',
    -- Number of visits
    visit_count INT UNSIGNED DEFAULT 0 COMMENT '访问次数',
    -- Timestamp of the last visit
    last_visit_time BIGINT COMMENT '最后访问时间戳',
    -- Site fingerprint
    fingerprint TEXT COMMENT '网站指纹',
    -- Success rate
    success_rate FLOAT COMMENT '成功率',
    -- HTML structure features
    html_structure TEXT COMMENT 'HTML结构特征',
    -- List of IP addresses
    ips JSON COMMENT 'IP地址列表',
    -- Quality score
    quality FLOAT COMMENT '质量评分',
    -- Whether HTTPS is supported (1 = yes, 0 = no).
    -- The column comment previously lacked its opening parenthesis.
    https_available TINYINT COMMENT '是否支持HTTPS(1=是,0=否)',
    -- List of high-frequency keywords
    keywords JSON COMMENT '高频关键词列表',
    -- List of outbound links
    out_links JSON COMMENT '出站链接列表',
    -- Language distribution
    languages JSON COMMENT '语种分布',
    -- Redirect chain
    redirects JSON COMMENT '重定向链',
    -- Server types
    server_types JSON COMMENT 'Server类型',
    -- Row creation time
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    -- Row last-update time, refreshed automatically on UPDATE
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间'
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
COMMENT='网站元信息表';
-- ============================================
-- Flush record table
-- Used for resuming after an interruption: records flush progress.
-- ============================================
CREATE TABLE IF NOT EXISTS flush_marker (
    -- Surrogate key
    id BIGINT UNSIGNED NOT NULL AUTO_INCREMENT,
    -- Flush type: index, gate, site
    flush_type VARCHAR(50) NOT NULL COMMENT '刷盘类型:index, gate, site',
    -- Last key that was processed
    last_key VARCHAR(255) COMMENT '最后处理的key',
    -- Redis SCAN cursor
    last_cursor BIGINT DEFAULT 0 COMMENT 'Redis SCAN游标',
    -- Number of items processed in this batch
    processed_count INT UNSIGNED DEFAULT 0 COMMENT '本批次处理数量',
    -- Time of the flush
    flush_time TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '刷盘时间',
    PRIMARY KEY (id),
    KEY idx_type (flush_type),
    KEY idx_time (flush_time)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci
    COMMENT = '刷盘进度记录表';
-- ============================================
-- Priority crawl URL table
-- Stores the URLs that need to be crawled with priority.
-- ============================================
CREATE TABLE IF NOT EXISTS priority_urls (
    -- URL to crawl
    url VARCHAR(2048) NOT NULL,
    -- Time the row was inserted
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- NOTE(review): the key covers the first 255 characters of url only, so longer
    -- URLs that share that prefix are treated as duplicates - confirm acceptable.
    PRIMARY KEY (url(255)),
    KEY idx_created (created_at)
)
    ENGINE = InnoDB
    DEFAULT CHARACTER SET = utf8mb4
    COLLATE = utf8mb4_unicode_ci
    COMMENT = '优先爬取URL表';