修改成redis
This commit is contained in:
@@ -0,0 +1,445 @@
|
||||
package mysql
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"log"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
goredis "github.com/redis/go-redis/v9"
|
||||
)
|
||||
|
||||
// Flusher 管理 Redis → MySQL 刷盘任务
|
||||
type Flusher struct {
|
||||
redisDB *goredis.Client // Redis 客户端引用
|
||||
interval time.Duration // 刷盘间隔
|
||||
batchSize int // 每批次处理数量
|
||||
mu sync.Mutex // 防止并发刷盘
|
||||
stopCh chan struct{}
|
||||
wg sync.WaitGroup
|
||||
}
|
||||
|
||||
// NewFlusher 创建刷盘器
|
||||
func NewFlusher(redisDB *goredis.Client, interval time.Duration, batchSize int) *Flusher {
|
||||
if interval <= 0 {
|
||||
interval = 5 * time.Minute
|
||||
}
|
||||
if batchSize <= 0 {
|
||||
batchSize = 1000
|
||||
}
|
||||
return &Flusher{
|
||||
redisDB: redisDB,
|
||||
interval: interval,
|
||||
batchSize: batchSize,
|
||||
stopCh: make(chan struct{}),
|
||||
}
|
||||
}
|
||||
|
||||
// Start 启动后台刷盘任务
|
||||
func (f *Flusher) Start() {
|
||||
f.wg.Add(1)
|
||||
go func() {
|
||||
defer f.wg.Done()
|
||||
ticker := time.NewTicker(f.interval)
|
||||
defer ticker.Stop()
|
||||
|
||||
for {
|
||||
select {
|
||||
case <-ticker.C:
|
||||
f.RunAll()
|
||||
case <-f.stopCh:
|
||||
log.Printf("[mysql-flusher] stopped")
|
||||
return
|
||||
}
|
||||
}
|
||||
}()
|
||||
log.Printf("[mysql-flusher] started (interval=%v, batchSize=%d)", f.interval, f.batchSize)
|
||||
}
|
||||
|
||||
// Stop 停止刷盘任务
|
||||
func (f *Flusher) Stop() {
|
||||
close(f.stopCh)
|
||||
f.wg.Wait()
|
||||
}
|
||||
|
||||
// RunAll 执行所有类型的刷盘
|
||||
func (f *Flusher) RunAll() {
|
||||
f.mu.Lock()
|
||||
defer f.mu.Unlock()
|
||||
|
||||
start := time.Now()
|
||||
log.Printf("[mysql-flusher] === starting flush ===")
|
||||
|
||||
// 刷盘顺序:snippet → site → index(按数据量从小到大)
|
||||
f.flushSnippets()
|
||||
f.flushSites()
|
||||
f.flushIndex()
|
||||
f.flushPriorityURLs()
|
||||
|
||||
log.Printf("[mysql-flusher] === flush done (took %v) ===", time.Since(start))
|
||||
}
|
||||
|
||||
// flushSnippets 将 Redis gate:* 数据刷到 url_snippets 表
|
||||
func (f *Flusher) flushSnippets() {
|
||||
ctx := context.Background()
|
||||
var cursor uint64
|
||||
total := 0
|
||||
|
||||
for {
|
||||
keys, nextCursor, err := f.redisDB.Scan(ctx, cursor, "gate:*", int64(f.batchSize)).Result()
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][snippets] scan error: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
if len(keys) > 0 {
|
||||
f.batchUpsertSnippets(ctx, keys)
|
||||
total += len(keys)
|
||||
}
|
||||
|
||||
cursor = nextCursor
|
||||
if cursor == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if total > 0 {
|
||||
log.Printf("[mysql-flusher][snippets] flushed %d entries", total)
|
||||
}
|
||||
}
|
||||
|
||||
// batchUpsertSnippets 批量 upsert url_snippets
|
||||
func (f *Flusher) batchUpsertSnippets(ctx context.Context, keys []string) {
|
||||
if len(keys) == 0 || DB == nil {
|
||||
return
|
||||
}
|
||||
|
||||
query := `INSERT INTO url_snippets (url, url_hash, title, description, text, timestamp, content_hash)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?)
|
||||
ON DUPLICATE KEY UPDATE
|
||||
title = VALUES(title),
|
||||
description = VALUES(description),
|
||||
text = VALUES(text),
|
||||
timestamp = VALUES(timestamp),
|
||||
content_hash = VALUES(content_hash)`
|
||||
|
||||
tx, err := DB.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][snippets] begin tx error: %v", err)
|
||||
return
|
||||
}
|
||||
defer tx.Rollback()
|
||||
|
||||
stmt, err := tx.PrepareContext(ctx, query)
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][snippets] prepare error: %v", err)
|
||||
return
|
||||
}
|
||||
defer stmt.Close()
|
||||
|
||||
for _, key := range keys {
|
||||
data, err := f.redisDB.HGetAll(ctx, key).Result()
|
||||
if err != nil || len(data) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
url := data["url"]
|
||||
urlHash := data["url_hash"]
|
||||
if urlHash == "" {
|
||||
// 从 key 中提取 hash(key 格式:gate:<hash>)
|
||||
urlHash = strings.TrimPrefix(key, "gate:")
|
||||
}
|
||||
|
||||
title := data["title"]
|
||||
description := data["desc"]
|
||||
text := data["text"]
|
||||
ts := parseInt64(data["ts"])
|
||||
contentHash := data["hash"]
|
||||
|
||||
_, err = stmt.ExecContext(ctx, url, urlHash, title, description, text, ts, contentHash)
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][snippets] exec error for %s: %v", url, err)
|
||||
}
|
||||
}
|
||||
|
||||
if err := tx.Commit(); err != nil {
|
||||
log.Printf("[mysql-flusher][snippets] commit error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// flushSites 将 Redis site:* 数据刷到 site_info 表
|
||||
func (f *Flusher) flushSites() {
|
||||
ctx := context.Background()
|
||||
var cursor uint64
|
||||
total := 0
|
||||
|
||||
for {
|
||||
keys, nextCursor, err := f.redisDB.Scan(ctx, cursor, "site:*", int64(f.batchSize)).Result()
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][sites] scan error: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
if len(keys) > 0 {
|
||||
f.batchUpsertSites(ctx, keys)
|
||||
total += len(keys)
|
||||
}
|
||||
|
||||
cursor = nextCursor
|
||||
if cursor == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if total > 0 {
|
||||
log.Printf("[mysql-flusher][sites] flushed %d entries", total)
|
||||
}
|
||||
}
|
||||
|
||||
// batchUpsertSites 批量 upsert site_info
|
||||
func (f *Flusher) batchUpsertSites(ctx context.Context, keys []string) {
|
||||
if len(keys) == 0 || DB == nil {
|
||||
return
|
||||
}
|
||||
|
||||
query := `INSERT INTO site_info (host, visit_count, last_visit_time, fingerprint, success_rate,
|
||||
html_structure, ips, quality, https_available, keywords, out_links,
|
||||
languages, redirects, server_types)
|
||||
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
||||
ON DUPLICATE KEY UPDATE
|
||||
visit_count = VALUES(visit_count),
|
||||
last_visit_time = VALUES(last_visit_time),
|
||||
fingerprint = VALUES(fingerprint),
|
||||
success_rate = VALUES(success_rate),
|
||||
html_structure = VALUES(html_structure),
|
||||
ips = VALUES(ips),
|
||||
quality = VALUES(quality),
|
||||
https_available = VALUES(https_available),
|
||||
keywords = VALUES(keywords),
|
||||
out_links = VALUES(out_links),
|
||||
languages = VALUES(languages),
|
||||
redirects = VALUES(redirects),
|
||||
server_types = VALUES(server_types)`
|
||||
|
||||
tx, err := DB.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][sites] begin tx error: %v", err)
|
||||
return
|
||||
}
|
||||
defer tx.Rollback()
|
||||
|
||||
stmt, err := tx.PrepareContext(ctx, query)
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][sites] prepare error: %v", err)
|
||||
return
|
||||
}
|
||||
defer stmt.Close()
|
||||
|
||||
for _, key := range keys {
|
||||
data, err := f.redisDB.HGetAll(ctx, key).Result()
|
||||
if err != nil || len(data) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
host := strings.TrimPrefix(key, "site:")
|
||||
|
||||
visitCount := int(parseInt64(data["visit_count"]))
|
||||
lastVisitTime := parseInt64(data["last_visit_time"])
|
||||
fingerprint := data["fingerprint"]
|
||||
htmlStructure := data["html_structure"]
|
||||
|
||||
var successRate *float64
|
||||
if v := data["success_rate"]; v != "" {
|
||||
f := parseFloat(v)
|
||||
successRate = &f
|
||||
}
|
||||
|
||||
var quality *float64
|
||||
if v := data["quality"]; v != "" {
|
||||
q := parseFloat(v)
|
||||
quality = &q
|
||||
}
|
||||
|
||||
var httpsAvailable *int8
|
||||
if v := data["https_available"]; v != "" {
|
||||
i := int8(parseInt64(v))
|
||||
httpsAvailable = &i
|
||||
}
|
||||
|
||||
// JSON 字段:空字符串转为 NULL 或 "[]"
|
||||
// MySQL JSON 类型不接受空字符串
|
||||
ips := data["ips"]
|
||||
if ips == "" {
|
||||
ips = "[]"
|
||||
}
|
||||
keywords := data["keywords"]
|
||||
if keywords == "" {
|
||||
keywords = "[]"
|
||||
}
|
||||
outLinks := data["out_links"]
|
||||
if outLinks == "" {
|
||||
outLinks = "[]"
|
||||
}
|
||||
languages := data["languages"]
|
||||
if languages == "" {
|
||||
languages = "[]"
|
||||
}
|
||||
redirects := data["redirects"]
|
||||
if redirects == "" {
|
||||
redirects = "[]"
|
||||
}
|
||||
serverTypes := data["server_types"]
|
||||
if serverTypes == "" {
|
||||
serverTypes = "[]"
|
||||
}
|
||||
|
||||
_, err = stmt.ExecContext(ctx, host, visitCount, lastVisitTime, fingerprint, successRate,
|
||||
htmlStructure, ips, quality, httpsAvailable, keywords, outLinks,
|
||||
languages, redirects, serverTypes)
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][sites] exec error for %s: %v", host, err)
|
||||
}
|
||||
}
|
||||
|
||||
if err := tx.Commit(); err != nil {
|
||||
log.Printf("[mysql-flusher][sites] commit error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// flushIndex 将 Redis idx:* 数据刷到 index_entries 表
|
||||
func (f *Flusher) flushIndex() {
|
||||
ctx := context.Background()
|
||||
var cursor uint64
|
||||
total := 0
|
||||
|
||||
for {
|
||||
keys, nextCursor, err := f.redisDB.Scan(ctx, cursor, "idx:*", int64(f.batchSize)).Result()
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][index] scan error: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
if len(keys) > 0 {
|
||||
f.batchUpsertIndex(ctx, keys)
|
||||
total += len(keys)
|
||||
}
|
||||
|
||||
cursor = nextCursor
|
||||
if cursor == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if total > 0 {
|
||||
log.Printf("[mysql-flusher][index] flushed %d keywords", total)
|
||||
}
|
||||
}
|
||||
|
||||
// batchUpsertIndex 批量 upsert index_entries
|
||||
func (f *Flusher) batchUpsertIndex(ctx context.Context, keys []string) {
|
||||
if len(keys) == 0 || DB == nil {
|
||||
return
|
||||
}
|
||||
|
||||
tx, err := DB.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][index] begin tx error: %v", err)
|
||||
return
|
||||
}
|
||||
defer tx.Rollback()
|
||||
|
||||
stmt, err := tx.PrepareContext(ctx, `INSERT INTO index_entries (keyword, url, weight)
|
||||
VALUES (?, ?, ?)
|
||||
ON DUPLICATE KEY UPDATE weight = VALUES(weight)`)
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][index] prepare error: %v", err)
|
||||
return
|
||||
}
|
||||
defer stmt.Close()
|
||||
|
||||
for _, key := range keys {
|
||||
keyword := strings.TrimPrefix(key, "idx:")
|
||||
|
||||
// 获取有序集合中的所有成员
|
||||
entries, err := f.redisDB.ZRevRangeWithScores(ctx, key, 0, -1).Result()
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][index] zrange error for %s: %v", keyword, err)
|
||||
continue
|
||||
}
|
||||
|
||||
for _, entry := range entries {
|
||||
url := entry.Member.(string)
|
||||
weight := float32(entry.Score)
|
||||
_, err = stmt.ExecContext(ctx, keyword, url, weight)
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][index] exec error for %s/%s: %v", keyword, url, err)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if err := tx.Commit(); err != nil {
|
||||
log.Printf("[mysql-flusher][index] commit error: %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
// flushPriorityURLs 将 Redis priority:* 数据刷到 priority_urls 表
|
||||
func (f *Flusher) flushPriorityURLs() {
|
||||
ctx := context.Background()
|
||||
|
||||
keys, err := f.redisDB.Keys(ctx, "priority:*").Result()
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][priority] keys error: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
if len(keys) == 0 || DB == nil {
|
||||
return
|
||||
}
|
||||
|
||||
tx, err := DB.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][priority] begin tx error: %v", err)
|
||||
return
|
||||
}
|
||||
defer tx.Rollback()
|
||||
|
||||
stmt, err := tx.PrepareContext(ctx, `INSERT IGNORE INTO priority_urls (url) VALUES (?)`)
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][priority] prepare error: %v", err)
|
||||
return
|
||||
}
|
||||
defer stmt.Close()
|
||||
|
||||
for _, key := range keys {
|
||||
url := strings.TrimPrefix(key, "priority:")
|
||||
_, err = stmt.ExecContext(ctx, url)
|
||||
if err != nil {
|
||||
log.Printf("[mysql-flusher][priority] exec error for %s: %v", url, err)
|
||||
}
|
||||
}
|
||||
|
||||
if err := tx.Commit(); err != nil {
|
||||
log.Printf("[mysql-flusher][priority] commit error: %v", err)
|
||||
return
|
||||
}
|
||||
|
||||
log.Printf("[mysql-flusher][priority] flushed %d entries", len(keys))
|
||||
}
|
||||
|
||||
// ============================================
|
||||
// 辅助函数
|
||||
// ============================================
|
||||
|
||||
// parseInt64 reads a leading decimal integer from s and returns it.
// It returns 0 when s does not start with an integer; trailing text after the
// number is ignored.
func parseInt64(s string) int64 {
	var n int64
	// Best-effort parse: on failure n keeps its zero value.
	_, _ = fmt.Sscanf(s, "%d", &n)
	return n
}
|
||||
|
||||
// parseFloat reads a leading floating-point number from s and returns it.
// It returns 0 when s does not start with a number.
func parseFloat(s string) float64 {
	var x float64
	// Best-effort parse: on failure x keeps its zero value.
	_, _ = fmt.Sscanf(s, "%f", &x)
	return x
}
|
||||
@@ -0,0 +1,98 @@
|
||||
-- ------------------------------------------------------------
-- sese-engine MySQL bootstrap script
-- Creates the databases when they are missing, then the tables.
-- ------------------------------------------------------------

-- Create the databases if they do not exist yet.
CREATE DATABASE IF NOT EXISTS sese_engine
    DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
CREATE DATABASE IF NOT EXISTS sese_test2
    DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
|
||||
|
||||
-- ------------------------------------------------------------
-- Inverted index table
-- Maps each keyword to a URL together with its weight.
-- ------------------------------------------------------------
CREATE TABLE IF NOT EXISTS index_entries (
    keyword    VARCHAR(255)  NOT NULL COMMENT '关键词',
    url        VARCHAR(2048) NOT NULL COMMENT 'URL地址',
    weight     FLOAT         NOT NULL COMMENT '权重分数',
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    PRIMARY KEY (keyword(255), url(255)),
    INDEX idx_keyword (keyword),
    INDEX idx_weight (weight),
    INDEX idx_url (url(255))
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
  COMMENT='倒排索引表';
|
||||
|
||||
-- ------------------------------------------------------------
-- URL snippet table
-- Stores the title, description and body excerpt of each URL.
-- ------------------------------------------------------------
CREATE TABLE IF NOT EXISTS url_snippets (
    id           BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    url          VARCHAR(2048) NOT NULL COMMENT 'URL地址(唯一)',
    url_hash     VARCHAR(64)   NOT NULL COMMENT 'URL的MD5哈希(用于快速查询)',
    title        VARCHAR(512)  COMMENT '页面标题',
    description  TEXT          COMMENT 'meta description或自动生成的描述',
    text         MEDIUMTEXT    COMMENT '正文文本片段',
    timestamp    BIGINT        COMMENT '抓取时间戳',
    content_hash VARCHAR(64)   COMMENT '内容哈希(用于增量检测)',
    created_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at   TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间',
    UNIQUE KEY uk_url (url(255)),
    INDEX idx_url_hash (url_hash),
    INDEX idx_timestamp (timestamp)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
  COMMENT='URL摘要缓存表';
|
||||
|
||||
-- ------------------------------------------------------------
-- Site information table
-- Stores per-host metadata.
-- ------------------------------------------------------------
CREATE TABLE IF NOT EXISTS site_info (
    host            VARCHAR(255) PRIMARY KEY COMMENT '域名/主机名',
    visit_count     INT UNSIGNED DEFAULT 0 COMMENT '访问次数',
    last_visit_time BIGINT   COMMENT '最后访问时间戳',
    fingerprint     TEXT     COMMENT '网站指纹',
    success_rate    FLOAT    COMMENT '成功率',
    html_structure  TEXT     COMMENT 'HTML结构特征',
    ips             JSON     COMMENT 'IP地址列表',
    quality         FLOAT    COMMENT '质量评分',
    https_available TINYINT  COMMENT '是否支持HTTPS(1=是,0=否)',
    keywords        JSON     COMMENT '高频关键词列表',
    out_links       JSON     COMMENT '出站链接列表',
    languages       JSON     COMMENT '语种分布',
    redirects       JSON     COMMENT '重定向链',
    server_types    JSON     COMMENT 'Server类型',
    created_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间',
    updated_at      TIMESTAMP DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP COMMENT '更新时间'
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
  COMMENT='网站元信息表';
|
||||
|
||||
-- ------------------------------------------------------------
-- Flush marker table
-- Records flush progress so an interrupted flush can be resumed.
-- ------------------------------------------------------------
CREATE TABLE IF NOT EXISTS flush_marker (
    id              BIGINT UNSIGNED AUTO_INCREMENT PRIMARY KEY,
    flush_type      VARCHAR(50) NOT NULL COMMENT '刷盘类型:index, gate, site',
    last_key        VARCHAR(255) COMMENT '最后处理的key',
    last_cursor     BIGINT DEFAULT 0 COMMENT 'Redis SCAN游标',
    processed_count INT UNSIGNED DEFAULT 0 COMMENT '本批次处理数量',
    flush_time      TIMESTAMP DEFAULT CURRENT_TIMESTAMP COMMENT '刷盘时间',
    INDEX idx_type (flush_type),
    INDEX idx_time (flush_time)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
  COMMENT='刷盘进度记录表';
|
||||
|
||||
-- ------------------------------------------------------------
-- Priority URL table
-- Stores the URLs that should be crawled first.
-- ------------------------------------------------------------
CREATE TABLE IF NOT EXISTS priority_urls (
    url        VARCHAR(2048) NOT NULL,
    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    PRIMARY KEY (url(255)),
    INDEX idx_created (created_at)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci
  COMMENT='优先爬取URL表';
|
||||
+416
@@ -0,0 +1,416 @@
|
||||
// Package mysql 提供 MySQL 数据库连接和管理功能。
|
||||
// 支持 Unix Socket 和 TCP 两种连接方式,自动初始化数据表和恢复数据。
|
||||
package mysql
|
||||
|
||||
import (
|
||||
"context"
|
||||
"database/sql"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
goredis "github.com/redis/go-redis/v9"
|
||||
_ "github.com/go-sql-driver/mysql"
|
||||
"sese-engine/config"
|
||||
)
|
||||
|
||||
// DB is the package-wide MySQL connection pool. It is nil until Open has connected and pinged the server successfully.
|
||||
var DB *sql.DB
|
||||
|
||||
// Open 初始化 MySQL 连接
|
||||
// 根据配置自动选择 Unix Socket 或 TCP 连接
|
||||
func Open() error {
|
||||
dsn := config.MySQLDSN()
|
||||
|
||||
db, err := sql.Open("mysql", dsn)
|
||||
if err != nil {
|
||||
return fmt.Errorf("mysql.Open: %w", err)
|
||||
}
|
||||
|
||||
// 配置连接池
|
||||
db.SetConnMaxLifetime(time.Duration(config.MySQLConnMaxLifetime()) * time.Second)
|
||||
db.SetMaxIdleConns(config.MySQLMaxIdleConns())
|
||||
db.SetMaxOpenConns(config.MySQLMaxOpenConns())
|
||||
|
||||
// 验证连接
|
||||
if err := db.Ping(); err != nil {
|
||||
return fmt.Errorf("mysql.Ping: %w", err)
|
||||
}
|
||||
|
||||
DB = db
|
||||
log.Printf("[mysql] connected via %s", formatDSN(dsn))
|
||||
|
||||
// 自动初始化数据表
|
||||
if err := initSchema(); err != nil {
|
||||
return fmt.Errorf("mysql init schema: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// initSchema 自动执行 init_db.sql 初始化数据表
|
||||
func initSchema() error {
|
||||
// 查找 init_db.sql 文件
|
||||
execPath, err := os.Executable()
|
||||
if err != nil {
|
||||
execPath = os.Args[0]
|
||||
}
|
||||
sqlFile := filepath.Join(filepath.Dir(execPath), "mysql", "init_db.sql")
|
||||
if _, err := os.Stat(sqlFile); os.IsNotExist(err) {
|
||||
// 尝试从当前工作目录查找
|
||||
cwd, _ := os.Getwd()
|
||||
sqlFile = filepath.Join(cwd, "mysql", "init_db.sql")
|
||||
}
|
||||
|
||||
data, err := os.ReadFile(sqlFile)
|
||||
if err != nil {
|
||||
return fmt.Errorf("read init_db.sql: %w", err)
|
||||
}
|
||||
|
||||
// 获取配置的数据库名
|
||||
dbName := config.Global.MySQL.Database
|
||||
if dbName == "" {
|
||||
dbName = "sese_engine"
|
||||
}
|
||||
log.Printf("[mysql] init schema: database=%s", dbName)
|
||||
|
||||
// 先切换到目标数据库
|
||||
if _, err := DB.Exec("USE " + dbName); err != nil {
|
||||
return fmt.Errorf("mysql USE database: %w", err)
|
||||
}
|
||||
|
||||
// 分割 SQL 语句(按分号分割)
|
||||
statements := splitStatements(string(data))
|
||||
log.Printf("[mysql] found %d SQL statements to execute", len(statements))
|
||||
|
||||
execed := 0
|
||||
for i, stmt := range statements {
|
||||
trimmed := strings.TrimSpace(stmt)
|
||||
// 跳过空行和注释
|
||||
if trimmed == "" || strings.HasPrefix(trimmed, "--") || strings.HasPrefix(trimmed, "/*") {
|
||||
log.Printf("[mysql] [%d/%d] SKIP (empty/comment): %s", i+1, len(statements), truncate(trimmed, 60))
|
||||
continue
|
||||
}
|
||||
if _, err := DB.Exec(trimmed); err != nil {
|
||||
log.Printf("[mysql] [%d/%d] FAILED: %v\n SQL: %s", i+1, len(statements), err, truncate(trimmed, 200))
|
||||
continue
|
||||
}
|
||||
execed++
|
||||
log.Printf("[mysql] [%d/%d] OK: %s", i+1, len(statements), truncate(trimmed, 60))
|
||||
}
|
||||
|
||||
log.Printf("[mysql] init schema done, executed=%d statements", execed)
|
||||
return nil
|
||||
}
|
||||
|
||||
// splitStatements cuts an SQL script into statements, working line by line.
//
// Lines starting with "--" or "//" and blank lines are dropped. A line that
// contains "/*" opens a block comment and is dropped as a whole, as is every
// following line up to and including the one that contains "*/". The
// remaining lines are accumulated until the accumulated text ends with ";".
// Text left over at the end without a ";" is returned as a final statement.
func splitStatements(sql string) []string {
	var (
		out     []string
		pending strings.Builder
		inBlock bool
	)

	for _, raw := range strings.Split(sql, "\n") {
		line := strings.TrimSpace(raw)

		// Single-line comments.
		if strings.HasPrefix(line, "--") || strings.HasPrefix(line, "//") {
			continue
		}

		// Block comments: the opening line, the body and the closing line are all skipped.
		if strings.Contains(line, "/*") {
			inBlock = true
		}
		if inBlock {
			inBlock = !strings.Contains(line, "*/")
			continue
		}

		if line == "" {
			continue
		}

		pending.WriteString(raw)
		pending.WriteString("\n")

		// A trailing semicolon completes the statement collected so far.
		if stmt := strings.TrimSpace(pending.String()); strings.HasSuffix(stmt, ";") {
			out = append(out, stmt)
			pending.Reset()
		}
	}

	// Keep a final statement that has no terminating semicolon.
	if rest := strings.TrimSpace(pending.String()); rest != "" {
		out = append(out, rest)
	}

	return out
}
|
||||
|
||||
// truncate shortens s to at most maxLen bytes and appends "..." when anything
// was cut off. Strings that already fit are returned unchanged.
//
// The cut is moved back to the start of a UTF-8 sequence so a multi-byte
// character is never split in half. A negative maxLen is treated as 0.
func truncate(s string, maxLen int) string {
	if len(s) <= maxLen {
		return s
	}
	cut := maxLen
	if cut < 0 {
		cut = 0
	}
	// Bytes of the form 10xxxxxx are UTF-8 continuation bytes; step back
	// until the cut sits on the first byte of a character.
	for cut > 0 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	return s[:cut] + "..."
}
|
||||
|
||||
// Close 关闭 MySQL 连接
|
||||
func Close() error {
|
||||
if DB != nil {
|
||||
return DB.Close()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// Ping 检查 MySQL 连接是否正常
|
||||
func Ping() error {
|
||||
if DB == nil {
|
||||
return fmt.Errorf("mysql not initialized")
|
||||
}
|
||||
return DB.Ping()
|
||||
}
|
||||
|
||||
// formatDSN 格式化 DSN 用于日志(隐藏密码)
|
||||
func formatDSN(dsn string) string {
|
||||
// 简化日志输出
|
||||
cfg := config.Global.MySQL
|
||||
if cfg.UnixSocket != "" {
|
||||
return fmt.Sprintf("unix_socket=%s database=%s", cfg.UnixSocket, cfg.Database)
|
||||
}
|
||||
return fmt.Sprintf("tcp=%s:%d database=%s", cfg.Host, cfg.Port, cfg.Database)
|
||||
}
|
||||
|
||||
// RestoreFromMySQLToRedis 从 MySQL 恢复数据到 Redis
|
||||
// 用于 Redis 数据丢失后重建索引
|
||||
func RestoreFromMySQLToRedis(redisDB *goredis.Client) error {
|
||||
if DB == nil {
|
||||
return fmt.Errorf("mysql not initialized")
|
||||
}
|
||||
|
||||
start := time.Now()
|
||||
log.Printf("[mysql-restore] starting restoration from MySQL to Redis...")
|
||||
|
||||
ctx := context.Background()
|
||||
|
||||
// 1. 恢复 index_entries → Redis idx:* ZSet
|
||||
if err := restoreIndexEntries(ctx, redisDB); err != nil {
|
||||
return fmt.Errorf("restore index_entries: %w", err)
|
||||
}
|
||||
|
||||
// 2. 恢复 url_snippets → Redis gate:* + url2hash:*
|
||||
if err := restoreUrlSnippets(ctx, redisDB); err != nil {
|
||||
return fmt.Errorf("restore url_snippets: %w", err)
|
||||
}
|
||||
|
||||
// 3. 恢复 site_info → Redis site:*
|
||||
if err := restoreSiteInfo(ctx, redisDB); err != nil {
|
||||
return fmt.Errorf("restore site_info: %w", err)
|
||||
}
|
||||
|
||||
// 4. 恢复 priority_urls → Redis priority:*
|
||||
if err := restorePriorityURLs(ctx, redisDB); err != nil {
|
||||
return fmt.Errorf("restore priority_urls: %w", err)
|
||||
}
|
||||
|
||||
log.Printf("[mysql-restore] restoration completed in %v", time.Since(start))
|
||||
return nil
|
||||
}
|
||||
|
||||
// restoreIndexEntries 恢复倒排索引
|
||||
func restoreIndexEntries(ctx context.Context, redisDB *goredis.Client) error {
|
||||
rows, err := DB.Query("SELECT keyword, url, weight FROM index_entries")
|
||||
if err != nil {
|
||||
// 表不存在时跳过
|
||||
log.Printf("[mysql-restore][index] skip: %v", err)
|
||||
return nil
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
// 按 keyword 分组
|
||||
type indexRow struct {
|
||||
URL string
|
||||
Weight float32
|
||||
}
|
||||
keywordMap := make(map[string][]indexRow)
|
||||
count := 0
|
||||
|
||||
for rows.Next() {
|
||||
var keyword, url string
|
||||
var weight float32
|
||||
if err := rows.Scan(&keyword, &url, &weight); err != nil {
|
||||
continue
|
||||
}
|
||||
keywordMap[keyword] = append(keywordMap[keyword], indexRow{URL: url, Weight: weight})
|
||||
count++
|
||||
}
|
||||
|
||||
// 批量写入 Redis
|
||||
for keyword, entries := range keywordMap {
|
||||
if len(entries) == 0 {
|
||||
continue
|
||||
}
|
||||
zSlice := make([]goredis.Z, len(entries))
|
||||
for i, e := range entries {
|
||||
zSlice[i] = goredis.Z{Score: float64(e.Weight), Member: e.URL}
|
||||
}
|
||||
if err := redisDB.ZAdd(ctx, "idx:"+keyword, zSlice...).Err(); err != nil {
|
||||
log.Printf("[mysql-restore][index] failed to restore %s: %v", keyword, err)
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("[mysql-restore][index] restored %d entries (%d keywords)", count, len(keywordMap))
|
||||
return nil
|
||||
}
|
||||
|
||||
// restoreUrlSnippets 恢复 URL 摘要
|
||||
func restoreUrlSnippets(ctx context.Context, redisDB *goredis.Client) error {
|
||||
rows, err := DB.Query("SELECT url, url_hash, title, description, text, timestamp, content_hash FROM url_snippets")
|
||||
if err != nil {
|
||||
// 表不存在时跳过
|
||||
log.Printf("[mysql-restore][snippets] skip: %v", err)
|
||||
return nil
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
count := 0
|
||||
for rows.Next() {
|
||||
var url, urlHash, title, description, text, contentHash sql.NullString
|
||||
var timestamp sql.NullInt64
|
||||
if err := rows.Scan(&url, &urlHash, &title, &description, &text, ×tamp, &contentHash); err != nil {
|
||||
continue
|
||||
}
|
||||
if !url.Valid || urlHash.Valid == false {
|
||||
continue
|
||||
}
|
||||
|
||||
fields := map[string]interface{}{
|
||||
"url": url.String,
|
||||
"title": nullString(title),
|
||||
"desc": nullString(description),
|
||||
"text": nullString(text),
|
||||
"ts": nullInt64(timestamp),
|
||||
"hash": nullString(contentHash),
|
||||
}
|
||||
|
||||
if err := redisDB.HMSet(ctx, "gate:"+urlHash.String, fields).Err(); err != nil {
|
||||
continue
|
||||
}
|
||||
// 同时写入 URL→hash 映射
|
||||
redisDB.Set(ctx, "url2hash:"+url.String, urlHash.String, 0)
|
||||
count++
|
||||
}
|
||||
|
||||
log.Printf("[mysql-restore][snippets] restored %d entries", count)
|
||||
return nil
|
||||
}
|
||||
|
||||
// restoreSiteInfo 恢复网站信息
|
||||
func restoreSiteInfo(ctx context.Context, redisDB *goredis.Client) error {
|
||||
rows, err := DB.Query("SELECT host, visit_count, last_visit_time, success_rate, https_available FROM site_info")
|
||||
if err != nil {
|
||||
// 表不存在时跳过
|
||||
log.Printf("[mysql-restore][site] skip: %v", err)
|
||||
return nil
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
count := 0
|
||||
for rows.Next() {
|
||||
var host string
|
||||
var visitCount sql.NullInt64
|
||||
var lastVisitTime sql.NullInt64
|
||||
var successRate sql.NullFloat64
|
||||
var httpsAvailable sql.NullInt64
|
||||
|
||||
if err := rows.Scan(&host, &visitCount, &lastVisitTime, &successRate, &httpsAvailable); err != nil {
|
||||
continue
|
||||
}
|
||||
if host == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
fields := map[string]interface{}{
|
||||
"visit_count": nullInt64(visitCount),
|
||||
"last_visit_time": nullInt64(lastVisitTime),
|
||||
}
|
||||
if successRate.Valid {
|
||||
fields["success_rate"] = successRate.Float64
|
||||
}
|
||||
if httpsAvailable.Valid {
|
||||
fields["https_available"] = httpsAvailable.Int64
|
||||
}
|
||||
|
||||
if err := redisDB.HMSet(ctx, "site:"+host, fields).Err(); err != nil {
|
||||
continue
|
||||
}
|
||||
count++
|
||||
}
|
||||
|
||||
log.Printf("[mysql-restore][site] restored %d entries", count)
|
||||
return nil
|
||||
}
|
||||
|
||||
// restorePriorityURLs 恢复优先 URL
|
||||
func restorePriorityURLs(ctx context.Context, redisDB *goredis.Client) error {
|
||||
rows, err := DB.Query("SELECT url FROM priority_urls")
|
||||
if err != nil {
|
||||
// 表不存在时跳过
|
||||
log.Printf("[mysql-restore][priority] skip: %v", err)
|
||||
return nil
|
||||
}
|
||||
defer rows.Close()
|
||||
|
||||
count := 0
|
||||
for rows.Next() {
|
||||
var url string
|
||||
if err := rows.Scan(&url); err != nil {
|
||||
continue
|
||||
}
|
||||
if url == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
fields := map[string]interface{}{
|
||||
"url": url,
|
||||
"is_domain": "0",
|
||||
"added_at": time.Now().Unix(),
|
||||
"visited": "0",
|
||||
}
|
||||
|
||||
if err := redisDB.HMSet(ctx, "priority:"+url, fields).Err(); err != nil {
|
||||
continue
|
||||
}
|
||||
count++
|
||||
}
|
||||
|
||||
log.Printf("[mysql-restore][priority] restored %d entries", count)
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---- 辅助函数 ----
|
||||
|
||||
// nullString returns the string held by v, or "" when v is SQL NULL.
func nullString(v sql.NullString) string {
	if !v.Valid {
		return ""
	}
	return v.String
}
|
||||
|
||||
// nullInt64 returns the integer held by v, or 0 when v is SQL NULL.
func nullInt64(v sql.NullInt64) int64 {
	if !v.Valid {
		return 0
	}
	return v.Int64
}
|
||||
Reference in New Issue
Block a user