修改成redis
This commit is contained in:
@@ -0,0 +1,651 @@
|
||||
// Package storage provides unified storage interface backed by Redis + MySQL.
|
||||
// Redis is used as the primary high-performance store, MySQL is used for persistence.
|
||||
package storage
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/md5"
|
||||
"fmt"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
goredis "github.com/redis/go-redis/v9"
|
||||
extredis "sese-engine/redis"
|
||||
)
|
||||
|
||||
// PriorityEntry 记录一条待优先爬取的 URL 或域名。
|
||||
type PriorityEntry struct {
|
||||
URL string `json:"url"` // 用户提交的 URL 或域名(会自动规范化为带 scheme 的 URL)
|
||||
IsDomain bool `json:"domain"` // 是否为纯域名(true=仅域名,false=完整 URL)
|
||||
AddedAt int64 `json:"added_at"` // 添加时的 Unix 时间戳
|
||||
Visited bool `json:"visited"` // 是否已爬取(crawler 爬完后标记)
|
||||
}
|
||||
|
||||
// Store 统一存储接口,所有数据操作都通过此接口。
|
||||
type Store interface {
|
||||
// 倒排索引操作
|
||||
GetIndex(keyword string) ([]IndexEntry, error)
|
||||
BatchSetIndex(batch map[string][]IndexEntry) error
|
||||
ForEachIndex(fn func(keyword string, entries []IndexEntry) error) error
|
||||
|
||||
// URL 摘要操作
|
||||
GetSnippet(url string) (*SnippetEntry, error)
|
||||
SetSnippet(url string, entry *SnippetEntry) error
|
||||
ForEachSnippet(fn func(url string, entry *SnippetEntry) error) error
|
||||
|
||||
// 网站信息操作
|
||||
GetSiteInfo(host string) (*SiteInfo, error)
|
||||
SetSiteInfo(host string, info *SiteInfo) error
|
||||
UpdateSiteInfo(host string, fn func(*SiteInfo)) error
|
||||
ForEachSite(fn func(host string, info *SiteInfo) error) error
|
||||
|
||||
// Priority URL 操作
|
||||
GetPriorityURLs() ([]PriorityEntry, error)
|
||||
AddPriorityURL(entry PriorityEntry) error
|
||||
RemovePriorityURL(url string) error
|
||||
MarkPriorityURLVisited(url string) error
|
||||
ClearVisitedPriorityURLs() error
|
||||
|
||||
// 生命周期
|
||||
Close() error
|
||||
}
|
||||
|
||||
// RedisStore 实现了 Store 接口,提供 Redis 存储能力。
|
||||
type RedisStoreV2 struct {
|
||||
client *goredis.Client
|
||||
|
||||
// 内存索引聚合(用于写入)
|
||||
mem map[string][]IndexEntry
|
||||
memMu sync.RWMutex
|
||||
rowCount int64
|
||||
|
||||
// 站点信息内存缓存
|
||||
siteCache map[string]*SiteInfo
|
||||
siteCacheMu sync.RWMutex
|
||||
}
|
||||
|
||||
// NewRedisStoreV2 创建新的 Redis 存储实例
|
||||
func NewRedisStoreV2() *RedisStoreV2 {
|
||||
return &RedisStoreV2{
|
||||
mem: make(map[string][]IndexEntry),
|
||||
siteCache: make(map[string]*SiteInfo),
|
||||
}
|
||||
}
|
||||
|
||||
// Init 初始化 Redis 存储
|
||||
func (r *RedisStoreV2) Init() error {
|
||||
if extredis.Client == nil {
|
||||
return fmt.Errorf("redis not initialized, call redis.Open() first")
|
||||
}
|
||||
r.client = extredis.Client
|
||||
return nil
|
||||
}
|
||||
|
||||
// Close 关闭存储
|
||||
func (r *RedisStoreV2) Close() error {
|
||||
// 将内存数据刷到 Redis
|
||||
if err := r.FlushMemToRedis(); err != nil {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---- 倒排索引操作 ----
|
||||
|
||||
// GetIndex 获取关键词的倒排索引
|
||||
func (r *RedisStoreV2) GetIndex(keyword string) ([]IndexEntry, error) {
|
||||
ctx := context.Background()
|
||||
|
||||
entries, err := r.client.ZRevRangeWithScores(ctx, "idx:"+keyword, 0, -1).Result()
|
||||
if err != nil {
|
||||
if err == goredis.Nil {
|
||||
return nil, nil
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
|
||||
result := make([]IndexEntry, 0, len(entries))
|
||||
for _, e := range entries {
|
||||
result = append(result, IndexEntry{
|
||||
Weight: float32(e.Score),
|
||||
URL: e.Member.(string),
|
||||
})
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// BatchSetIndex 批量设置倒排索引
|
||||
func (r *RedisStoreV2) BatchSetIndex(batch map[string][]IndexEntry) error {
|
||||
ctx := context.Background()
|
||||
|
||||
for keyword, entries := range batch {
|
||||
if len(entries) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
// 先删除旧的
|
||||
r.client.Del(ctx, "idx:"+keyword)
|
||||
|
||||
// 添加新的
|
||||
if len(entries) > 0 {
|
||||
zSlice := make([]goredis.Z, len(entries))
|
||||
for i, e := range entries {
|
||||
zSlice[i] = goredis.Z{
|
||||
Score: float64(e.Weight),
|
||||
Member: e.URL,
|
||||
}
|
||||
}
|
||||
if err := r.client.ZAdd(ctx, "idx:"+keyword, zSlice...).Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ForEachIndex 遍历所有倒排索引
|
||||
func (r *RedisStoreV2) ForEachIndex(fn func(keyword string, entries []IndexEntry) error) error {
|
||||
ctx := context.Background()
|
||||
var cursor uint64
|
||||
|
||||
for {
|
||||
keys, nextCursor, err := r.client.Scan(ctx, cursor, "idx:*", 1000).Result()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, key := range keys {
|
||||
keyword := key[4:] // 去掉 "idx:" 前缀
|
||||
entries, err := r.GetIndex(keyword)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
if err := fn(keyword, entries); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
cursor = nextCursor
|
||||
if cursor == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---- URL 摘要操作 ----
|
||||
|
||||
// GetSnippet 获取 URL 摘要
|
||||
func (r *RedisStoreV2) GetSnippet(url string) (*SnippetEntry, error) {
|
||||
ctx := context.Background()
|
||||
hash := urlHash(url)
|
||||
|
||||
data, err := r.client.HGetAll(ctx, "gate:"+hash).Result()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(data) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
return &SnippetEntry{
|
||||
Title: data["title"],
|
||||
Description: data["desc"],
|
||||
Text: data["text"],
|
||||
Timestamp: parseInt64(data["ts"]),
|
||||
ContentHash: data["hash"],
|
||||
}, nil
|
||||
}
|
||||
|
||||
// SetSnippet 设置 URL 摘要
|
||||
func (r *RedisStoreV2) SetSnippet(url string, entry *SnippetEntry) error {
|
||||
ctx := context.Background()
|
||||
hash := urlHash(url)
|
||||
|
||||
fields := map[string]interface{}{
|
||||
"url": url,
|
||||
"title": entry.Title,
|
||||
"desc": entry.Description,
|
||||
"text": entry.Text,
|
||||
"ts": entry.Timestamp,
|
||||
"hash": entry.ContentHash,
|
||||
}
|
||||
|
||||
err := r.client.HMSet(ctx, "gate:"+hash, fields).Err()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// 同时存储 URL→hash 的映射
|
||||
r.client.Set(ctx, "url2hash:"+url, hash, 0)
|
||||
return nil
|
||||
}
|
||||
|
||||
// ForEachSnippet 遍历所有 URL 摘要
|
||||
func (r *RedisStoreV2) ForEachSnippet(fn func(url string, entry *SnippetEntry) error) error {
|
||||
ctx := context.Background()
|
||||
var cursor uint64
|
||||
|
||||
for {
|
||||
keys, nextCursor, err := r.client.Scan(ctx, cursor, "gate:*", 1000).Result()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, key := range keys {
|
||||
data, err := r.client.HGetAll(ctx, key).Result()
|
||||
if err != nil || len(data) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
entry := &SnippetEntry{
|
||||
Title: data["title"],
|
||||
Description: data["desc"],
|
||||
Text: data["text"],
|
||||
Timestamp: parseInt64(data["ts"]),
|
||||
ContentHash: data["hash"],
|
||||
}
|
||||
|
||||
if err := fn(data["url"], entry); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
cursor = nextCursor
|
||||
if cursor == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---- 网站信息操作 ----
|
||||
|
||||
// GetSiteInfo 获取网站信息
|
||||
func (r *RedisStoreV2) GetSiteInfo(host string) (*SiteInfo, error) {
|
||||
// 先从内存缓存读取
|
||||
r.siteCacheMu.RLock()
|
||||
if info, ok := r.siteCache[host]; ok {
|
||||
r.siteCacheMu.RUnlock()
|
||||
return info, nil
|
||||
}
|
||||
r.siteCacheMu.RUnlock()
|
||||
|
||||
// 从 Redis 读取
|
||||
ctx := context.Background()
|
||||
data, err := r.client.HGetAll(ctx, "site:"+host).Result()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if len(data) == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
info := &SiteInfo{
|
||||
VisitCount: int(parseInt64(data["visit_count"])),
|
||||
LastVisitTime: parseInt64(data["last_visit_time"]),
|
||||
}
|
||||
|
||||
if v, ok := data["success_rate"]; ok {
|
||||
f := parseFloat64(v)
|
||||
info.SuccessRate = &f
|
||||
}
|
||||
if v, ok := data["https_available"]; ok {
|
||||
b := parseInt64(v) == 1
|
||||
info.HTTPSAvailable = &b
|
||||
}
|
||||
|
||||
// 回填缓存
|
||||
r.siteCacheMu.Lock()
|
||||
r.siteCache[host] = info
|
||||
r.siteCacheMu.Unlock()
|
||||
|
||||
return info, nil
|
||||
}
|
||||
|
||||
// SetSiteInfo 设置网站信息
|
||||
func (r *RedisStoreV2) SetSiteInfo(host string, info *SiteInfo) error {
|
||||
ctx := context.Background()
|
||||
|
||||
fields := map[string]interface{}{
|
||||
"visit_count": info.VisitCount,
|
||||
"last_visit_time": info.LastVisitTime,
|
||||
}
|
||||
|
||||
if info.SuccessRate != nil {
|
||||
fields["success_rate"] = *info.SuccessRate
|
||||
}
|
||||
if info.HTTPSAvailable != nil {
|
||||
if *info.HTTPSAvailable {
|
||||
fields["https_available"] = 1
|
||||
} else {
|
||||
fields["https_available"] = 0
|
||||
}
|
||||
}
|
||||
|
||||
err := r.client.HMSet(ctx, "site:"+host, fields).Err()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// 更新缓存
|
||||
r.siteCacheMu.Lock()
|
||||
r.siteCache[host] = info
|
||||
r.siteCacheMu.Unlock()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// UpdateSiteInfo 更新网站信息
|
||||
func (r *RedisStoreV2) UpdateSiteInfo(host string, fn func(*SiteInfo)) error {
|
||||
info, err := r.GetSiteInfo(host)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if info == nil {
|
||||
info = &SiteInfo{}
|
||||
}
|
||||
|
||||
// 调用更新函数
|
||||
fn(info)
|
||||
|
||||
return r.SetSiteInfo(host, info)
|
||||
}
|
||||
|
||||
// ForEachSite 遍历所有网站信息
|
||||
func (r *RedisStoreV2) ForEachSite(fn func(host string, info *SiteInfo) error) error {
|
||||
ctx := context.Background()
|
||||
var cursor uint64
|
||||
|
||||
for {
|
||||
keys, nextCursor, err := r.client.Scan(ctx, cursor, "site:*", 1000).Result()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, key := range keys {
|
||||
host := key[5:] // 去掉 "site:" 前缀
|
||||
info, err := r.GetSiteInfo(host)
|
||||
if err != nil || info == nil {
|
||||
continue
|
||||
}
|
||||
if err := fn(host, info); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
cursor = nextCursor
|
||||
if cursor == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---- Priority URL 操作 ----
|
||||
|
||||
// GetPriorityURLs 获取所有未访问的 Priority URL
|
||||
func (r *RedisStoreV2) GetPriorityURLs() ([]PriorityEntry, error) {
|
||||
ctx := context.Background()
|
||||
|
||||
keys, err := r.client.Keys(ctx, "priority:*").Result()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var entries []PriorityEntry
|
||||
for _, key := range keys {
|
||||
data, err := r.client.HGetAll(ctx, key).Result()
|
||||
if err != nil || len(data) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
visited := data["visited"] == "1"
|
||||
if visited {
|
||||
continue
|
||||
}
|
||||
|
||||
entry := PriorityEntry{
|
||||
URL: data["url"],
|
||||
IsDomain: data["is_domain"] == "1",
|
||||
AddedAt: parseInt64(data["added_at"]),
|
||||
Visited: visited,
|
||||
}
|
||||
entries = append(entries, entry)
|
||||
}
|
||||
|
||||
return entries, nil
|
||||
}
|
||||
|
||||
// AddPriorityURL 添加 Priority URL
|
||||
func (r *RedisStoreV2) AddPriorityURL(entry PriorityEntry) error {
|
||||
ctx := context.Background()
|
||||
|
||||
fields := map[string]interface{}{
|
||||
"url": entry.URL,
|
||||
"is_domain": boolToStr(entry.IsDomain),
|
||||
"added_at": entry.AddedAt,
|
||||
"visited": boolToStr(entry.Visited),
|
||||
}
|
||||
|
||||
key := "priority:" + entry.URL
|
||||
return r.client.HMSet(ctx, key, fields).Err()
|
||||
}
|
||||
|
||||
// RemovePriorityURL 删除 Priority URL
|
||||
func (r *RedisStoreV2) RemovePriorityURL(url string) error {
|
||||
ctx := context.Background()
|
||||
return r.client.Del(ctx, "priority:"+url).Err()
|
||||
}
|
||||
|
||||
// MarkPriorityURLVisited 标记 Priority URL 为已访问
|
||||
func (r *RedisStoreV2) MarkPriorityURLVisited(url string) error {
|
||||
ctx := context.Background()
|
||||
return r.client.HSet(ctx, "priority:"+url, "visited", "1").Err()
|
||||
}
|
||||
|
||||
// ClearVisitedPriorityURLs 清除已访问的 Priority URL
|
||||
func (r *RedisStoreV2) ClearVisitedPriorityURLs() error {
|
||||
ctx := context.Background()
|
||||
|
||||
keys, err := r.client.Keys(ctx, "priority:*").Result()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, key := range keys {
|
||||
visited, _ := r.client.HGet(ctx, key, "visited").Result()
|
||||
if visited == "1" {
|
||||
r.client.Del(ctx, key)
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---- 内存索引操作(用于写入)----
|
||||
|
||||
// GetMemIndex 获取内存中的索引条目
|
||||
func (r *RedisStoreV2) GetMemIndex(keyword string) []IndexEntry {
|
||||
r.memMu.RLock()
|
||||
defer r.memMu.RUnlock()
|
||||
return r.mem[keyword]
|
||||
}
|
||||
|
||||
// SetMemIndex 设置内存中的索引条目
|
||||
func (r *RedisStoreV2) SetMemIndex(keyword string, entries []IndexEntry) {
|
||||
r.memMu.Lock()
|
||||
r.mem[keyword] = entries
|
||||
r.memMu.Unlock()
|
||||
}
|
||||
|
||||
// GetAllMemIndexes 获取所有内存索引
|
||||
func (r *RedisStoreV2) GetAllMemIndexes() map[string][]IndexEntry {
|
||||
r.memMu.RLock()
|
||||
defer r.memMu.RUnlock()
|
||||
|
||||
result := make(map[string][]IndexEntry, len(r.mem))
|
||||
for k, v := range r.mem {
|
||||
result[k] = v
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// GetRowCount 获取未刷盘的索引条目数
|
||||
func (r *RedisStoreV2) GetRowCount() int64 {
|
||||
return atomic.LoadInt64(&r.rowCount)
|
||||
}
|
||||
|
||||
// AddRowCount 增加未刷盘的索引条目计数
|
||||
func (r *RedisStoreV2) AddRowCount(delta int64) {
|
||||
atomic.AddInt64(&r.rowCount, delta)
|
||||
}
|
||||
|
||||
// SetRowCount 设置未刷盘的索引条目计数
|
||||
func (r *RedisStoreV2) SetRowCount(v int64) {
|
||||
atomic.StoreInt64(&r.rowCount, v)
|
||||
}
|
||||
|
||||
// FlushMemToRedis 将内存索引刷到 Redis
|
||||
func (r *RedisStoreV2) FlushMemToRedis() error {
|
||||
r.memMu.Lock()
|
||||
snapshot := r.mem
|
||||
r.mem = make(map[string][]IndexEntry)
|
||||
atomic.StoreInt64(&r.rowCount, 0)
|
||||
r.memMu.Unlock()
|
||||
|
||||
if len(snapshot) == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
// 合并内存和 Redis 数据,然后写回
|
||||
for keyword, memEntries := range snapshot {
|
||||
// 从 Redis 读取已有数据
|
||||
diskEntries, _ := r.GetIndex(keyword)
|
||||
|
||||
// 合并
|
||||
merged := mergeEntries(memEntries, diskEntries)
|
||||
|
||||
// 写回 Redis
|
||||
if err := r.setIndexEntries(keyword, merged); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *RedisStoreV2) setIndexEntries(keyword string, entries []IndexEntry) error {
|
||||
ctx := context.Background()
|
||||
|
||||
// 删除旧的
|
||||
r.client.Del(ctx, "idx:"+keyword)
|
||||
|
||||
if len(entries) > 0 {
|
||||
zSlice := make([]goredis.Z, len(entries))
|
||||
for i, e := range entries {
|
||||
zSlice[i] = goredis.Z{
|
||||
Score: float64(e.Weight),
|
||||
Member: e.URL,
|
||||
}
|
||||
}
|
||||
return r.client.ZAdd(ctx, "idx:"+keyword, zSlice...).Err()
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
func mergeEntries(newEntries, existingEntries []IndexEntry) []IndexEntry {
|
||||
seen := make(map[string]bool)
|
||||
var result []IndexEntry
|
||||
|
||||
// 先添加新条目
|
||||
for _, e := range newEntries {
|
||||
if !seen[e.URL] {
|
||||
result = append(result, e)
|
||||
seen[e.URL] = true
|
||||
}
|
||||
}
|
||||
|
||||
// 添加已有条目中不在新条目里的
|
||||
for _, e := range existingEntries {
|
||||
if !seen[e.URL] {
|
||||
result = append(result, e)
|
||||
seen[e.URL] = true
|
||||
}
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
// SiteCacheRefresh 从 Redis 刷新站点缓存
|
||||
func (r *RedisStoreV2) SiteCacheRefresh() error {
|
||||
ctx := context.Background()
|
||||
var cursor uint64
|
||||
|
||||
r.siteCacheMu.Lock()
|
||||
defer r.siteCacheMu.Unlock()
|
||||
|
||||
for {
|
||||
keys, nextCursor, err := r.client.Scan(ctx, cursor, "site:*", 1000).Result()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
for _, key := range keys {
|
||||
host := key[5:] // 去掉 "site:" 前缀
|
||||
data, err := r.client.HGetAll(ctx, key).Result()
|
||||
if err != nil || len(data) == 0 {
|
||||
continue
|
||||
}
|
||||
|
||||
info := &SiteInfo{
|
||||
VisitCount: int(parseInt64(data["visit_count"])),
|
||||
LastVisitTime: parseInt64(data["last_visit_time"]),
|
||||
}
|
||||
|
||||
if v, ok := data["success_rate"]; ok {
|
||||
f := parseFloat64(v)
|
||||
info.SuccessRate = &f
|
||||
}
|
||||
if v, ok := data["https_available"]; ok {
|
||||
b := parseInt64(v) == 1
|
||||
info.HTTPSAvailable = &b
|
||||
}
|
||||
|
||||
r.siteCache[host] = info
|
||||
}
|
||||
|
||||
cursor = nextCursor
|
||||
if cursor == 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// ---- 辅助函数 ----
|
||||
|
||||
func urlHash(url string) string {
|
||||
return fmt.Sprintf("%x", md5.Sum([]byte(url)))
|
||||
}
|
||||
|
||||
func parseInt64(s string) int64 {
|
||||
var v int64
|
||||
fmt.Sscanf(s, "%d", &v)
|
||||
return v
|
||||
}
|
||||
|
||||
func parseFloat64(s string) float64 {
|
||||
var v float64
|
||||
fmt.Sscanf(s, "%f", &v)
|
||||
return v
|
||||
}
|
||||
|
||||
func boolToStr(b bool) string {
|
||||
if b {
|
||||
return "1"
|
||||
}
|
||||
return "0"
|
||||
}
|
||||
Reference in New Issue
Block a user