修改成redis
This commit is contained in:
+238
-80
@@ -5,6 +5,7 @@ package search
|
||||
import (
|
||||
"container/heap" // 堆结构(域名交错排序)
|
||||
"container/list" // 双向链表(LRU 缓存)
|
||||
"context" // 超时控制(Redis 操作)
|
||||
"encoding/json" // JSON 序列化(响应输出)
|
||||
"fmt" // 错误格式化
|
||||
"io" // 读取请求体
|
||||
@@ -24,27 +25,34 @@ import (
|
||||
"sync/atomic" // 原子操作(计数器)
|
||||
"time" // 时间戳
|
||||
|
||||
"sese-engine/analyzer" // 分词和语言检测
|
||||
"sese-engine/config" // 排序权重配置
|
||||
"sese-engine/crawler" // 爬虫(读取活跃线程数)
|
||||
"sese-engine/info" // info 服务
|
||||
"sese-engine/parser" // HTML 解析(在线摘要)
|
||||
"sese-engine/storage" // 持久化存储
|
||||
goredis "github.com/redis/go-redis/v9" // Redis 客户端
|
||||
"sese-engine/analyzer" // 分词和语言检测
|
||||
"sese-engine/config" // 排序权重配置
|
||||
"sese-engine/crawler" // 爬虫(读取活跃线程数)
|
||||
"sese-engine/info" // info 服务
|
||||
"sese-engine/parser" // HTML 解析(在线摘要)
|
||||
sredis "sese-engine/redis" // 本地 Redis 连接(用于访问 sredis.Client)
|
||||
"sese-engine/storage" // 持久化存储
|
||||
)
|
||||
|
||||
// urlKeywordsCache URL→关键词 的 LRU 缓存
|
||||
// urlKeywordsCache URL→关键词 的 LRU 缓存(内存)+ Redis 镜像(TTL 2小时)
|
||||
type urlKeywordsCache struct {
|
||||
mu sync.RWMutex
|
||||
items map[string]*list.Element // URL → list 节点
|
||||
order *list.List // 按访问时间排序的双向链表
|
||||
order *list.List // 按访问时间排序的双向链表
|
||||
maxSize int
|
||||
|
||||
// Redis 双写
|
||||
rdb *goredis.Client // Redis 客户端(懒初始化)
|
||||
redisKey string // Redis Hash key
|
||||
ttl time.Duration // TTL,默认 2 小时
|
||||
}
|
||||
|
||||
// urlKeywordsEntry LRU 缓存条目
|
||||
type urlKeywordsEntry struct {
|
||||
URL string // URL(用于删除时从 map 中移除)
|
||||
Title string // 页面标题(从 bbolt Snippet 缓存获取)
|
||||
Snippet string // 摘要(从 bbolt Snippet 缓存获取)
|
||||
Title string // 页面标题(从 Redis Snippet 缓存获取)
|
||||
Snippet string // 摘要(从 Redis Snippet 缓存获取)
|
||||
Keywords []urlKeywordInfo // 关键词列表
|
||||
}
|
||||
|
||||
@@ -54,102 +62,250 @@ type urlKeywordInfo struct {
|
||||
Weight float32 `json:"weight"` // 权重
|
||||
}
|
||||
|
||||
// newURLKeywordsCache 创建一个新的 LRU 缓存
|
||||
// newURLKeywordsCache 创建一个新的 LRU 缓存(Redis 镜像通过 AttachRedis 注入)
|
||||
func newURLKeywordsCache(maxSize int) *urlKeywordsCache {
|
||||
return &urlKeywordsCache{
|
||||
items: make(map[string]*list.Element),
|
||||
order: list.New(),
|
||||
maxSize: maxSize,
|
||||
items: make(map[string]*list.Element),
|
||||
order: list.New(),
|
||||
maxSize: maxSize,
|
||||
redisKey: "url_keywords:cache",
|
||||
ttl: 2 * time.Hour,
|
||||
}
|
||||
}
|
||||
|
||||
// Put 写入或更新缓存
|
||||
// AttachRedis 将 Redis 客户端注入缓存(用于双写)
|
||||
func (c *urlKeywordsCache) AttachRedis(rdb *goredis.Client) {
|
||||
c.rdb = rdb
|
||||
}
|
||||
|
||||
// Put 写入或更新缓存(内存 LRU + Redis 双写,2小时 TTL)
|
||||
func (c *urlKeywordsCache) Put(url string, title, snippet string, keywords []urlKeywordInfo) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
|
||||
var entry *urlKeywordsEntry
|
||||
|
||||
// 已存在,移到队尾(更新新鲜度)
|
||||
if elem, ok := c.items[url]; ok {
|
||||
c.order.MoveToBack(elem)
|
||||
entry := elem.Value.(*urlKeywordsEntry)
|
||||
entry.Keywords = keywords
|
||||
entry.Title = title
|
||||
entry.Snippet = snippet
|
||||
return
|
||||
}
|
||||
e := elem.Value.(*urlKeywordsEntry)
|
||||
e.Keywords = keywords
|
||||
e.Title = title
|
||||
e.Snippet = snippet
|
||||
entry = e
|
||||
} else {
|
||||
// 新增到队尾
|
||||
entry = &urlKeywordsEntry{URL: url, Title: title, Snippet: snippet, Keywords: keywords}
|
||||
elem := c.order.PushBack(entry)
|
||||
c.items[url] = elem
|
||||
|
||||
// 新增到队尾
|
||||
entry := &urlKeywordsEntry{URL: url, Title: title, Snippet: snippet, Keywords: keywords}
|
||||
elem := c.order.PushBack(entry)
|
||||
c.items[url] = elem
|
||||
|
||||
// 超过上限,删除队首(最旧)
|
||||
if c.order.Len() > c.maxSize {
|
||||
oldest := c.order.Front()
|
||||
if oldest != nil {
|
||||
c.order.Remove(oldest)
|
||||
delete(c.items, oldest.Value.(*urlKeywordsEntry).URL)
|
||||
// 超过上限,删除队首(最旧)
|
||||
if c.order.Len() > c.maxSize {
|
||||
oldest := c.order.Front()
|
||||
if oldest != nil {
|
||||
c.order.Remove(oldest)
|
||||
delete(c.items, oldest.Value.(*urlKeywordsEntry).URL)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Redis 双写(异步,不阻塞主流程)
|
||||
go c.redisPut(url, entry)
|
||||
}
|
||||
|
||||
// Get 读取缓存,同时更新新鲜度
|
||||
func (c *urlKeywordsCache) Get(url string) (*urlKeywordsEntry, bool) {
|
||||
c.mu.Lock()
|
||||
defer c.mu.Unlock()
|
||||
// redisPut 将条目写入 Redis Hash(TTL 2小时)
|
||||
func (c *urlKeywordsCache) redisPut(url string, entry *urlKeywordsEntry) {
|
||||
if c.rdb == nil {
|
||||
return
|
||||
}
|
||||
data, err := json.Marshal(entry)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
pipe := c.rdb.Pipeline()
|
||||
pipe.HSet(ctx, c.redisKey, url, string(data))
|
||||
pipe.Expire(ctx, c.redisKey, c.ttl)
|
||||
pipe.Exec(ctx)
|
||||
}
|
||||
|
||||
if elem, ok := c.items[url]; ok {
|
||||
c.order.MoveToBack(elem) // 访问过,移到队尾
|
||||
// Get 读取缓存(内存优先,Redis 回填)
|
||||
func (c *urlKeywordsCache) Get(url string) (*urlKeywordsEntry, bool) {
|
||||
// 先查内存
|
||||
c.mu.RLock()
|
||||
elem, ok := c.items[url]
|
||||
if ok {
|
||||
entry := elem.Value.(*urlKeywordsEntry)
|
||||
c.mu.RUnlock()
|
||||
// 异步更新 Redis TTL
|
||||
go c.touchRedisTTL(url)
|
||||
return entry, true
|
||||
}
|
||||
return nil, false
|
||||
c.mu.RUnlock()
|
||||
|
||||
// 内存 miss,查 Redis
|
||||
entry, ok := c.redisGet(url)
|
||||
if !ok {
|
||||
return nil, false
|
||||
}
|
||||
|
||||
// 回填内存(写锁)
|
||||
c.mu.Lock()
|
||||
// 二次检查(可能已有其他协程写入)
|
||||
if _, exists := c.items[url]; !exists {
|
||||
// 超过上限时,先删最旧的再加入
|
||||
if c.order.Len() >= c.maxSize {
|
||||
oldest := c.order.Front()
|
||||
if oldest != nil {
|
||||
c.order.Remove(oldest)
|
||||
delete(c.items, oldest.Value.(*urlKeywordsEntry).URL)
|
||||
}
|
||||
}
|
||||
elem := c.order.PushBack(entry)
|
||||
c.items[url] = elem
|
||||
}
|
||||
c.mu.Unlock()
|
||||
return entry, true
|
||||
}
|
||||
|
||||
// Stats 返回缓存统计信息
|
||||
// redisGet 从 Redis Hash 获取单条缓存
|
||||
func (c *urlKeywordsCache) redisGet(url string) (*urlKeywordsEntry, bool) {
|
||||
if c.rdb == nil {
|
||||
return nil, false
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
data, err := c.rdb.HGet(ctx, c.redisKey, url).Result()
|
||||
if err != nil {
|
||||
return nil, false
|
||||
}
|
||||
var entry urlKeywordsEntry
|
||||
if err := json.Unmarshal([]byte(data), &entry); err != nil {
|
||||
return nil, false
|
||||
}
|
||||
// 续命 Redis TTL
|
||||
go c.touchRedisTTL(url)
|
||||
return &entry, true
|
||||
}
|
||||
|
||||
// touchRedisTTL 续命 Redis 条目 TTL
|
||||
func (c *urlKeywordsCache) touchRedisTTL(url string) {
|
||||
if c.rdb == nil {
|
||||
return
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
c.rdb.Expire(ctx, c.redisKey, c.ttl)
|
||||
}
|
||||
|
||||
// Stats 返回缓存统计信息(优先 Redis 总数,包含 TTL 内的冷数据)
|
||||
func (c *urlKeywordsCache) Stats() (size int, maxSize int) {
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
return len(c.items), c.maxSize
|
||||
}
|
||||
memSize := len(c.items)
|
||||
maxSize = c.maxSize
|
||||
c.mu.RUnlock()
|
||||
|
||||
// ListAll 返回所有缓存条目列表(按访问时间从旧到新)
|
||||
func (c *urlKeywordsCache) ListAll() []*urlKeywordsEntry {
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
entries := make([]*urlKeywordsEntry, 0, len(c.items))
|
||||
for elem := c.order.Front(); elem != nil; elem = elem.Next() {
|
||||
entries = append(entries, elem.Value.(*urlKeywordsEntry))
|
||||
// 尝试获取 Redis 总数(更准确,包含未淘汰的冷数据)
|
||||
if c.rdb != nil {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
n, err := c.rdb.HLen(ctx, c.redisKey).Result()
|
||||
if err == nil && n > 0 {
|
||||
return int(n), maxSize
|
||||
}
|
||||
}
|
||||
return entries
|
||||
return memSize, maxSize
|
||||
}
|
||||
|
||||
// ListPage 返回分页缓存条目(按访问时间从新到旧,最新访问的在前)
|
||||
func (c *urlKeywordsCache) ListPage(page, pageSize int) []*urlKeywordsEntry {
|
||||
// ListAll 返回所有缓存条目列表(内存 + Redis 冷数据合并)
|
||||
func (c *urlKeywordsCache) ListAll() []*urlKeywordsEntry {
|
||||
// 收集内存数据
|
||||
c.mu.RLock()
|
||||
defer c.mu.RUnlock()
|
||||
total := len(c.items)
|
||||
memEntries := make([]*urlKeywordsEntry, 0, len(c.items))
|
||||
for elem := c.order.Front(); elem != nil; elem = elem.Next() {
|
||||
memEntries = append(memEntries, elem.Value.(*urlKeywordsEntry))
|
||||
}
|
||||
c.mu.RUnlock()
|
||||
|
||||
// 尝试从 Redis 获取补充数据
|
||||
if c.rdb == nil {
|
||||
return memEntries
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
all, err := c.rdb.HGetAll(ctx, c.redisKey).Result()
|
||||
if err != nil || len(all) == 0 {
|
||||
return memEntries
|
||||
}
|
||||
|
||||
// 去重合并(内存优先)
|
||||
seen := make(map[string]bool, len(memEntries))
|
||||
for _, e := range memEntries {
|
||||
seen[e.URL] = true
|
||||
}
|
||||
for _, v := range all {
|
||||
var entry urlKeywordsEntry
|
||||
if err := json.Unmarshal([]byte(v), &entry); err != nil {
|
||||
continue
|
||||
}
|
||||
if !seen[entry.URL] {
|
||||
memEntries = append(memEntries, &entry)
|
||||
seen[entry.URL] = true
|
||||
}
|
||||
}
|
||||
return memEntries
|
||||
}
|
||||
|
||||
// ListPage 返回分页缓存条目(内存 + Redis 冷数据合并,按 URL 字典序排列)
|
||||
func (c *urlKeywordsCache) ListPage(page, pageSize int) []*urlKeywordsEntry {
|
||||
// 收集内存数据
|
||||
c.mu.RLock()
|
||||
memEntries := make([]*urlKeywordsEntry, 0, len(c.items))
|
||||
for elem := c.order.Front(); elem != nil; elem = elem.Next() {
|
||||
memEntries = append(memEntries, elem.Value.(*urlKeywordsEntry))
|
||||
}
|
||||
c.mu.RUnlock()
|
||||
|
||||
// 从 Redis 补充冷数据
|
||||
if c.rdb != nil {
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
||||
defer cancel()
|
||||
all, err := c.rdb.HGetAll(ctx, c.redisKey).Result()
|
||||
if err == nil && len(all) > 0 {
|
||||
seen := make(map[string]bool, len(memEntries))
|
||||
for _, e := range memEntries {
|
||||
seen[e.URL] = true
|
||||
}
|
||||
for _, v := range all {
|
||||
var entry urlKeywordsEntry
|
||||
if err := json.Unmarshal([]byte(v), &entry); err != nil {
|
||||
continue
|
||||
}
|
||||
if !seen[entry.URL] {
|
||||
memEntries = append(memEntries, &entry)
|
||||
seen[entry.URL] = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 分页(从合并数据中按字典序取 page 页)
|
||||
total := len(memEntries)
|
||||
offset := (page - 1) * pageSize
|
||||
if offset >= total {
|
||||
return []*urlKeywordsEntry{}
|
||||
}
|
||||
// 从 Back(最新)向前遍历,跳过 offset 个
|
||||
elem := c.order.Back()
|
||||
for i := 0; i < offset && elem != nil; i++ {
|
||||
elem = elem.Prev()
|
||||
end := offset + pageSize
|
||||
if end > total {
|
||||
end = total
|
||||
}
|
||||
// 收集 pageSize 条
|
||||
entries := make([]*urlKeywordsEntry, 0, pageSize)
|
||||
for i := 0; i < pageSize && elem != nil; i++ {
|
||||
entries = append(entries, elem.Value.(*urlKeywordsEntry))
|
||||
elem = elem.Prev()
|
||||
}
|
||||
return entries
|
||||
return memEntries[offset:end]
|
||||
}
|
||||
|
||||
// Server 是搜索 HTTP 服务器,同时内嵌收获服务(统一在同一端口)。
|
||||
type Server struct {
|
||||
db *storage.DB
|
||||
db *storage.RedisStoreV2
|
||||
infoSvc *info.Service
|
||||
analyzer *analyzer.Analyzer
|
||||
httpCli *http.Client // 在线摘要抓取(无 robots.txt 检查)
|
||||
@@ -166,11 +322,11 @@ type Server struct {
|
||||
indexCacheMu sync.RWMutex
|
||||
indexCacheHits int64 // 缓存命中计数(原子)
|
||||
|
||||
// stats 快照缓存:后台定时刷新,避免每次请求全量遍历 bbolt
|
||||
// stats 快照缓存:后台定时刷新,避免每次请求全量遍历 Redis
|
||||
statsCache map[string]any
|
||||
statsCacheMu sync.RWMutex
|
||||
|
||||
// recent 快照缓存:后台定时刷新,避免每次请求全量遍历 bbolt
|
||||
// recent 快照缓存:后台定时刷新,避免每次请求全量遍历 Redis
|
||||
recentCache map[int][]recentItem // limit → 预截取的结果列表
|
||||
recentCacheMu sync.RWMutex
|
||||
recentTotal int // 总条目数(不截取)
|
||||
@@ -192,7 +348,7 @@ type Server struct {
|
||||
}
|
||||
|
||||
// New 创建一个 search Server(内嵌收获服务,统一在同一端口)。
|
||||
func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
||||
func New(db *storage.RedisStoreV2, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
||||
s := &Server{
|
||||
db: db,
|
||||
infoSvc: infoSvc,
|
||||
@@ -203,6 +359,8 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
||||
},
|
||||
urlKeywords: newURLKeywordsCache(10000), // 缓存 1 万个 URL 的关键词
|
||||
}
|
||||
// 注入 Redis 客户端到 urlKeywords 缓存(用于双写,2小时 TTL)
|
||||
s.urlKeywords.AttachRedis(sredis.Client)
|
||||
// 启动定期刷盘 goroutine
|
||||
go s.runPeriodicFlush()
|
||||
// 启动 stats + recent 缓存定期刷新 goroutine
|
||||
@@ -383,7 +541,7 @@ type recentItem struct {
|
||||
}
|
||||
|
||||
// handleAdminRecent 返回最近爬取的条目列表,按爬取时间倒序。
|
||||
// 直接返回缓存快照,不阻塞 bbolt,响应时间 <1ms。
|
||||
// 直接返回缓存快照,不阻塞 Redis,响应时间 <1ms。
|
||||
// 参数:limit(默认50,最大200)。
|
||||
func (s *Server) handleAdminRecent(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Access-Control-Allow-Origin", "*")
|
||||
@@ -419,7 +577,7 @@ func (s *Server) handleAdminRecent(w http.ResponseWriter, r *http.Request) {
|
||||
})
|
||||
}
|
||||
|
||||
// refreshRecentCache 全量遍历 bbolt 计算 recent 快照,预截取常用 limit,存入 recentCache。
|
||||
// refreshRecentCache 全量遍历 Redis 计算 recent 快照,预截取常用 limit,存入 recentCache。
|
||||
func (s *Server) refreshRecentCache() {
|
||||
type entry struct {
|
||||
url string
|
||||
@@ -495,7 +653,7 @@ func (s *Server) refreshRecentCache() {
|
||||
}
|
||||
|
||||
// handleAdminStats 返回全局统计:域名分布、语种分布、总 URL 数、总词数。
|
||||
// 直接返回缓存快照,不阻塞 bbolt,响应时间 <1ms。
|
||||
// 直接返回缓存快照,不阻塞 Redis,响应时间 <1ms。
|
||||
func (s *Server) handleAdminStats(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Access-Control-Allow-Origin", "*")
|
||||
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||
@@ -523,7 +681,7 @@ func (s *Server) handleAdminStats(w http.ResponseWriter, r *http.Request) {
|
||||
json.NewEncoder(w).Encode(cached)
|
||||
}
|
||||
|
||||
// refreshStatsCache 全量遍历 bbolt 计算统计快照,存入 statsCache。
|
||||
// refreshStatsCache 全量遍历 Redis 计算统计快照,存入 statsCache。
|
||||
func (s *Server) refreshStatsCache() {
|
||||
domainCount := make(map[string]int)
|
||||
langCount := make(map[string]int)
|
||||
@@ -556,7 +714,7 @@ func (s *Server) refreshStatsCache() {
|
||||
return nil
|
||||
})
|
||||
|
||||
// 遍历结束后批量查 SiteInfo(避免 ForEachSnippet 回调中嵌套 bbolt 事务)
|
||||
// 遍历结束后批量查 SiteInfo(避免 ForEachSnippet 回调中嵌套 Redis 事务)
|
||||
for _, domain := range snippetDomains {
|
||||
siteInfo, _ := s.db.GetSiteInfo(domain)
|
||||
if siteInfo != nil {
|
||||
@@ -616,7 +774,7 @@ func (s *Server) refreshStatsCache() {
|
||||
}
|
||||
|
||||
// runCacheRefresher 后台定时刷新 stats 和 recent 缓存。
|
||||
// 统一由一个 goroutine 交替刷新,避免同时全量遍历 bbolt 造成压力。
|
||||
// 统一由一个 goroutine 交替刷新,避免同时全量遍历 Redis 造成压力。
|
||||
func (s *Server) runCacheRefresher() {
|
||||
interval := time.Duration(config.StatsRefreshInterval()) * time.Second
|
||||
ticker := time.NewTicker(interval)
|
||||
@@ -1487,8 +1645,8 @@ func timeMul(si *storage.SiteInfo, now int64) float64 {
|
||||
}
|
||||
|
||||
// urlTimeMul 根据该 URL 的摘要抓取时间计算时间衰减倍数(30 天内不衰减)。
|
||||
func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 {
|
||||
entry, err := db.GetSnippet(rawURL)
|
||||
func urlTimeMul(store *storage.RedisStoreV2, rawURL string, now int64) float64 {
|
||||
entry, err := store.GetSnippet(rawURL)
|
||||
if err != nil || entry == nil {
|
||||
return 1.0
|
||||
}
|
||||
@@ -1941,7 +2099,7 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
|
||||
go s.Flush()
|
||||
}
|
||||
|
||||
// 更新 URL→关键词 LRU 缓存(从 bbolt 获取标题和摘要)
|
||||
// 更新 URL→关键词 LRU 缓存(从 Redis 获取标题和摘要)
|
||||
keywords := make([]urlKeywordInfo, len(payload.Keywords))
|
||||
for i, kw := range payload.Keywords {
|
||||
keywords[i] = urlKeywordInfo{
|
||||
@@ -1950,7 +2108,7 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
}
|
||||
title, snippet := "", ""
|
||||
if snippetEntry, err := s.db.GetSnippet(payload.URL); err == nil {
|
||||
if snippetEntry, err := s.db.GetSnippet(payload.URL); err == nil && snippetEntry != nil {
|
||||
title = snippetEntry.Title
|
||||
snippet = snippetEntry.Description
|
||||
if snippet == "" {
|
||||
|
||||
Reference in New Issue
Block a user