前端显示关键词,加入停词表

This commit is contained in:
2026-04-10 20:41:41 +08:00
parent 217f6e7138
commit 69d3f37555
8 changed files with 220 additions and 19 deletions
+120
View File
@@ -4,6 +4,7 @@ package search
import (
"container/heap" // 堆结构(域名交错排序)
"container/list" // 双向链表(LRU 缓存)
"encoding/json" // JSON 序列化(响应输出)
"fmt" // 错误格式化
"io" // 读取请求体
@@ -29,6 +30,74 @@ import (
"sese-engine/storage" // 持久化存储
)
// urlKeywordsCache is a size-bounded URL → keywords cache with
// least-recently-used eviction: Put and Get move the touched entry to the
// back of order, and Put drops the front entry once maxSize is exceeded.
type urlKeywordsCache struct {
	// mu guards items and order.
	// NOTE(review): the methods visible here only call Lock/Unlock (Get
	// reorders the list, so it cannot use RLock); a plain sync.Mutex would
	// probably suffice — confirm no other caller uses RLock.
	mu sync.RWMutex
	items map[string]*list.Element // URL → its node in order
	order *list.List // recency list: front is the oldest entry, back the most recently used
	// maxSize is the entry count above which Put evicts the oldest entry.
	maxSize int
}
// urlKeywordsEntry is the value stored in each element of the cache's
// recency list.
type urlKeywordsEntry struct {
	URL string // cache key, kept here so eviction can delete the matching map entry
	Keywords []urlKeywordInfo // keyword list cached for URL
}
// urlKeywordInfo describes a single keyword of a page and is serialized
// to JSON with the field names "word" and "weight".
type urlKeywordInfo struct {
	Word string `json:"word"` // the keyword text
	Weight float32 `json:"weight"` // the keyword's weight, copied as-is from the ingest payload
}
// newURLKeywordsCache returns an empty LRU cache that keeps at most
// maxSize URL → keywords entries.
func newURLKeywordsCache(maxSize int) *urlKeywordsCache {
	cache := new(urlKeywordsCache)
	cache.maxSize = maxSize
	cache.order = list.New()
	cache.items = map[string]*list.Element{}
	return cache
}
// Put stores keywords for url and marks the entry as most recently used.
// An existing entry has its keyword list replaced; a new entry is appended,
// after which the least recently used entry is evicted if the cache has
// grown beyond maxSize.
func (c *urlKeywordsCache) Put(url string, keywords []urlKeywordInfo) {
	c.mu.Lock()
	defer c.mu.Unlock()

	// Known URL: refresh its recency and swap in the new keyword list.
	node, found := c.items[url]
	if found {
		c.order.MoveToBack(node)
		node.Value.(*urlKeywordsEntry).Keywords = keywords
		return
	}

	// Unknown URL: append as the most recent entry and index it.
	c.items[url] = c.order.PushBack(&urlKeywordsEntry{URL: url, Keywords: keywords})

	// Still within capacity, nothing to evict.
	if c.order.Len() <= c.maxSize {
		return
	}

	// Over capacity: drop the front (oldest) entry from both structures.
	if victim := c.order.Front(); victim != nil {
		evicted := c.order.Remove(victim).(*urlKeywordsEntry)
		delete(c.items, evicted.URL)
	}
}
// Get returns the keywords cached for url and whether an entry was found.
// A hit also marks the entry as most recently used, which mutates the
// recency list — hence the exclusive lock rather than a read lock.
func (c *urlKeywordsCache) Get(url string) ([]urlKeywordInfo, bool) {
	c.mu.Lock()
	defer c.mu.Unlock()

	node, found := c.items[url]
	if !found {
		return nil, false
	}

	// Touch the entry so it is the last candidate for eviction.
	c.order.MoveToBack(node)
	return node.Value.(*urlKeywordsEntry).Keywords, true
}
// Server 是搜索 HTTP 服务器,同时内嵌收获服务(统一在同一端口)。
type Server struct {
db *storage.DB
@@ -57,6 +126,9 @@ type Server struct {
recentCacheMu sync.RWMutex
recentTotal int // 总条目数(不截取)
// urlKeywords URL→关键词 LRU 缓存(用于最近爬取页面显示关键词)
urlKeywords *urlKeywordsCache
// backlinkRunner 反向链接计算器(可为 nil,仅用于 admin 手动触发)
backlinkRunner interface {
Status() map[string]interface{}
@@ -80,6 +152,7 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
httpCli: &http.Client{
Timeout: time.Duration(config.OnlineSnippetTimeout()) * time.Second,
},
urlKeywords: newURLKeywordsCache(10000), // 缓存 1 万个 URL 的关键词
}
// 启动定期刷盘 goroutine
go s.runPeriodicFlush()
@@ -135,6 +208,7 @@ func (s *Server) Handler() http.Handler {
mux.HandleFunc("/admin/workers", s.handleAdminWorkers)
mux.HandleFunc("/admin/backlink", s.handleAdminBacklink)
mux.HandleFunc("/admin/crawl/status", s.handleAdminCrawlStatus)
mux.HandleFunc("/admin/url/keywords", s.handleUrlKeywords)
// 静态文件(SPA fallback)
mux.Handle("/", spaHandler{dist: "dist"})
return mux
@@ -738,6 +812,41 @@ func (s *Server) handleAdminCrawlStatus(w http.ResponseWriter, r *http.Request)
json.NewEncoder(w).Encode(s.crawler.GetCrawlStatus())
}
// handleUrlKeywords serves the keyword list cached for a URL.
//
// Request: GET with a non-empty "url" query parameter; OPTIONS is answered
// with 204 and no body.
//
// Responses (always Content-Type application/json, CORS open to any origin):
//   - 200 {"url": ..., "keywords": [...], "cached": bool}; "keywords" is
//     always a JSON array (empty on a cache miss) and "cached" reports
//     whether the URL was present in the LRU cache.
//   - 400 {"error":"missing url param"} when "url" is absent or empty.
//   - 405 {"error":"method not allowed"} plus an Allow header for any
//     method other than GET/OPTIONS.
//
// A successful lookup also refreshes the entry's recency in the cache.
func (s *Server) handleUrlKeywords(w http.ResponseWriter, r *http.Request) {
	hdr := w.Header()
	hdr.Set("Access-Control-Allow-Origin", "*")
	hdr.Set("Content-Type", "application/json; charset=utf-8")

	// fail sends a JSON error body. http.Error is deliberately not used: it
	// overwrites Content-Type with text/plain, which would mislabel the JSON
	// payload. The nosniff header and trailing newline that http.Error
	// produced are kept.
	fail := func(status int, body string) {
		hdr.Set("X-Content-Type-Options", "nosniff")
		w.WriteHeader(status)
		// A write error means the client went away; nothing left to do.
		_, _ = io.WriteString(w, body+"\n")
	}

	switch r.Method {
	case http.MethodGet:
		// Handled below.
	case http.MethodOptions:
		w.WriteHeader(http.StatusNoContent)
		return
	default:
		// A 405 response must advertise the supported methods.
		hdr.Set("Allow", "GET, OPTIONS")
		fail(http.StatusMethodNotAllowed, `{"error":"method not allowed"}`)
		return
	}

	rawURL := r.URL.Query().Get("url")
	if rawURL == "" {
		fail(http.StatusBadRequest, `{"error":"missing url param"}`)
		return
	}

	keywords, ok := s.urlKeywords.Get(rawURL)
	if keywords == nil {
		// A nil slice would encode as JSON null; keep the field an array so
		// clients can iterate it without a null check.
		keywords = []urlKeywordInfo{}
	}

	resp := struct {
		URL      string           `json:"url"`
		Keywords []urlKeywordInfo `json:"keywords"`
		Cached   bool             `json:"cached"`
	}{
		URL:      rawURL,
		Keywords: keywords,
		Cached:   ok,
	}
	// The status line is already committed once encoding starts, so an
	// encode/write failure cannot be reported to the client.
	_ = json.NewEncoder(w).Encode(resp)
}
// ---- 搜索处理器 ----
// searchResponse 是搜索 API 的 JSON 响应结构。
@@ -1653,6 +1762,17 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold()) {
go s.Flush()
}
// 更新 URL→关键词 LRU 缓存
keywords := make([]urlKeywordInfo, len(payload.Keywords))
for i, kw := range payload.Keywords {
keywords[i] = urlKeywordInfo{
Word: kw.Word,
Weight: kw.Weight,
}
}
s.urlKeywords.Put(payload.URL, keywords)
w.Write([]byte("ok"))
}