前端显示关键词,加入停词表
This commit is contained in:
@@ -4,6 +4,7 @@ package search
|
||||
|
||||
import (
|
||||
"container/heap" // 堆结构(域名交错排序)
|
||||
"container/list" // 双向链表(LRU 缓存)
|
||||
"encoding/json" // JSON 序列化(响应输出)
|
||||
"fmt" // 错误格式化
|
||||
"io" // 读取请求体
|
||||
@@ -29,6 +30,74 @@ import (
|
||||
"sese-engine/storage" // 持久化存储
|
||||
)
|
||||
|
||||
// urlKeywordsCache is a fixed-capacity LRU cache that maps a page URL to the
// keywords recorded for it. All methods may be called concurrently.
type urlKeywordsCache struct {
	// mu guards items and order. A plain Mutex is used (not RWMutex) because
	// even Get reorders the recency list, so no critical section is read-only.
	mu      sync.Mutex
	items   map[string]*list.Element // URL -> its node in order
	order   *list.List               // recency list: front is the oldest entry, back the most recently used
	maxSize int                      // maximum number of entries kept; <= 0 means nothing is retained
}

// urlKeywordsEntry is the value stored in each node of the recency list.
type urlKeywordsEntry struct {
	URL      string           // key of the entry; needed to delete it from the map on eviction
	Keywords []urlKeywordInfo // keywords recorded for URL
}

// urlKeywordInfo describes a single keyword and its weight.
type urlKeywordInfo struct {
	Word   string  `json:"word"`   // the keyword
	Weight float32 `json:"weight"` // the keyword's weight
}

// newURLKeywordsCache returns an empty LRU cache that keeps at most maxSize
// entries. With maxSize <= 0 the cache never retains anything.
func newURLKeywordsCache(maxSize int) *urlKeywordsCache {
	return &urlKeywordsCache{
		items:   make(map[string]*list.Element),
		order:   list.New(),
		maxSize: maxSize,
	}
}

// Put inserts or replaces the keywords for url and marks the entry as the most
// recently used. When the insertion pushes the cache over maxSize, the least
// recently used entry is evicted.
//
// The keywords slice is stored as-is (it is not copied), so the caller must
// not modify it after the call.
func (c *urlKeywordsCache) Put(url string, keywords []urlKeywordInfo) {
	c.mu.Lock()
	defer c.mu.Unlock()

	// A non-positive capacity can never hold an entry; skip the
	// insert-then-evict round trip entirely.
	if c.maxSize <= 0 {
		return
	}

	// Existing key: replace the payload and refresh its recency.
	if elem, ok := c.items[url]; ok {
		elem.Value.(*urlKeywordsEntry).Keywords = keywords
		c.order.MoveToBack(elem)
		return
	}

	// New key: append at the back (most recent).
	c.items[url] = c.order.PushBack(&urlKeywordsEntry{URL: url, Keywords: keywords})

	// Over capacity: drop the front (least recently used) entry.
	if c.order.Len() > c.maxSize {
		if oldest := c.order.Front(); oldest != nil {
			evicted := c.order.Remove(oldest).(*urlKeywordsEntry)
			delete(c.items, evicted.URL)
		}
	}
}

// Get returns the keywords cached for url and whether the entry was present.
// A hit also marks the entry as the most recently used.
//
// The returned slice is the one held by the cache (not a copy); callers must
// treat it as read-only.
func (c *urlKeywordsCache) Get(url string) ([]urlKeywordInfo, bool) {
	c.mu.Lock()
	defer c.mu.Unlock()

	elem, ok := c.items[url]
	if !ok {
		return nil, false
	}
	c.order.MoveToBack(elem) // accessed: move to the most-recent end
	return elem.Value.(*urlKeywordsEntry).Keywords, true
}
|
||||
|
||||
// Server 是搜索 HTTP 服务器,同时内嵌收获服务(统一在同一端口)。
|
||||
type Server struct {
|
||||
db *storage.DB
|
||||
@@ -57,6 +126,9 @@ type Server struct {
|
||||
recentCacheMu sync.RWMutex
|
||||
recentTotal int // 总条目数(不截取)
|
||||
|
||||
// urlKeywords URL→关键词 LRU 缓存(用于最近爬取页面显示关键词)
|
||||
urlKeywords *urlKeywordsCache
|
||||
|
||||
// backlinkRunner 反向链接计算器(可为 nil,仅用于 admin 手动触发)
|
||||
backlinkRunner interface {
|
||||
Status() map[string]interface{}
|
||||
@@ -80,6 +152,7 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
||||
httpCli: &http.Client{
|
||||
Timeout: time.Duration(config.OnlineSnippetTimeout()) * time.Second,
|
||||
},
|
||||
urlKeywords: newURLKeywordsCache(10000), // 缓存 1 万个 URL 的关键词
|
||||
}
|
||||
// 启动定期刷盘 goroutine
|
||||
go s.runPeriodicFlush()
|
||||
@@ -135,6 +208,7 @@ func (s *Server) Handler() http.Handler {
|
||||
mux.HandleFunc("/admin/workers", s.handleAdminWorkers)
|
||||
mux.HandleFunc("/admin/backlink", s.handleAdminBacklink)
|
||||
mux.HandleFunc("/admin/crawl/status", s.handleAdminCrawlStatus)
|
||||
mux.HandleFunc("/admin/url/keywords", s.handleUrlKeywords)
|
||||
// 静态文件(SPA fallback)
|
||||
mux.Handle("/", spaHandler{dist: "dist"})
|
||||
return mux
|
||||
@@ -738,6 +812,41 @@ func (s *Server) handleAdminCrawlStatus(w http.ResponseWriter, r *http.Request)
|
||||
json.NewEncoder(w).Encode(s.crawler.GetCrawlStatus())
|
||||
}
|
||||
|
||||
// handleUrlKeywords 返回指定 URL 的关键词列表(LRU 缓存查询)
|
||||
func (s *Server) handleUrlKeywords(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Access-Control-Allow-Origin", "*")
|
||||
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||
|
||||
if r.Method != http.MethodGet && r.Method != http.MethodOptions {
|
||||
http.Error(w, `{"error":"method not allowed"}`, http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
if r.Method == http.MethodOptions {
|
||||
w.WriteHeader(http.StatusNoContent)
|
||||
return
|
||||
}
|
||||
|
||||
url := r.URL.Query().Get("url")
|
||||
if url == "" {
|
||||
http.Error(w, `{"error":"missing url param"}`, http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
keywords, ok := s.urlKeywords.Get(url)
|
||||
|
||||
resp := struct {
|
||||
URL string `json:"url"`
|
||||
Keywords []urlKeywordInfo `json:"keywords"`
|
||||
Cached bool `json:"cached"`
|
||||
}{
|
||||
URL: url,
|
||||
Keywords: keywords,
|
||||
Cached: ok,
|
||||
}
|
||||
|
||||
json.NewEncoder(w).Encode(resp)
|
||||
}
|
||||
|
||||
// ---- 搜索处理器 ----
|
||||
|
||||
// searchResponse 是搜索 API 的 JSON 响应结构。
|
||||
@@ -1653,6 +1762,17 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
|
||||
if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold()) {
|
||||
go s.Flush()
|
||||
}
|
||||
|
||||
// 更新 URL→关键词 LRU 缓存
|
||||
keywords := make([]urlKeywordInfo, len(payload.Keywords))
|
||||
for i, kw := range payload.Keywords {
|
||||
keywords[i] = urlKeywordInfo{
|
||||
Word: kw.Word,
|
||||
Weight: kw.Weight,
|
||||
}
|
||||
}
|
||||
s.urlKeywords.Put(payload.URL, keywords)
|
||||
|
||||
w.Write([]byte("ok"))
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user