前端显示关键词,加入停词表

This commit is contained in:
2026-04-10 20:41:41 +08:00
parent 217f6e7138
commit 69d3f37555
8 changed files with 220 additions and 19 deletions
+89 -8
View File
@@ -17,6 +17,46 @@ import (
"github.com/yanyiwu/gojieba" // GojiebaC++ 结巴分词的 Go 封装
)
// defaultStopWords is the built-in Chinese and English stop-word table.
// Every key is stored in lowercase; loadStopWords copies this table and then
// adds lowercased entries read from the optional JSON file.
// NOTE(review): lookups are only case-insensitive if callers lowercase the
// token before indexing this map — confirm that Normalize does so.
var defaultStopWords = map[string]bool{
// Chinese stop words
"的": true, "了": true, "在": true, "是": true, "我": true,
"有": true, "和": true, "就": true, "不": true, "人": true,
"都": true, "一": true, "一个": true, "上": true, "也": true,
"很": true, "到": true, "说": true, "要": true, "去": true,
"你": true, "会": true, "着": true, "没有": true, "看": true,
"好": true, "自己": true, "这": true, "那": true, "个": true,
"之": true, "与": true, "及": true, "或": true, "而": true,
"以": true, "为": true, "于": true, "被": true, "把": true,
"让": true, "给": true, "向": true, "从": true, "对": true,
"将": true, "地": true, "得": true, "过": true,
"吗": true, "呢": true, "吧": true, "啊": true, "哦": true,
"嗯": true, "唉": true, "哟": true, "嘿": true, "哈": true,
// English stop words
"a": true, "i": true, "the": true, "of": true, "to": true,
"and": true, "in": true, "is": true, "that": true, "for": true,
"it": true, "with": true, "as": true, "was": true, "on": true,
"by": true, "at": true, "be": true, "this": true, "have": true,
"from": true, "or": true, "one": true, "had": true, "word": true,
"but": true, "not": true, "what": true, "all": true, "were": true,
"we": true, "when": true, "your": true, "can": true, "said": true,
"there": true, "each": true, "which": true, "she": true, "do": true,
"how": true, "their": true, "if": true, "will": true, "up": true,
"other": true, "about": true, "out": true, "many": true, "then": true,
"them": true, "these": true, "so": true, "some": true, "her": true,
"would": true, "make": true, "like": true, "into": true, "him": true,
"has": true, "two": true, "more": true, "go": true, "no": true,
"way": true, "could": true, "my": true, "than": true, "first": true,
"been": true, "call": true, "who": true, "its": true, "now": true,
"find": true, "long": true, "down": true, "day": true, "did": true,
"get": true, "come": true, "made": true, "may": true, "part": true,
"an": true, "use": true, "his": true, "he": true, "also": true,
"after": true, "back": true, "only": true, "know": true, "take": true,
"year": true, "good": true, "just": true, "see": true, "over": true,
"think": true, "work": true, "life": true, "without": true, "through": true,
}
// Keyword 表示一个关键词及其权重。
type Keyword struct {
Word string `json:"word"` // 分词后的单词/词组
@@ -61,21 +101,26 @@ func (a *Analyzer) Close() {
// loadStopWords 从 JSON 文件加载屏蔽词列表到 map 中(O(1) 查找)。
// JSON 格式:字符串数组,如 ["", "。", "的", "了"]。
// 文件不存在或格式错误时返回空 map
// 文件不存在或格式错误时返回内置停用词表
func loadStopWords(path string) map[string]bool {
// 先复制内置停用词表
m := make(map[string]bool, len(defaultStopWords))
for k, v := range defaultStopWords {
m[k] = v
}
if path == "" {
return map[string]bool{}
return m
}
f, err := os.Open(path)
if err != nil {
return map[string]bool{}
return m
}
defer f.Close()
var words []string
if err := json.NewDecoder(f).Decode(&words); err != nil {
return map[string]bool{}
return m
}
m := make(map[string]bool, len(words))
for _, w := range words {
m[strings.ToLower(w)] = true // 转为小写存储,大小写不敏感
}
@@ -131,6 +176,42 @@ func Normalize(s string) string {
return b.String()
}
// isValidKeyword reports whether word should be kept as a keyword.
//
// A word is rejected when any of the following holds:
//   - it is empty;
//   - it is present in stopWords (exact-match lookup; a nil map matches nothing);
//   - it is longer than 32 bytes. len counts UTF-8 bytes, not characters, so
//     the limit is 32 ASCII letters but only 10 three-byte CJK characters;
//   - it consists solely of ASCII digits;
//   - it is a single rune (one CJK character or one letter).
func isValidKeyword(word string, stopWords map[string]bool) bool {
	if word == "" || len(word) > 32 || stopWords[word] {
		return false
	}
	// One pass over the runes settles both remaining rules without
	// allocating a []rune just to count characters.
	runeCount := 0
	allDigits := true
	for _, r := range word {
		runeCount++
		if r < '0' || r > '9' {
			allDigits = false
		}
	}
	return runeCount >= 2 && !allDigits
}
// weightedTokens 对一段文本计算每个分词的 TF(词频)权重,返回 token→权重 map。
// w 为权重倍数(标题权重 1.0,描述权重 0.5,正文权重 1.0)。
func (a *Analyzer) weightedTokens(text string, w float32) map[string]float32 {
@@ -141,8 +222,8 @@ func (a *Analyzer) weightedTokens(text string, w float32) map[string]float32 {
counts := make(map[string]int)
for _, t := range tokens {
t = Normalize(t)
// 跳过空词、屏蔽词、超长词(超过 32 字符)
if t == "" || a.stopWords[t] || len(t) > 32 {
// 跳过无效关键词
if !isValidKeyword(t, a.stopWords) {
continue
}
counts[t]++
@@ -197,7 +278,7 @@ func (a *Analyzer) Segment(query string, searchMode bool) []string {
var result []string
for _, t := range tokens {
t = Normalize(t)
if t == "" || a.stopWords[t] || len(t) > 32 {
if !isValidKeyword(t, a.stopWords) {
continue
}
result = append(result, t)
-6
View File
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+6
View File
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+2 -2
View File
@@ -5,8 +5,8 @@
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>SESE 爬取管理</title>
<script type="module" crossorigin src="/assets/index-BZZkVGv2.js"></script>
<link rel="stylesheet" crossorigin href="/assets/index-DlSLDihv.css">
<script type="module" crossorigin src="/assets/index-CxvnbVf9.js"></script>
<link rel="stylesheet" crossorigin href="/assets/index-BlMKGOqe.css">
</head>
<body>
<div id="app"></div>
+120
View File
@@ -4,6 +4,7 @@ package search
import (
"container/heap" // 堆结构(域名交错排序)
"container/list" // 双向链表(LRU 缓存)
"encoding/json" // JSON 序列化(响应输出)
"fmt" // 错误格式化
"io" // 读取请求体
@@ -29,6 +30,74 @@ import (
"sese-engine/storage" // 持久化存储
)
// urlKeywordsCache is a fixed-capacity LRU cache mapping a URL to its keywords.
//
// Entries live in a doubly linked list ordered from least recently used
// (front) to most recently used (back); items indexes the list nodes by URL.
// Both Put and Get reorder the list, so every operation needs exclusive
// access. A plain Mutex is therefore used: a read lock could never be taken.
type urlKeywordsCache struct {
	mu      sync.Mutex
	items   map[string]*list.Element // URL -> list node holding a *urlKeywordsEntry
	order   *list.List               // front = least recently used, back = most recently used
	maxSize int                      // maximum number of entries retained
}

// urlKeywordsEntry is the value stored in each list node.
type urlKeywordsEntry struct {
	URL      string           // key, kept so eviction can delete the map entry
	Keywords []urlKeywordInfo // keywords recorded for the URL
}

// urlKeywordInfo is a single keyword together with its weight.
type urlKeywordInfo struct {
	Word   string  `json:"word"`   // keyword text
	Weight float32 `json:"weight"` // keyword weight
}

// newURLKeywordsCache returns an empty cache that retains at most maxSize
// entries. With maxSize <= 0 every entry is evicted as soon as it is added,
// so the cache never retains anything.
func newURLKeywordsCache(maxSize int) *urlKeywordsCache {
	return &urlKeywordsCache{
		items:   make(map[string]*list.Element),
		order:   list.New(),
		maxSize: maxSize,
	}
}

// Put stores keywords for url and marks the entry as most recently used.
// If url is already cached its keywords are replaced. When adding a new
// entry pushes the cache over maxSize, the least recently used entry is
// evicted. The slice is stored as-is, without copying.
func (c *urlKeywordsCache) Put(url string, keywords []urlKeywordInfo) {
	c.mu.Lock()
	defer c.mu.Unlock()

	if elem, ok := c.items[url]; ok {
		// Existing entry: refresh recency and overwrite the payload.
		c.order.MoveToBack(elem)
		elem.Value.(*urlKeywordsEntry).Keywords = keywords
		return
	}

	c.items[url] = c.order.PushBack(&urlKeywordsEntry{URL: url, Keywords: keywords})

	if c.order.Len() <= c.maxSize {
		return
	}
	// Over capacity: drop the least recently used entry from list and index.
	if oldest := c.order.Front(); oldest != nil {
		evicted := c.order.Remove(oldest).(*urlKeywordsEntry)
		delete(c.items, evicted.URL)
	}
}

// Get returns the keywords cached for url and whether an entry was found.
// A hit marks the entry as most recently used. The returned slice is the
// one held by the cache, not a copy.
func (c *urlKeywordsCache) Get(url string) ([]urlKeywordInfo, bool) {
	c.mu.Lock()
	defer c.mu.Unlock()

	elem, ok := c.items[url]
	if !ok {
		return nil, false
	}
	c.order.MoveToBack(elem)
	return elem.Value.(*urlKeywordsEntry).Keywords, true
}
// Server 是搜索 HTTP 服务器,同时内嵌收获服务(统一在同一端口)。
type Server struct {
db *storage.DB
@@ -57,6 +126,9 @@ type Server struct {
recentCacheMu sync.RWMutex
recentTotal int // 总条目数(不截取)
// urlKeywords URL→关键词 LRU 缓存(用于最近爬取页面显示关键词)
urlKeywords *urlKeywordsCache
// backlinkRunner 反向链接计算器(可为 nil,仅用于 admin 手动触发)
backlinkRunner interface {
Status() map[string]interface{}
@@ -80,6 +152,7 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
httpCli: &http.Client{
Timeout: time.Duration(config.OnlineSnippetTimeout()) * time.Second,
},
urlKeywords: newURLKeywordsCache(10000), // 缓存 1 万个 URL 的关键词
}
// 启动定期刷盘 goroutine
go s.runPeriodicFlush()
@@ -135,6 +208,7 @@ func (s *Server) Handler() http.Handler {
mux.HandleFunc("/admin/workers", s.handleAdminWorkers)
mux.HandleFunc("/admin/backlink", s.handleAdminBacklink)
mux.HandleFunc("/admin/crawl/status", s.handleAdminCrawlStatus)
mux.HandleFunc("/admin/url/keywords", s.handleUrlKeywords)
// 静态文件(SPA fallback
mux.Handle("/", spaHandler{dist: "dist"})
return mux
@@ -738,6 +812,41 @@ func (s *Server) handleAdminCrawlStatus(w http.ResponseWriter, r *http.Request)
json.NewEncoder(w).Encode(s.crawler.GetCrawlStatus())
}
// handleUrlKeywords returns the keywords cached for the URL given in the
// "url" query parameter, looked up in the s.urlKeywords LRU cache.
//
// Responses (always application/json; charset=utf-8):
//   - OPTIONS: 204 with no body;
//   - GET with a "url" parameter: 200 {"url": ..., "keywords": [...], "cached": bool},
//     where "keywords" is an empty array (never null) on a cache miss;
//   - GET without "url": 400 {"error":"missing url param"};
//   - any other method: 405 {"error":"method not allowed"} plus an Allow header.
func (s *Server) handleUrlKeywords(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Access-Control-Allow-Origin", "*")
	w.Header().Set("Content-Type", "application/json; charset=utf-8")

	// fail writes a JSON error body directly. http.Error is avoided because
	// it overwrites Content-Type with text/plain, mislabelling the JSON body.
	fail := func(status int, body string) {
		w.WriteHeader(status)
		w.Write([]byte(body + "\n"))
	}

	switch r.Method {
	case http.MethodGet:
		// handled below
	case http.MethodOptions:
		w.WriteHeader(http.StatusNoContent)
		return
	default:
		w.Header().Set("Allow", "GET, OPTIONS")
		fail(http.StatusMethodNotAllowed, `{"error":"method not allowed"}`)
		return
	}

	rawURL := r.URL.Query().Get("url")
	if rawURL == "" {
		fail(http.StatusBadRequest, `{"error":"missing url param"}`)
		return
	}

	keywords, ok := s.urlKeywords.Get(rawURL)
	if keywords == nil {
		// A nil slice would be encoded as JSON null; clients get [] instead.
		keywords = []urlKeywordInfo{}
	}

	resp := struct {
		URL      string           `json:"url"`
		Keywords []urlKeywordInfo `json:"keywords"`
		Cached   bool             `json:"cached"`
	}{
		URL:      rawURL,
		Keywords: keywords,
		Cached:   ok,
	}

	// Marshal before writing so an encoding failure (e.g. a NaN weight) can
	// still be reported with a proper status instead of an empty 200.
	body, err := json.Marshal(resp)
	if err != nil {
		fail(http.StatusInternalServerError, `{"error":"encode failed"}`)
		return
	}
	w.Write(append(body, '\n'))
}
// ---- 搜索处理器 ----
// searchResponse 是搜索 API 的 JSON 响应结构。
@@ -1653,6 +1762,17 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold()) {
go s.Flush()
}
// 更新 URL→关键词 LRU 缓存
keywords := make([]urlKeywordInfo, len(payload.Keywords))
for i, kw := range payload.Keywords {
keywords[i] = urlKeywordInfo{
Word: kw.Word,
Weight: kw.Weight,
}
}
s.urlKeywords.Put(payload.URL, keywords)
w.Write([]byte("ok"))
}