前端显示关键词,加入停用词表

This commit is contained in:
2026-04-10 20:41:41 +08:00
parent 217f6e7138
commit 69d3f37555
8 changed files with 220 additions and 19 deletions
+89 -8
View File
@@ -17,6 +17,46 @@ import (
"github.com/yanyiwu/gojieba" // GojiebaC++ 结巴分词的 Go 封装
)
// defaultStopWords is the built-in set of Chinese and English stop words.
// Every English entry is stored in lower case, so a lookup is only
// case-insensitive when the caller lower-cases the word before indexing.
var defaultStopWords = func() map[string]bool {
	words := []string{
		// Chinese stop words.
		"的", "了", "在", "是", "我",
		"有", "和", "就", "不", "人",
		"都", "一", "一个", "上", "也",
		"很", "到", "说", "要", "去",
		"你", "会", "着", "没有", "看",
		"好", "自己", "这", "那", "个",
		"之", "与", "及", "或", "而",
		"以", "为", "于", "被", "把",
		"让", "给", "向", "从", "对",
		"将", "地", "得", "过",
		"吗", "呢", "吧", "啊", "哦",
		"嗯", "唉", "哟", "嘿", "哈",
		// English stop words.
		"a", "i", "the", "of", "to",
		"and", "in", "is", "that", "for",
		"it", "with", "as", "was", "on",
		"by", "at", "be", "this", "have",
		"from", "or", "one", "had", "word",
		"but", "not", "what", "all", "were",
		"we", "when", "your", "can", "said",
		"there", "each", "which", "she", "do",
		"how", "their", "if", "will", "up",
		"other", "about", "out", "many", "then",
		"them", "these", "so", "some", "her",
		"would", "make", "like", "into", "him",
		"has", "two", "more", "go", "no",
		"way", "could", "my", "than", "first",
		"been", "call", "who", "its", "now",
		"find", "long", "down", "day", "did",
		"get", "come", "made", "may", "part",
		"an", "use", "his", "he", "also",
		"after", "back", "only", "know", "take",
		"year", "good", "just", "see", "over",
		"think", "work", "life", "without", "through",
	}

	// Turn the list into a set so membership tests are a single map lookup.
	set := make(map[string]bool, len(words))
	for _, w := range words {
		set[w] = true
	}
	return set
}()
// Keyword 表示一个关键词及其权重。
type Keyword struct {
Word string `json:"word"` // 分词后的单词/词组
@@ -61,21 +101,26 @@ func (a *Analyzer) Close() {
// loadStopWords 从 JSON 文件加载屏蔽词列表到 map 中(O(1) 查找)。
// JSON 格式:字符串数组,如 ["", "。", "的", "了"]。
// 文件不存在或格式错误时返回空 map
// 文件不存在或格式错误时返回内置停用词表
func loadStopWords(path string) map[string]bool {
// 先复制内置停用词表
m := make(map[string]bool, len(defaultStopWords))
for k, v := range defaultStopWords {
m[k] = v
}
if path == "" {
return map[string]bool{}
return m
}
f, err := os.Open(path)
if err != nil {
return map[string]bool{}
return m
}
defer f.Close()
var words []string
if err := json.NewDecoder(f).Decode(&words); err != nil {
return map[string]bool{}
return m
}
m := make(map[string]bool, len(words))
for _, w := range words {
m[strings.ToLower(w)] = true // 转为小写存储,大小写不敏感
}
@@ -131,6 +176,42 @@ func Normalize(s string) string {
return b.String()
}
// isValidKeyword reports whether word should be kept as a keyword.
//
// A word is rejected when any of the following holds:
//   - it is empty;
//   - it is present in stopWords (a nil map rejects nothing);
//   - its UTF-8 encoding is longer than 32 bytes (bytes, not characters);
//   - it consists solely of ASCII digits '0'-'9';
//   - it contains fewer than two runes (a single CJK character or letter).
//
// word is looked up in stopWords exactly as given; no case folding or other
// normalization happens here.
func isValidKeyword(word string, stopWords map[string]bool) bool {
	// Cheap rejections first: empty, over-long, or listed as a stop word.
	if word == "" || len(word) > 32 || stopWords[word] {
		return false
	}

	// One pass gathers both facts needed below: the rune count and whether
	// any rune falls outside the ASCII digit range.
	runeCount := 0
	allDigits := true
	for _, r := range word {
		runeCount++
		if r < '0' || r > '9' {
			allDigits = false
		}
	}

	// Keep only multi-rune words that are not purely numeric.
	return runeCount >= 2 && !allDigits
}
// weightedTokens 对一段文本计算每个分词的 TF(词频)权重,返回 token→权重 map。
// w 为权重倍数(标题权重 1.0,描述权重 0.5,正文权重 1.0)。
func (a *Analyzer) weightedTokens(text string, w float32) map[string]float32 {
@@ -141,8 +222,8 @@ func (a *Analyzer) weightedTokens(text string, w float32) map[string]float32 {
counts := make(map[string]int)
for _, t := range tokens {
t = Normalize(t)
// 跳过空词、屏蔽词、超长词(超过 32 字符)
if t == "" || a.stopWords[t] || len(t) > 32 {
// 跳过无效关键词
if !isValidKeyword(t, a.stopWords) {
continue
}
counts[t]++
@@ -197,7 +278,7 @@ func (a *Analyzer) Segment(query string, searchMode bool) []string {
var result []string
for _, t := range tokens {
t = Normalize(t)
if t == "" || a.stopWords[t] || len(t) > 32 {
if !isValidKeyword(t, a.stopWords) {
continue
}
result = append(result, t)