fix 分词bug，添加重爬机制

2026-04-10 00:18:07 +08:00
parent 7ab7db9b76
commit 530e2ebd9d
9 changed files with 208 additions and 34 deletions
@@ -265,12 +265,18 @@ func (s *Server) handleAdminStats(w http.ResponseWriter, r *http.Request) {
 	langCount := make(map[string]int)
 	totalWords := 0
 	total := 0
+	recrawlEligible := 0
+	now := time.Now().Unix()
+	maxAge := int64(config.RecrawlMaxAge())

 	s.db.ForEachSnippet(func(url string, snippet *storage.SnippetEntry) error {
 		total++
 		domain := netloc(url)
 		domainCount[domain]++
 		totalWords += len(snippet.Text)
+		if now-snippet.Timestamp >= maxAge {
+			recrawlEligible++
+		}

 		siteInfo, _ := s.db.GetSiteInfo(domain)
 		if siteInfo != nil {
@@ -315,12 +321,13 @@ func (s *Server) handleAdminStats(w http.ResponseWriter, r *http.Request) {
 	}

 	resp := map[string]any{
-		"total_urls":    total,
-		"total_words":   totalWords,
-		"total_domains": len(domainCount), // 真实的域名总数（非Top 20）
-		"domains":       domainsMap,
-		"languages":     langsMap,
-		"pending":       atomic.LoadInt64(&s.rowCount), // 内存中未刷盘的索引条目数
+		"total_urls":        total,
+		"total_words":       totalWords,
+		"total_domains":     len(domainCount), // 真实的域名总数（非Top 20）
+		"domains":           domainsMap,
+		"languages":         langsMap,
+		"pending":           atomic.LoadInt64(&s.rowCount), // 内存中未刷盘的索引条目数
+		"recrawl_eligible":  recrawlEligible,               // 已过期、可被重爬的 URL 数量
 	}

 	json.NewEncoder(w).Encode(resp)
@@ -575,7 +582,8 @@ func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
 		if m := siteRe.FindStringSubmatch(part); len(m) > 1 {
 			siteFilter = m[1] // site:example.com 提取目标主机名
 		} else {
-			segs := s.analyzer.Segment(part, false)
+			// 搜索模式分词（CutForSearch）：更细粒度，"气象局" → ["气象", "局", "气象局"]
+			segs := s.analyzer.Segment(part, true)
 			for _, t := range segs {
 				if !s.infoSvc.IsBlocked(t) { // 过滤屏蔽词
 					tokens = append(tokens, t)
@@ -584,6 +592,11 @@ func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
 		}
 	}

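+	// Substring de-duplication: when one token is a substring of another, only the longer token is kept
+	// (deduplicateSubstrings removes the shorter one; no sub-token relationship is recorded anywhere).
+	// Side effect: tokens come back sorted longest-first, so the 20-token cap below drops the shortest tokens.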
+	tokens = deduplicateSubstrings(tokens)
+
 	// 最多保留 20 个词（避免查询过于宽泛）
 	if len(tokens) > 20 {
 		tokens = tokens[:20]
@@ -691,10 +704,15 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
 	}

 	// 计算每个 URL 的相关性和初始分数
+	// 评分策略：部分匹配加权和 + 缺词软惩罚（替代原来的全词乘积）
+	// 全词乘积问题：一个 URL 只要缺少任何一个查询词，rel 就接近 0，
+	// 导致"气象局"拆成 ["气象局","局"] 后，只有"气象"没有"气象局"的页面被淹没。
+	missPenalty := config.MissPenalty()
 	candidates := make([]candidate, 0, len(urlWeights))
 	for u, vs := range urlWeights {
-		// 词权重相乘（贝叶斯概率近似），缺省权重填充缺失词
-		rel := 1.0
+		// 统计实际匹配的词数和权重总和
+		matchedCount := 0
+		sumWeight := 0.0
 		for _, ti := range tokenIndexes {
 			vp := vs[ti.token]
 			if vp == 0 {
@@ -703,8 +721,23 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
 			if vp > 0.06 {
 				vp = math.Log((vp-0.06)*40+1)/40 + 0.06
 			}
-			rel *= vp
+			sumWeight += vp
+			// Count a token as matched only when the URL's stored weight is > 0 (the raw map value, before any fill-in applied when vp == 0)
+			if vs[ti.token] > 0 {
+				matchedCount++
+			}
 		}
+		totalTokens := len(tokenIndexes)
+		// 部分匹配相关性 = 加权平均 × 匹配覆盖率加成
+		// matchRatio：匹配词占比，全部匹配=1，全部缺失=0
+		matchRatio := float64(matchedCount) / float64(totalTokens)
+		// avgWeight: mean of vp over ALL query tokens (sumWeight / totalTokens), not only over the matched ones
+		avgWeight := sumWeight / float64(totalTokens)
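+		// rel = avgWeight × (matchRatio + (1-matchRatio) × missPenalty)
+		// As coded, missPenalty is the share of credit retained for the unmatched fraction of tokens:
+		//   0 = unmatched fraction earns nothing (rel = avgWeight × matchRatio, the harshest penalty)
+		//   1 = no penalty at all (rel = avgWeight); NOTE(review): the name suggests the inverse — confirm whether (1-missPenalty) was intended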
+		rel := avgWeight * (matchRatio + (1-matchRatio)*missPenalty)
 		// 反向链接繁荣加分
 		prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight()
 		bad := badURL(u)
@@ -969,6 +1002,43 @@ func badURL(u string) float64 {
 	return math.Min(s, 0.9)
 }

+// deduplicateSubstrings drops exact duplicates and every token that is a substring of a longer kept token.
+// Example: ["气象", "局", "气象局"] → ["气象局"] (both "气象" and "局" are contained in "气象局").
+// tokens is sorted in place (byte length descending, ties lexicographic), so the caller's slice is reordered
+// and the result follows that order rather than query order. Inputs of length ≤ 1 are returned unchanged.
+func deduplicateSubstrings(tokens []string) []string {
+	if len(tokens) <= 1 {
+		return tokens
+	}
+	// Longest first (len counts bytes, not runes); equal lengths fall back to lexicographic order.
+	sort.Slice(tokens, func(i, j int) bool {
+		if len(tokens[i]) != len(tokens[j]) {
+			return len(tokens[i]) > len(tokens[j])
+		}
+		return tokens[i] < tokens[j]
+	})
+	seen := make(map[string]bool)
+	var result []string
+	for _, t := range tokens {
+		if seen[t] {
+			continue // exact duplicate of a token already visited
+		}
+		seen[t] = true
+		// Drop t when an already-kept token contains it; kept != t always holds here since duplicates were skipped above.
+		isSubstr := false
+		for _, kept := range result {
+			if strings.Contains(kept, t) && kept != t {
+				isSubstr = true
+				break
+			}
+		}
+		if !isSubstr {
+			result = append(result, t)
+		}
+	}
+	return result
+}
+
 // netloc 从 URL 提取主机名。
 func netloc(rawURL string) string {
 	parts := strings.SplitN(rawURL, "/", 4)