fix 分词bug，添加重爬机制

2026-04-10 00:18:07 +08:00
parent 7ab7db9b76
commit 530e2ebd9d
9 changed files with 208 additions and 34 deletions
@@ -44,6 +44,9 @@ type CrawlerConfig struct {
 	ExpectedProsperRatio float64 `yaml:"expected_prosper_ratio"`
 	EntryURL             string  `yaml:"entry_url"`
 	MaxPageSize          int     `yaml:"max_page_size"` // 单个页面最大抓取字节数（0=不限，默认 5MB）
+	RecrawlMaxAge        int     `yaml:"recrawl_max_age"`         // URL 过期时间（秒），超过此时间的 URL 允许被重爬，默认 30 天
+	RecrawlCheckInterval int     `yaml:"recrawl_check_interval"`  // 运行期间检查过期 URL 的间隔（秒），默认 1 小时
+	RecrawlBatchSize     int     `yaml:"recrawl_batch_size"`      // 每次检查最多释放多少个过期 URL，默认 500
 }

 // SearchConfig 搜索结果排序权重配置
@@ -56,6 +59,7 @@ type SearchConfig struct {
 	BacklinkWeight        float64 `yaml:"backlink_weight"`
 	ServerPort            int     `yaml:"server_port"`
 	FlushIntervalSeconds  int     `yaml:"flush_interval_seconds"`
+	MissPenalty           float64 `yaml:"miss_penalty"` // 缺词惩罚系数（0=不惩罚，1=完全忽略缺词URL），默认 0.15
 }

 // BacklinkConfig 反向链接计算相关配置
@@ -120,6 +124,9 @@ func GetDefaultConfig() Config {
 			ExpectedProsperRatio: 0.6,
 			EntryURL:             "https://zh.wikipedia.org/",
 			MaxPageSize:          5 * 1024 * 1024,
+			RecrawlMaxAge:        30 * 86400, // 30 天
+			RecrawlCheckInterval: 3600,       // 1 小时
+			RecrawlBatchSize:     500,
 		},
 		Search: SearchConfig{
 			UseOnlineSnippet:     true,
@@ -130,6 +137,7 @@ func GetDefaultConfig() Config {
 			BacklinkWeight:       1.0,
 			ServerPort:           50082,
 			FlushIntervalSeconds: 300,
+			MissPenalty:          0.15,
 		},
 		Backlink: BacklinkConfig{
 			Baseline: 200000,
@@ -208,6 +216,15 @@ func EntryURL() string { return Global.Crawler.EntryURL }
 // MaxPageSize returns the configured maximum number of bytes fetched for a
 // single page (Global.Crawler.MaxPageSize); 0 means no limit.
 func MaxPageSize() int { return Global.Crawler.MaxPageSize }

+// RecrawlMaxAge returns the URL expiry age in seconds (Global.Crawler.RecrawlMaxAge).
+// A URL whose last crawl is at least this old is allowed to be crawled again.
+func RecrawlMaxAge() int { return Global.Crawler.RecrawlMaxAge }
+
+// RecrawlCheckInterval returns the interval, in seconds, between runtime scans
+// for expired URLs (Global.Crawler.RecrawlCheckInterval).
+func RecrawlCheckInterval() int { return Global.Crawler.RecrawlCheckInterval }
+
+// RecrawlBatchSize returns the maximum number of expired URLs released by a
+// single scan (Global.Crawler.RecrawlBatchSize).
+func RecrawlBatchSize() int { return Global.Crawler.RecrawlBatchSize }
+
 // UseOnlineSnippet returns the Search.UseOnlineSnippet flag from the global config.
 func UseOnlineSnippet() bool { return Global.Search.UseOnlineSnippet }

@@ -232,6 +249,9 @@ func SearchServerPort() int { return Global.Search.ServerPort }
 // FlushIntervalSeconds returns the Search.FlushIntervalSeconds value from the global config.
 func FlushIntervalSeconds() int { return Global.Search.FlushIntervalSeconds }

+// MissPenalty returns the configured Search.MissPenalty coefficient (default 0.15).
+// NOTE(review): the query scoring introduced in this commit computes
+// rel = avgWeight * (matchRatio + (1-matchRatio)*missPenalty), so a LARGER value
+// softens the penalty for URLs that lack query terms (1 = no penalty,
+// 0 = harshest). The config comments describe the opposite direction; confirm
+// which one is intended.
+func MissPenalty() float64 { return Global.Search.MissPenalty }
+
 // BacklinkBaseline returns the Backlink.Baseline value from the global config.
 func BacklinkBaseline() int { return Global.Backlink.Baseline }

@@ -7,6 +7,7 @@ import (
 	"context"       // context 超时控制
 	"encoding/json" // JSON 序列化（发送关键词数据到收获服务）
 	"fmt"           // 格式化（构造目标地址）
+	"hash/fnv"      // FNV 哈希（内容变化检测）
 	"log"           // 日志输出
 	"math"          // 数学运算（指数衰减、质量评分）
 	"math/rand"     // 随机数（加权采样、队列打乱）
@@ -93,14 +94,58 @@ func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *C
 }

 // warmVisited walks every cached snippet via c.db.ForEachSnippet and marks its URL as visited.
+// URLs whose Timestamp is RecrawlMaxAge seconds old or more are left out of
+// visited, so they can be crawled again.
+// If RecrawlMaxAge is 0 or negative, no entry with a past timestamp passes the
+// freshness test, so every URL counts as expired.
+// NOTE(review): c.visited is written here without holding visitedMu — presumably
+// this runs before any worker goroutine starts; confirm against the caller.
 func (c *Crawler) warmVisited() {
 	count := 0
+	expired := 0
+	maxAge := int64(config.RecrawlMaxAge())
+	now := time.Now().Unix()
+	// The error returned by ForEachSnippet is discarded; the callback itself always returns nil.
 	_ = c.db.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
-		c.visited[u] = true
+		if now-entry.Timestamp < maxAge {
+			c.visited[u] = true // still fresh: keep skipping this URL
 			count++
+		} else {
+			expired++
+		}
 		return nil
 	})
-	log.Printf("[crawler] visited set warmed: %d URLs loaded", count)
+	log.Printf("[crawler] visited set warmed: %d active, %d expired (eligible for recrawl)", count, expired)
+}
+
+// startRecrawlTicker starts a background goroutine that periodically removes
+// expired URLs from the visited set, so that a later BFS round can discover and
+// crawl them again. It only deletes entries from c.visited; it does not enqueue
+// anything itself.
+//
+// Every RecrawlCheckInterval seconds it scans the snippet store and, for each
+// entry whose Timestamp is at least RecrawlMaxAge seconds old and that is still
+// marked visited, deletes it from c.visited — at most RecrawlBatchSize per tick.
+// A non-positive interval disables the ticker entirely. A non-positive batch
+// size skips the tick, because no URL could be released within that budget.
+// Max age and batch size are re-read from config on every tick.
+//
+// NOTE(review): the goroutine has no stop signal and lives until the process
+// exits; add a context or done channel if Crawler ever needs a clean shutdown.
+func (c *Crawler) startRecrawlTicker() {
+	interval := config.RecrawlCheckInterval()
+	if interval <= 0 {
+		return // not configured, or recrawl checks disabled
+	}
+	// Sentinel used only to stop the iteration early; created once, not per tick.
+	errBatchFull := fmt.Errorf("batch full")
+	go func() {
+		ticker := time.NewTicker(time.Duration(interval) * time.Second)
+		defer ticker.Stop()
+		for range ticker.C {
+			maxAge := int64(config.RecrawlMaxAge())
+			batchSize := config.RecrawlBatchSize()
+			if batchSize <= 0 {
+				continue // a zero budget can release nothing; skip the DB scan
+			}
+			now := time.Now().Unix()
+			removed := 0
+			batchFull := false
+
+			err := c.db.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
+				if removed >= batchSize {
+					batchFull = true
+					return errBatchFull // stop iterating: this tick's budget is used up
+				}
+				if now-entry.Timestamp < maxAge {
+					return nil // still fresh
+				}
+				// Lock per entry rather than around the whole scan, so crawler
+				// workers are not blocked for the duration of the DB iteration.
+				c.visitedMu.Lock()
+				if c.visited[u] {
+					delete(c.visited, u)
+					removed++
+				}
+				c.visitedMu.Unlock()
+				return nil
+			})
+			// The early-stop sentinel is expected; anything else is a real scan failure.
+			if err != nil && !batchFull {
+				log.Printf("[crawler] recrawl ticker: snippet scan failed: %v", err)
+			}
+
+			if removed > 0 {
+				log.Printf("[crawler] recrawl ticker: released %d expired URLs back to pool", removed)
+			}
+		}
+	}()
 }

 // markVisited 将 URL 标记为已访问（线程安全）。
@@ -154,6 +199,9 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
 	c.markVisited(entryURL)
 	queue := []string{entryURL}

+	// 启动后台重爬定时器：定期释放过期 URL 到候选池
+	c.startRecrawlTicker()
+
 	for ep := 0; ep < maxEpoch; ep++ {
 		// 每轮 epoch 从 config 读取最新 workers 值，支持运行时动态调整
 		workers := config.CrawlerWorkers()
@@ -263,6 +311,17 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
 	// 解析 HTML：提取标题、描述、正文和所有超链接
 	title, desc, text, hrefs := parser.ParseHTML(res.Body, res.FinalURL)

+	// 计算正文内容哈希（FNV-1a），用于增量重爬检测
+	contentHash := fnvHash(text)
+
+	// 增量重爬检测：查询上次爬取的哈希，内容未变则跳过关键词提取
+	isRecrawl := false
+	oldEntry, _ := c.db.GetSnippet(res.FinalURL)
+	if oldEntry != nil && oldEntry.ContentHash != "" && oldEntry.ContentHash == contentHash {
+		isRecrawl = true
+		log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL)
+	}
+
 	// 缓存 URL 摘要（仅对短 URL 缓存，防止超长 URL 浪费空间）
 	if len(res.FinalURL) < 250 {
 		_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
@@ -270,10 +329,13 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
 			Description: truncate(desc, 256),
 			Text:        truncate(text, 256),
 			Timestamp:   time.Now().Unix(),
+			ContentHash: contentHash,
 		})
 	}

 	// 关键词提取：将标题/描述/正文交给 analyzer 计算关键词权重
+	// 增量优化：如果内容未变化（重爬），跳过关键词提取和索引更新
+	if !isRecrawl {
 		kws := c.analyzer.Analyze(title, desc, text)
 		if len(kws) > 0 {
 			// 限制每个页面最多发送的关键词数量
@@ -285,6 +347,7 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
 			// 异步发送到收获服务器写入倒排索引（不阻塞爬取流程）
 			go c.sendToHarvester(res.FinalURL, kws)
 		}
+	}

 	// 更新网站元信息（成功访问）
 	host := netloc(res.FinalURL)
@@ -671,6 +734,14 @@ func truncate(s string, n int) string {
 	return s[:n]
 }

+// fnvHash returns the 128-bit FNV-1a hash of s, formatted as a lowercase
+// hexadecimal string. It is used during recrawls to detect whether a page's
+// body text has changed since the previous crawl.
+func fnvHash(s string) string {
+	h := fnv.New128a()
+	h.Write([]byte(s)) // hash.Hash.Write never returns an error
+	return fmt.Sprintf("%x", h.Sum(nil))
+}
+
 // sampleStrings 从字符串切片中随机不重复抽取 n 条。
 func sampleStrings(s []string, n int) []string {
 	if len(s) <= n {
@@ -5,8 +5,8 @@
    <link rel="icon" type="image/svg+xml" href="/vite.svg" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>SESE 爬取管理</title>
-    <script type="module" crossorigin src="/assets/index-w20XarNx.js"></script>
-    <link rel="stylesheet" crossorigin href="/assets/index-CLWukEE8.css">
+    <script type="module" crossorigin src="/assets/index-CQNkho4R.js"></script>
+    <link rel="stylesheet" crossorigin href="/assets/index-D_FJuGDF.css">
  </head>
  <body>
    <div id="app"></div>
@@ -66,6 +66,10 @@ crawler:
  max_epoch: 100                   # BFS 爬取的最大轮次上限
  expected_prosper_ratio: 0.6      # 队列中预期"繁荣"域名（高反向链接）的占比，用于调度决策
  entry_url: "https://zh.wikipedia.org/"  # BFS 爬取的起始入口 URL
+  max_page_size: 5242880           # 单个页面最大抓取字节数（0=不限，默认 5MB）
+  recrawl_max_age: 2592000         # URL 过期时间（秒），超过此时间的 URL 允许被重爬，默认 30 天
+  recrawl_check_interval: 3600     # 运行期间检查过期 URL 的间隔（秒），默认 1 小时
+  recrawl_batch_size: 500          # 每次检查最多释放多少个过期 URL

 # 搜索结果排序权重配置
 search:
@@ -265,12 +265,18 @@ func (s *Server) handleAdminStats(w http.ResponseWriter, r *http.Request) {
 	langCount := make(map[string]int)
 	totalWords := 0
 	total := 0
+	recrawlEligible := 0
+	now := time.Now().Unix()
+	maxAge := int64(config.RecrawlMaxAge())

 	s.db.ForEachSnippet(func(url string, snippet *storage.SnippetEntry) error {
 		total++
 		domain := netloc(url)
 		domainCount[domain]++
 		totalWords += len(snippet.Text)
+		if now-snippet.Timestamp >= maxAge {
+			recrawlEligible++
+		}

 		siteInfo, _ := s.db.GetSiteInfo(domain)
 		if siteInfo != nil {
@@ -321,6 +327,7 @@ func (s *Server) handleAdminStats(w http.ResponseWriter, r *http.Request) {
 		"domains":           domainsMap,
 		"languages":         langsMap,
 		"pending":           atomic.LoadInt64(&s.rowCount), // 内存中未刷盘的索引条目数
+		"recrawl_eligible":  recrawlEligible,               // 已过期、可被重爬的 URL 数量
 	}

 	json.NewEncoder(w).Encode(resp)
@@ -575,7 +582,8 @@ func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
 		if m := siteRe.FindStringSubmatch(part); len(m) > 1 {
 			siteFilter = m[1] // site:example.com 提取目标主机名
 		} else {
-			segs := s.analyzer.Segment(part, false)
+			// 搜索模式分词（CutForSearch）：更细粒度，"气象局" → ["气象", "局", "气象局"]
+			segs := s.analyzer.Segment(part, true)
 			for _, t := range segs {
 				if !s.infoSvc.IsBlocked(t) { // 过滤屏蔽词
 					tokens = append(tokens, t)
@@ -584,6 +592,11 @@ func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
 		}
 	}

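+	// Drop redundant tokens: exact duplicates, and any token that is a substring of a
+	// longer token, are removed, so only the longest (most specific) terms are queried.
+	// Note: deduplicateSubstrings also reorders the tokens by length, longest first.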
+	tokens = deduplicateSubstrings(tokens)
+
 	// 最多保留 20 个词（避免查询过于宽泛）
 	if len(tokens) > 20 {
 		tokens = tokens[:20]
@@ -691,10 +704,15 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
 	}

 	// 计算每个 URL 的相关性和初始分数
+	// 评分策略：部分匹配加权和 + 缺词软惩罚（替代原来的全词乘积）
+	// 全词乘积问题：一个 URL 只要缺少任何一个查询词，rel 就接近 0，
+	// 导致"气象局"拆成 ["气象局","局"] 后，只有"气象"没有"气象局"的页面被淹没。
+	missPenalty := config.MissPenalty()
 	candidates := make([]candidate, 0, len(urlWeights))
 	for u, vs := range urlWeights {
-		// 词权重相乘（贝叶斯概率近似），缺省权重填充缺失词
-		rel := 1.0
+		// 统计实际匹配的词数和权重总和
+		matchedCount := 0
+		sumWeight := 0.0
 		for _, ti := range tokenIndexes {
 			vp := vs[ti.token]
 			if vp == 0 {
@@ -703,8 +721,23 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
 			if vp > 0.06 {
 				vp = math.Log((vp-0.06)*40+1)/40 + 0.06
 			}
-			rel *= vp
+			sumWeight += vp
+			// 只有权重超过默认值才算真正匹配（排除了 defVal 填充的假匹配）
+			if vs[ti.token] > 0 {
+				matchedCount++
 			}
+		}
+		totalTokens := len(tokenIndexes)
+		// 部分匹配相关性 = 加权平均 × 匹配覆盖率加成
+		// matchRatio：匹配词占比，全部匹配=1，全部缺失=0
+		matchRatio := float64(matchedCount) / float64(totalTokens)
+		// avgWeight：匹配词的平均权重
+		avgWeight := sumWeight / float64(totalTokens)
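+		// rel = average weight × (match ratio + missPenalty × unmatched ratio)
+		// missPenalty sets how much credit the unmatched share of the query still earns:
+		//   0 = no credit: rel = avgWeight × matchRatio (harshest penalty for missing terms)
+		//   1 = full credit: rel = avgWeight (missing terms are not penalised at all)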
+		rel := avgWeight * (matchRatio + (1-matchRatio)*missPenalty)
 		// 反向链接繁荣加分
 		prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight()
 		bad := badURL(u)
@@ -969,6 +1002,43 @@ func badURL(u string) float64 {
 	return math.Min(s, 0.9)
 }

+// deduplicateSubstrings removes exact duplicate tokens and any token that is a
+// substring of a longer kept token.
+// Example: ["气象", "局", "气象局"] → ["气象局"] (both shorter tokens are
+// contained in "气象局", so only the longest one survives).
+//
+// The input slice is sorted in place (byte length descending, ties in
+// lexicographic order) and the result follows that order. A slice with zero or
+// one element is returned unchanged.
+func deduplicateSubstrings(tokens []string) []string {
+	if len(tokens) <= 1 {
+		return tokens
+	}
+	// Longest first (len counts bytes), ties broken lexicographically, so every
+	// potential container is examined before the tokens it may contain.
+	sort.Slice(tokens, func(i, j int) bool {
+		if len(tokens[i]) != len(tokens[j]) {
+			return len(tokens[i]) > len(tokens[j])
+		}
+		return tokens[i] < tokens[j]
+	})
+	seen := make(map[string]bool)
+	var result []string
+	for _, t := range tokens {
+		if seen[t] {
+			continue // exact duplicate of an earlier token
+		}
+		seen[t] = true
+		// Is t contained in a longer token that has already been kept?
+		isSubstr := false
+		for _, kept := range result {
+			if strings.Contains(kept, t) && kept != t {
+				isSubstr = true
+				break
+			}
+		}
+		if !isSubstr {
+			result = append(result, t)
+		}
+	}
+	return result
+}
+
 // netloc 从 URL 提取主机名。
 func netloc(rawURL string) string {
 	parts := strings.SplitN(rawURL, "/", 4)
@@ -30,12 +30,13 @@ type IndexEntry struct {
 }

 // SnippetEntry is the cached summary stored for a URL.
-// 包含页面标题、描述、正文片段和抓取时间戳。
+// It holds the page title, description, a body-text fragment, the crawl
+// timestamp and a content hash used for incremental recrawl detection.
 type SnippetEntry struct {
 	Title       string `json:"title"`        // page title
 	Description string `json:"desc"`         // page description (the crawler stores it truncated)
 	Text        string `json:"text"`         // leading fragment of the body text
 	Timestamp   int64  `json:"ts"`           // Unix timestamp (seconds) of when the page was crawled
+	ContentHash string `json:"hash"`         // FNV-1a hash of the body text; compared on recrawl to detect changes
 }

 // 四个 bbolt bucket 的名称（以字节数组存储，bbolt 要求 key/value 均为字节）