加上中文注释

2026-04-08 17:48:05 +08:00
parent 6c2f5ad978
commit c154abf410
11 changed files with 830 additions and 560 deletions
@@ -1,39 +1,39 @@
 // Package harvester implements the index-writing server (port 5000).
+// 收获服务器包：接收爬虫发送的关键词索引数据，批量写入 bbolt 持久化存储。
 //
-// It receives (url, keywords) payloads from the crawler, accumulates them in
-// memory, then flushes to the persistent inverted index when the in-memory
-// row count exceeds the configured threshold.
+// 工作流程：爬虫每抓取一个页面，将 (URL, 关键词列表) 通过 HTTP POST 发送到本服务；
+// 本服务先将数据积累在内存中，当内存中索引条目数量超过阈值时，批量合并到磁盘索引。
 package harvester

 import (
-	"encoding/json"
-	"log"
-	"math/rand"
-	"net/http"
-	"strings"
-	"sync"
-	"sync/atomic"
+	"encoding/json"  // JSON 反序列化（解析爬虫请求）
+	"log"          // 日志输出
+	"math/rand"    // 随机数（打乱合并顺序、触发概率性操作）
+	"net/http"     // HTTP 服务端
+	"strings"       // 字符串操作（URL 清洗）
+	"sync"         // 互斥锁（保护内存索引、防止并发刷盘）
+	"sync/atomic"  // 原子操作（计数器）

-	"sese-engine/config"
-	"sese-engine/info"
-	"sese-engine/storage"
+	"sese-engine/config"  // 全局配置（刷盘阈值、URL 上限）
+	"sese-engine/info"    // info 服务（查询繁荣分数用于裁剪）
+	"sese-engine/storage" // 持久化存储
 )

-// Server is the harvester HTTP server.
+// Server 是收获 HTTP 服务器，负责接收爬虫数据、内存聚合、批量写入。
 type Server struct {
 	db *storage.DB

-	// in-memory accumulator: keyword → [(weight, url)]
+	// 内存索引聚合器：关键词 → 该词关联的 [权重, URL] 条目列表
 	mem   map[string][]storage.IndexEntry
-	memMu sync.Mutex
+	memMu sync.Mutex // 保护内存索引的并发写入

-	rowCount int64   // approximate total in-memory rows
-	flushMu  sync.Mutex // only one flush at a time
+	rowCount int64          // 内存中累计的索引条目总数（用于触发刷盘）
+	flushMu  sync.Mutex     // 确保同一时刻只有一个 flush 在执行

-	infoSvc *info.Service
+	infoSvc *info.Service    // info 服务：用于查询繁荣分数来决定索引裁剪优先级
 }

-// New creates a harvester Server.
+// New 创建一个 harvester Server 实例。
 func New(db *storage.DB, infoSvc *info.Service) *Server {
 	return &Server{
 		db:      db,
@@ -42,22 +42,23 @@ func New(db *storage.DB, infoSvc *info.Service) *Server {
 	}
 }

-// ingestPayload is the JSON body sent by the crawler.
+// ingestPayload 是爬虫发送的 JSON 请求体结构。
 type ingestPayload struct {
-	URL      string `json:"url"`
+	URL      string `json:"url"`        // 被索引页面的最终 URL
 	Keywords []struct {
-		Word   string  `json:"word"`
-		Weight float32 `json:"weight"`
+		Word   string  `json:"word"`   // 关键词
+		Weight float32 `json:"weight"` // 该 URL 在该词下的权重
 	} `json:"keywords"`
 }

-// Handler returns the http.Handler for the harvester.
+// Handler 返回 HTTP 路由处理器。
 func (s *Server) Handler() http.Handler {
 	mux := http.NewServeMux()
-	mux.HandleFunc("/l", s.handleIngest)
+	mux.HandleFunc("/l", s.handleIngest) // /l 端点：接收爬虫数据
 	return mux
 }

+// handleIngest 处理爬虫发来的 POST 请求，将关键词数据写入内存索引。
 func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
 	if r.Method != http.MethodPost {
 		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
@@ -69,7 +70,7 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
 		return
 	}

-	// Sanitise URL
+	// 清洗 URL：去除换行符（防止注入）
 	payload.URL = strings.ReplaceAll(payload.URL, "\n", "")
 	if payload.URL == "" {
 		http.Error(w, "empty url", http.StatusBadRequest)
@@ -81,7 +82,7 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
 		key := kw.Word
 		entries := s.mem[key]

-		// Threshold-based early discard
+		// 阈值提前过滤：若该词在内存中已累积超过 15 个条目，则丢弃权重低于 lowThreshold 的新条目
 		if len(entries) > 15 {
 			low := s.lowThreshold(key)
 			if float64(kw.Weight) < low {
@@ -96,7 +97,7 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
 	}
 	s.memMu.Unlock()

-	// Check if we should flush
+	// 当内存条目数超过阈值时，异步触发刷盘
 	if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold) {
 		go s.flush()
 	}
@@ -104,28 +105,32 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
 	w.Write([]byte("ok"))
 }

-// lowThreshold returns the minimum weight needed to enter the index for key.
+// lowThreshold 返回某关键词在已有大量条目时，新条目所需的最低权重阈值。
+// 计算方式：找到磁盘上该词第 MaxURLsPerKey 高权重值，取其 5% 作为阈值。
 func (s *Server) lowThreshold(key string) float64 {
 	existing, _ := s.db.GetIndex(key)
 	if len(existing) < config.MaxURLsPerKey {
-		return -1
+		return -1 // 未达上限，所有条目都接受
 	}
-	// Find the config.MaxURLsPerKey-th highest weight
+	// 收集所有权重值
 	weights := make([]float64, len(existing))
 	for i, e := range existing {
 		weights[i] = float64(e.Weight)
 	}
-	// Partial sort: find threshold at position MaxURLsPerKey-1
+	// 取降序下标 MaxURLsPerKey-1（从 0 计）处的值，即第 MaxURLsPerKey 大的权重，其 5% 即准入门槛
 	return nthLargest(weights, config.MaxURLsPerKey-1) * 0.05
 }

-// flush merges the in-memory accumulator into the persistent index.
+// flush 将内存中的索引批量合并写入磁盘，然后清空内存。
+// 整个过程：原子快照 → 并行合并 → 批量写入。
 func (s *Server) flush() {
+	// TryLock：若已有其他 flush 在执行则直接退出
 	if !s.flushMu.TryLock() {
-		return // another flush is running
+		return
 	}
 	defer s.flushMu.Unlock()

+	// 原子快照：取出当前内存数据并立即重置
 	s.memMu.Lock()
 	snapshot := s.mem
 	s.mem = make(map[string][]storage.IndexEntry)
@@ -134,6 +139,7 @@ func (s *Server) flush() {

 	log.Printf("[harvester] flushing %d keys", len(snapshot))

+	// 转换为切片便于处理，打乱顺序防止热点词优先处理导致堆积
 	items := make([]struct {
 		key     string
 		entries []storage.IndexEntry
@@ -146,13 +152,13 @@ func (s *Server) flush() {
 	}
 	rand.Shuffle(len(items), func(i, j int) { items[i], items[j] = items[j], items[i] })

-	// Parallel merge
+	// 并行合并：每个关键词独立合并到磁盘
 	type result struct {
 		key     string
 		entries []storage.IndexEntry
 	}
 	results := make(chan result, len(items))
-	sem := make(chan struct{}, 8)
+	sem := make(chan struct{}, 8) // 最多 8 个并发合并协程

 	for _, item := range items {
 		sem <- struct{}{}
@@ -163,36 +169,39 @@ func (s *Server) flush() {
 		}(item.key, item.entries)
 	}

-	// Collect
+	// 收集所有合并结果
 	batch := make(map[string][]storage.IndexEntry, len(items))
 	for range items {
 		r := <-results
 		batch[r.key] = r.entries
 	}

+	// 批量写入 bbolt（一次事务写入所有关键词）
 	if err := s.db.BatchSetIndex(batch); err != nil {
 		log.Printf("[harvester] flush write error: %v", err)
 	}
 	log.Printf("[harvester] flush done, %d keys written", len(batch))
 }

-// mergeKey merges new entries with existing index entries for a key.
+// mergeKey 将新条目和磁盘已有条目合并后返回最终列表。
+// 包含：去重 → 概率性 URL 归一化去重 → 超限时按繁荣分数裁剪。
 func (s *Server) mergeKey(key string, newEntries []storage.IndexEntry) []storage.IndexEntry {
 	existing, _ := s.db.GetIndex(key)

-	// Discard new key if too few URLs
+	// 新关键词：如果条目数过少则丢弃（避免索引质量下降）
 	if len(existing) == 0 && len(newEntries) < config.MinURLsForNewKey {
 		return nil
 	}

+	// 合并新旧条目
 	merged := dedup(append(newEntries, existing...))

-	// Occasional URL normalisation dedup
+	// 2% 概率执行 URL 归一化去重（去除 https/http 重复、尾部斜杠差异等）
 	if rand.Float64() < 0.02 {
 		merged = dedupNormalised(merged)
 	}

-	// Trim if over limit
+	// 条目数超过上限的 1.1 倍，或以 2% 概率触发裁剪：按 (权重 × (1 + 繁荣分数)) 排序后截断
 	if float64(len(merged)) > float64(config.MaxURLsPerKey)*1.1 || rand.Float64() < 0.02 {
 		merged = trim(merged, s.infoSvc, config.MaxURLsPerKey, config.MaxSameDomainPerKey)
 	}
@@ -200,8 +209,9 @@ func (s *Server) mergeKey(key string, newEntries []storage.IndexEntry) []storage
 	return merged
 }

-// ---- helpers ----
+// ---- 辅助函数 ----

+// dedup 按 URL 完全匹配去重。
 func dedup(entries []storage.IndexEntry) []storage.IndexEntry {
 	seen := make(map[string]bool, len(entries))
 	out := make([]storage.IndexEntry, 0, len(entries))
@@ -215,10 +225,12 @@ func dedup(entries []storage.IndexEntry) []storage.IndexEntry {
 	return out
 }

+// dedupNormalised 按 URL 归一化去重（去除协议前缀和尾部斜杠后比较）。
+// 按 URL 长度降序排序后处理：短 URL 优先保留（更可能是规范 URL）。
 func dedupNormalised(entries []storage.IndexEntry) []storage.IndexEntry {
-	// Sort by URL length descending, then dedup by normalised URL (strip scheme, trailing slash)
 	sorted := make([]storage.IndexEntry, len(entries))
 	copy(sorted, entries)
+	// 按 URL 长度降序排列（简单的 O(n²) 交换排序，并非相邻交换的冒泡排序）
 	for i := 0; i < len(sorted)-1; i++ {
 		for j := i + 1; j < len(sorted); j++ {
 			if len(sorted[j].URL) > len(sorted[i].URL) {
@@ -239,6 +251,7 @@ func dedupNormalised(entries []storage.IndexEntry) []storage.IndexEntry {
 	return out
 }

+// normaliseURL 归一化 URL：去除协议前缀，并去掉末尾的所有斜杠。
 func normaliseURL(u string) string {
 	if strings.HasPrefix(u, "https://") {
 		u = u[8:]
@@ -248,9 +261,10 @@ func normaliseURL(u string) string {
 	return strings.TrimRight(u, "/")
 }

-// trim reduces entries to at most limit, keeping at most sameDomainLimit per domain.
+// trim 将条目列表裁剪到指定上限，同时限制每个域名的最大条目数。
+// 排序依据：(权重 × (1 + 繁荣分数))，使高权重且高繁荣的 URL 优先保留。
 func trim(entries []storage.IndexEntry, infoSvc *info.Service, limit, sameDomainLimit int) []storage.IndexEntry {
-	// Sort by effective score: weight * (1 + backlink)
+	// 按综合分数降序排列
 	scored := make([]storage.IndexEntry, len(entries))
 	copy(scored, entries)
 	for i := 0; i < len(scored)-1; i++ {
@@ -263,7 +277,7 @@ func trim(entries []storage.IndexEntry, infoSvc *info.Service, limit, sameDomain
 		}
 	}

-	// Per-domain cap
+	// 按域名计数，每个域名最多保留 sameDomainLimit 条（首页 URL 不受限制）
 	domainCount := make(map[string]int)
 	out := make([]storage.IndexEntry, 0, limit)
 	for _, e := range scored {
@@ -272,8 +286,7 @@ func trim(entries []storage.IndexEntry, infoSvc *info.Service, limit, sameDomain
 			host = e.URL
 		}
 		host = strings.ToLower(host)
-		// Allow homepage URLs regardless of limit
-		isHome := isHomepage(e.URL)
+		isHome := isHomepage(e.URL) // 首页 URL 不受域名数量限制
 		if !isHome && domainCount[host] >= sameDomainLimit {
 			continue
 		}
@@ -286,12 +299,14 @@ func trim(entries []storage.IndexEntry, infoSvc *info.Service, limit, sameDomain
 	return out
 }

+// isHomepage 判断 URL 是否为网站首页（不含路径层级）。
 func isHomepage(u string) bool {
 	u = strings.TrimPrefix(u, "https://")
 	u = strings.TrimPrefix(u, "http://")
 	return strings.Count(strings.TrimRight(u, "/"), "/") == 0
 }

+// netloc 从 URL 提取主机名（简化版，不依赖 net/url）。
 func netloc(rawURL string) string {
 	parts := strings.SplitN(rawURL, "/", 4)
 	if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" {
@@ -300,14 +315,15 @@ func netloc(rawURL string) string {
 	return ""
 }

-// nthLargest returns the n-th largest value in a slice (0-indexed).
+// nthLargest 返回切片按降序排列后下标为 n 的值（n 从 0 计，即第 n+1 大的值）；n 超出范围时返回 0。
+// 用于获取准入权重阈值。
 func nthLargest(values []float64, n int) float64 {
 	if n >= len(values) {
 		return 0
 	}
 	cp := make([]float64, len(values))
 	copy(cp, values)
-	// Partial sort descending
+	// 部分排序：只需将前 n+1 项排好序
 	for i := 0; i <= n; i++ {
 		maxIdx := i
 		for j := i + 1; j < len(cp); j++ {
@@ -320,7 +336,7 @@ func nthLargest(values []float64, n int) float64 {
 	return cp[n]
 }

-// ListenAndServe starts the harvester on the given address.
+// ListenAndServe 启动收获服务器在指定地址监听。
 func (s *Server) ListenAndServe(addr string) error {
 	log.Printf("[harvester] listening on %s", addr)
 	return http.ListenAndServe(addr, s.Handler())