加上中文注释

2026-04-08 17:48:05 +08:00
parent 6c2f5ad978
commit c154abf410
11 changed files with 830 additions and 560 deletions
@@ -1,43 +1,44 @@
 // crawler.go — BFS crawl loop, URL scheduling, and site-info updating.
+// crawler 包的主逻辑：BFS 爬取循环、URL 调度算法、网站元信息更新。
 package crawler

 import (
-	"bytes"
-	"encoding/json"
-	"log"
-	"math"
-	"math/rand"
-	"net/http"
-	"net/url"
-	"strings"
-	"sync"
-	"sync/atomic"
-	"time"
+	"bytes"          // 字节缓冲（构造 HTTP POST 请求体）
+	"encoding/json"  // JSON 序列化（发送关键词数据到 harvester）
+	"log"            // 日志输出
+	"math"           // 数学运算（指数衰减、质量评分）
+	"math/rand"      // 随机数（加权采样、队列打乱）
+	"net/http"       // HTTP 客户端（POST 数据到 harvester）
+	"net/url"        // URL 解析
+	"strings"        // 字符串操作
+	"sync"           // 互斥锁（保护并发收集结果）
+	"sync/atomic"    // 原子操作（计数器，无锁并发更新）
+	"time"           // 时间戳

-	"sese-engine/analyzer"
-	"sese-engine/config"
-	"sese-engine/parser"
-	"sese-engine/storage"
+	"sese-engine/analyzer" // 文本分析和关键词提取
+	"sese-engine/config"    // 全局配置常量
+	"sese-engine/parser"    // HTML 解析（提取标题、正文、链接）
+	"sese-engine/storage"   // 持久化存储
 )

-
-// Stats holds real-time crawl counters (read with atomic).
+// Stats 存放爬虫实时统计计数器（使用 atomic 原子读取）。
 type Stats struct {
-	VisitedURLs    int64
-	SuccessURLs    int64
-	KeywordsFetched int64
+	VisitedURLs     int64 // 已访问的 URL 总数（含失败）
+	SuccessURLs     int64 // 成功抓取并得到 HTML 响应的 URL 数（状态码 < 400 且 Content-Type 含 text/html）
+	KeywordsFetched int64 // 累计提取的关键词总数
 }

-// Crawler orchestrates the BFS crawl.
+// Crawler 编排整个 BFS 爬取流程。
 type Crawler struct {
-	fetcher    *Fetcher
-	db         *storage.DB
-	analyzer   *analyzer.Analyzer
-	prosperMap map[string]float64 // domain → backlink score (loaded from info)
-	stats      Stats
+	fetcher    *Fetcher             // HTTP 抓取器（含 robots.txt 和限流）
+	db         *storage.DB          // 持久化数据库
+	analyzer   *analyzer.Analyzer   // 分词和关键词分析
+	prosperMap map[string]float64   // 域名 → 反向链接繁荣值（来自 info 模块，越大越"有价值"）
+	stats      Stats                // 原子计数器
 }

-// New creates a Crawler.
+// New 创建一个 Crawler 实例。
+// prosperMap 由 info 模块加载，传入域名繁荣值用于调度优先级计算。
 func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
 	return &Crawler{
 		fetcher:    NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second),
@@ -47,40 +48,46 @@ func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *C
 	}
 }

-// URLWeight pairs a URL with its discovery weight.
+// URLWeight 将 URL 和发现权重打包在一起，用于调度决策。
 type URLWeight struct {
-	URL    string
-	Weight float64
+	URL    string   // 待访问的 URL
+	Weight float64  // 发现权重（从父页面分得的"关注度"，页面链接越多则每个分得越少）
 }

-// Run starts the BFS crawl from entryURL, running for maxEpoch rounds.
-// It blocks until completion.
+// Run 启动 BFS 爬取，从 entryURL 开始，执行最多 maxEpoch 轮。
+// 各轮之间是串行的，每轮内并发抓取，按调度算法选择下一轮 URL。
 func (c *Crawler) Run(entryURL string, maxEpoch int) {
-	visited := make(map[string]bool)
-	queue := []string{entryURL}
+	visited := make(map[string]bool) // 已访问 URL 集合（防止重复抓取）
+	queue := []string{entryURL}      // 当前轮次的待抓取队列

 	for ep := 0; ep < maxEpoch; ep++ {
 		log.Printf("[crawler] epoch %d/%d  queue=%d", ep+1, maxEpoch, len(queue))
+		// 将本轮所有 URL 标记为已访问（防止下一轮重复入队）
 		for _, u := range queue {
 			visited[u] = true
 		}

+		// 并发抓取本轮所有 URL
 		var (
-			newLinks []URLWeight
-			mu       sync.Mutex
+			newLinks []URLWeight // 收集下一轮候选 URL
+			mu       sync.Mutex  // 保护 newLinks 的并发写入
 			wg       sync.WaitGroup
 		)

+		// 信号量：限制同时并发数不超过配置的工作线程数
 		sem := make(chan struct{}, config.CrawlerWorkers)
 		for _, u := range queue {
 			wg.Add(1)
-			sem <- struct{}{}
+			sem <- struct{}{} // 获取一个令牌（阻塞直到有空闲槽位）
 			go func(rawURL string) {
 				defer wg.Done()
-				defer func() { <-sem }()
+				defer func() { <-sem }() // 释放令牌
+
+				// 抓取单个 URL，返回发现的子链接
 				hrefs := c.visitURL(rawURL)
 				n := len(hrefs)
 				if n > 0 {
+					// 每个子链接分得 1/n 的父页面权重
 					w := 1.0 / float64(n)
 					mu.Lock()
 					for _, h := range hrefs {
@@ -94,30 +101,34 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
 		}
 		wg.Wait()

+		// 本轮没有发现新链接，爬取结束
 		if len(newLinks) == 0 {
 			log.Println("[crawler] empty queue — stopping")
 			return
 		}

+		// 调度算法：从候选 URL 中选出下一轮要抓取的队列
 		queue = c.schedule(newLinks)
 	}
 }

-// visitURL fetches a URL, stores keywords, updates site info, returns discovered hrefs.
+// visitURL 抓取一个 URL，提取关键词、缓存摘要、更新网站元信息，返回页面中发现的子链接。
 func (c *Crawler) visitURL(rawURL string) []string {
-	atomic.AddInt64(&c.stats.VisitedURLs, 1)
+	atomic.AddInt64(&c.stats.VisitedURLs, 1) // 计数器 +1

+	// 礼貌模式抓取（遵守 robots.txt + 限流），超时 10 秒，不限制大小
 	res, err := c.fetcher.fetchWithHistory(rawURL, true, 10*time.Second, 0)
 	if err != nil || res == nil {
-		c.updateSiteFailure(rawURL)
+		c.updateSiteFailure(rawURL) // 记录失败，更新该网站成功率
 		return nil
 	}

-	atomic.AddInt64(&c.stats.SuccessURLs, 1)
+	atomic.AddInt64(&c.stats.SuccessURLs, 1) // 成功计数器 +1

+	// 解析 HTML：提取标题、描述、正文和所有超链接
 	title, desc, text, hrefs := parser.ParseHTML(res.Body, res.FinalURL)

-	// Cache snippet
+	// 缓存 URL 摘要（仅对短 URL 缓存，防止超长 URL 浪费空间）
 	if len(res.FinalURL) < 250 {
 		_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
 			Title:       title,
@@ -127,21 +138,23 @@ func (c *Crawler) visitURL(rawURL string) []string {
 		})
 	}

-	// Keyword extraction → send to harvester
+	// 关键词提取：将标题/描述/正文交给 analyzer 计算关键词权重
 	kws := c.analyzer.Analyze(title, desc, text)
 	if len(kws) > 0 {
+		// 限制每个页面最多发送的关键词数量
 		if len(kws) > config.MaxKeywordsPerPage {
 			kws = kws[:config.MaxKeywordsPerPage]
 		}
 		atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
+		// 异步发送到收获服务器写入倒排索引（不阻塞爬取流程）
 		go c.sendToHarvester(res.FinalURL, kws)
 	}

-	// Update site info
+	// 更新网站元信息（成功访问）
 	host := netloc(res.FinalURL)
 	c.updateSiteSuccess(host, res, title, desc, text, hrefs)

-	// Handle permanent redirects in site info
+	// 处理永久重定向：更新源主机名下的重定向映射
 	for from, to := range res.Redirects {
 		fromHost := netloc(from)
 		if fromHost == "" {
@@ -152,20 +165,21 @@ func (c *Crawler) visitURL(rawURL string) []string {
 			info.Redirects = make(map[string]string)
 		}
 		info.Redirects[from] = to
+		// 重定向映射过多时裁剪到 40 条
 		if len(info.Redirects) > 50 {
-			// keep most important (just truncate randomly for now)
 			info.Redirects = truncateMap(info.Redirects, 40)
 		}
 		_ = c.db.SetSiteInfo(fromHost, info)
 	}

-	// Trim hrefs
+	// 限制返回的链接数，防止下一轮队列爆炸
 	if len(hrefs) > 100 {
 		hrefs = sampleStrings(hrefs, 100)
 	}
 	return hrefs
 }

+// updateSiteFailure 当某 URL 抓取失败时，更新该网站的访问成功率（指数衰减）。
 func (c *Crawler) updateSiteFailure(rawURL string) {
 	host := netloc(rawURL)
 	if host == "" {
@@ -176,27 +190,33 @@ func (c *Crawler) updateSiteFailure(rawURL string) {
 		zero := 0.0
 		info.SuccessRate = &zero
 	}
+	// 成功率每次失败乘以 0.99（无限趋近 0）
 	*info.SuccessRate *= 0.99
 	_ = c.db.SetSiteInfo(host, info)
 }

+// updateSiteSuccess 当某 URL 抓取成功时，更新网站的完整元信息。
 func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc, text string, hrefs []string) {
 	info, _ := c.db.GetSiteInfo(host)

+	// 访问计数 +1，更新最后访问时间
 	info.VisitCount++
 	info.LastVisitTime = time.Now().Unix()

+	// 成功率更新：EWM（指数加权移动）平滑，每次 +0.01
 	one := 1.0
 	if info.SuccessRate == nil {
 		info.SuccessRate = &one
 	}
 	*info.SuccessRate = *info.SuccessRate*0.99 + 0.01

+	// 记录是否支持 HTTPS
 	if strings.HasPrefix(res.FinalURL, "https://") {
 		t := true
 		info.HTTPSAvailable = &t
 	}

+	// 记录 HTTP Server 类型（去重，保留最近 5 个）
 	if res.ServerType != "" {
 		found := false
 		for _, s := range info.ServerTypes {
@@ -213,20 +233,22 @@ func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc,
 		}
 	}

-	// Language detection — sample 10% or first 10 visits
+	// 语言检测和出站链接收集（仅在前 10 次访问或 10% 概率下触发，减少开销）
 	if info.VisitCount < 10 || rand.Float64() < 0.1 {
 		lang := c.analyzer.DetectLanguage(title + " " + desc + " " + text)
 		if lang != "" {
 			if info.Languages == nil {
 				info.Languages = make(map[string]float64)
 			}
+			// 更新强度取 min(0.2, 1/√(VisitCount+1))：上限为 0.2，访问次数足够多（VisitCount ≥ 25）后才随访问次数衰减
 			intensity := math.Min(0.2, 1/math.Sqrt(float64(info.VisitCount+1)))
 			for k := range info.Languages {
-				info.Languages[k] *= (1 - intensity)
+				info.Languages[k] *= (1 - intensity) // 旧语种按 intensity 衰减
 			}
-			info.Languages[lang] += intensity
+			info.Languages[lang] += intensity // 新语种增加
 		}
-		// Collect external links
+
+		// 收集外链（跨顶级域名的链接）
 		superHost := superNetloc(res.FinalURL)
 		var external []string
 		for _, h := range hrefs {
@@ -234,8 +256,10 @@ func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc,
 				external = append(external, h)
 			}
 		}
+		// 最多保留 10 条外链
 		sampled := sampleStrings(external, 10)
 		info.OutLinks = append(info.OutLinks, sampled...)
+		// 外链超过 250 条时采样到 200 条
 		if len(info.OutLinks) > 250 {
 			info.OutLinks = sampleStrings(info.OutLinks, 200)
 		}
@@ -244,10 +268,10 @@ func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc,
 	_ = c.db.SetSiteInfo(host, info)
 }

-// sendToHarvester POSTs keyword data to the harvester service.
+// sendToHarvester 将关键词索引数据通过 HTTP POST 发送到收获服务器（:5000/l 端点）。
 func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
 	type payload struct {
-		URL      string           `json:"url"`
+		URL      string             `json:"url"`
 		Keywords []analyzer.Keyword `json:"keywords"`
 	}
 	p := payload{URL: finalURL, Keywords: kws}
@@ -263,13 +287,15 @@ func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
 	resp.Body.Close()
 }

-// schedule selects and prioritises the next BFS queue from raw discovered links.
+// schedule 从候选 URL 集合中选出下一轮 BFS 队列。
+// 包含：域名集中度过滤、HTTP/HTTPS 比例控制、繁荣 URL 占比控制、加权随机采样。
 func (c *Crawler) schedule(links []URLWeight) []string {
+	// 候选过多时先随机采样到 10 万条，防止内存爆炸
 	if len(links) > 100000 {
 		links = sampleURLWeights(links, 100000)
 	}

-	// Pre-fetch site info for all involved domains
+	// 预加载所有涉及的网站信息（加速后续评分计算）
 	domains := make(map[string]bool)
 	for _, lw := range links {
 		if h := netloc(lw.URL); h != "" {
@@ -294,20 +320,20 @@ func (c *Crawler) schedule(links []URLWeight) []string {
 	}
 	wg.Wait()

-	// Score each URL
+	// 对所有候选 URL 逐一计算调度优先级分数
 	scored_list := make([]scoredURL, len(links))
 	for i, lw := range links {
 		scored_list[i] = scoredURL{url: lw.URL, score: c.scoreURL(lw, siteCache)}
 	}

-	// Weighted random sample (45000 or 1/3+250 whichever smaller)
+	// 加权随机采样：以分数为权重不放回地抽取最多 k 条（k 取 45000 与 len/3+250 中的较小值）
 	k := min(45000, len(scored_list)/3+250)
 	selected := weightedSample(scored_list, k)

-	// Domain concentration filtering
+	// 域名集中度过滤：限制每个域名被选中的数量，防止被少数网站垄断
 	selected = concentrationFilter(selected, config.CrawlFocus)

-	// Separate https/http, cap http at 1/4 of https count
+	// 分离 HTTPS 和 HTTP 链接，HTTP 最多占 HTTPS 的 1/4
 	var httpsURLs, httpURLs []string
 	for _, s := range selected {
 		if strings.HasPrefix(s, "https://") {
@@ -321,7 +347,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
 		httpURLs = sampleStrings(httpURLs, maxHTTP)
 	}

-	// Separate prosperous / non-prosperous
+	// 分离繁荣（高反向链接）域名和普通域名，按比例控制繁荣 URL 占比
 	var prosperURLs, otherURLs []string
 	for _, u := range append(httpsURLs, httpURLs...) {
 		if c.prosperMap[netloc(u)] > 0 {
@@ -330,6 +356,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
 			otherURLs = append(otherURLs, u)
 		}
 	}
+	// 根据目标繁荣占比计算普通 URL 应保留数量
 	n := int(float64(len(prosperURLs)) * (1-config.ExpectedProsperRatio) / config.ExpectedProsperRatio)
 	if len(otherURLs) > n {
 		keep := max(len(otherURLs)-len(selected)/10, n)
@@ -338,12 +365,14 @@ func (c *Crawler) schedule(links []URLWeight) []string {
 		}
 	}

+	// 合并并随机打乱（使繁荣 URL 和普通 URL 混合）
 	result := append(prosperURLs, otherURLs...)
 	rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] })
 	return result
 }

-// scoreURL computes the scheduling priority for a URL.
+// scoreURL 计算单个 URL 的调度优先级分数。
+// 综合考虑：中文语种权重、域名访问历史衰减、网站质量评分、繁荣值、URL 本身质量。
 func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo) float64 {
 	host := netloc(lw.URL)
 	super := superNetloc(lw.URL)
@@ -353,7 +382,7 @@ func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo)
 		info = &storage.SiteInfo{}
 	}

-	// Chinese-ness
+	// 中文倾向性：该网站中文内容占比
 	var chineseness float64 = 0.5
 	if len(info.Languages) > 0 {
 		total := 0.0
@@ -365,12 +394,13 @@ func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo)
 		}
 	}

-	// Interest decay based on visit count
+	// 兴趣衰减：基于访问次数的指数衰减，繁荣域名可访问更多次
 	prosper := math.Min(62, c.prosperMap[host])
 	limit := prosper*500 + 50
 	b := math.Pow(0.1, 1/limit)
 	interest := math.Pow(b, float64(info.VisitCount))

+	// 同理对顶级域名计算衰减（二级域名不够用时看顶级域名）
 	var interest2 float64 = 1.0
 	if super != host {
 		superInfo := siteCache[super]
@@ -381,23 +411,28 @@ func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo)
 		}
 	}

+	// 网站质量评分
 	quality := 1.0
 	if info.Quality != nil {
 		quality = *info.Quality
 	}

+	// 繁荣值加分（log 变换平滑）
 	prosperity := prosper
 	if prosperity > 0 {
 		prosperity += 0.5
 	}
 	prosperity = math.Log2(2+prosperity) + 1

+	// URL 本身的质量惩罚（过长、路径过深、使用 .php/.htm 等）
 	bad := badURL(lw.URL)
 	return (0.1 + chineseness) * math.Min(0.05+interest, 0.05+interest2) * quality * (1 - bad) * lw.Weight * prosperity
 }

-// ---- helper functions ----
+// ---- 辅助函数 ----

+// netloc 从原始 URL 字符串提取主机名（不含路径）。
+// 支持 http:// 和 https:// 前缀，自动处理 URL 解析异常。
 func netloc(rawURL string) string {
 	parts := strings.SplitN(rawURL, "/", 4)
 	if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" {
@@ -410,7 +445,8 @@ func netloc(rawURL string) string {
 	return u.Host
 }

-// superNetloc returns "domain.tld" (strips subdomains).
+// superNetloc 返回主域名（"domain.tld" 形式，去除子域名），例如 "www.example.com" → "example.com"。
+// 用于识别跨子域名但同主站的情况。
 func superNetloc(rawURL string) string {
 	host := netloc(rawURL)
 	parts := strings.Split(host, ".")
@@ -420,20 +456,26 @@ func superNetloc(rawURL string) string {
 	return host
 }

+// badURL 返回 URL 的"劣质"评分（0~0.9），基于长度、路径深度、文件扩展名等特征。
 func badURL(u string) float64 {
+	// URL 过长惩罚
 	s := math.Max(0, float64(len(u)-30)/200.0)
+	// URL 中任意位置包含 ".htm"（含 .html）或 ".php" 时惩罚
 	if strings.Contains(u, ".htm") || strings.Contains(u, ".php") {
 		s += (1 - s) * 0.3
 	}
+	// 去掉末尾斜杠后，整个 URL 的斜杠数（含 "://" 中的两个）超过 2 即惩罚，通常意味着带有非空路径
 	if strings.Count(strings.TrimRight(u, "/"), "/") > 2 {
 		s += (1 - s) * 0.1
 	}
+	// 极短 URL，或第 5 个字节为冒号（如以 "http:" 开头的非 HTTPS 链接）时惩罚
 	if len(u) < 5 || u[4] == ':' {
 		s += (1 - s) * 0.3
 	}
 	return math.Min(s, 0.9)
 }

+// truncate 将字符串截断到最多 n 个字节（按字节切片，可能截断多字节 UTF-8 字符）。
 func truncate(s string, n int) string {
 	if len(s) <= n {
 		return s
@@ -441,6 +483,7 @@ func truncate(s string, n int) string {
 	return s[:n]
 }

+// sampleStrings 从字符串切片中随机不重复抽取 n 条。
 func sampleStrings(s []string, n int) []string {
 	if len(s) <= n {
 		return s
@@ -453,6 +496,7 @@ func sampleStrings(s []string, n int) []string {
 	return out
 }

+// sampleURLWeights 与 sampleStrings 相同，但处理 URLWeight 切片。
 func sampleURLWeights(s []URLWeight, n int) []URLWeight {
 	if len(s) <= n {
 		return s
@@ -465,11 +509,14 @@ func sampleURLWeights(s []URLWeight, n int) []URLWeight {
 	return out
 }

+// scoredURL 内部用结构体，存储 URL 和对应调度分数。
 type scoredURL struct {
 	url   string
 	score float64
 }

+// weightedSample 加权随机采样（不放回）：从 scoredURL 列表中按权重概率抽取最多 k 条。
+// 每次抽取都线性扫描累积权重来选中元素（并非真正的 alias method，适合中等规模数据）。
 func weightedSample(items []scoredURL, k int) []string {
 	if k >= len(items) {
 		out := make([]string, len(items))
@@ -478,7 +525,6 @@ func weightedSample(items []scoredURL, k int) []string {
 		}
 		return out
 	}
-	// Simple weighted sampling without replacement using alias method approximation
 	totalWeight := 0.0
 	for _, s := range items {
 		totalWeight += s.score
@@ -486,6 +532,7 @@ func weightedSample(items []scoredURL, k int) []string {
 	selected := make(map[int]bool)
 	out := make([]string, 0, k)
 	for len(out) < k && len(selected) < len(items) {
+		// 随机取 [0, totalWeight) 区间的一个点
 		r := rand.Float64() * totalWeight
 		cum := 0.0
 		for i, s := range items {
@@ -496,7 +543,7 @@ func weightedSample(items []scoredURL, k int) []string {
 			if cum >= r {
 				selected[i] = true
 				out = append(out, s.url)
-				totalWeight -= s.score
+				totalWeight -= s.score // 被选中后从总权重中移除（不放回）
 				break
 			}
 		}
@@ -504,7 +551,10 @@ func weightedSample(items []scoredURL, k int) []string {
 	return out
 }

+// concentrationFilter 域名集中度过滤。
+// 按 CrawlFocus 因子限制每个顶级域名被选中的 URL 数量，防止爬取过于集中在少数网站。
 func concentrationFilter(urls []string, k float64) []string {
+	// 按顶级域名分组
 	domainGroups := make(map[string][]string)
 	shuffled := make([]string, len(urls))
 	copy(shuffled, urls)
@@ -515,13 +565,14 @@ func concentrationFilter(urls []string, k float64) []string {
 		domainGroups[d] = append(domainGroups[d], u)
 	}

+	// 计算每组保留上限：域名规模越大允许越多，但按 k 次幂压制
 	limit := 10
 	if len(domainGroups) > 1 {
 		sizes := make([]int, 0, len(domainGroups))
 		for _, g := range domainGroups {
 			sizes = append(sizes, int(math.Pow(float64(len(g)), k)))
 		}
-		// sort sizes ascending, drop last (largest)
+		// 升序排列，去除最大一项，用其余项总和的 60% 作为全局上限
 		for i := 0; i < len(sizes)-1; i++ {
 			for j := i + 1; j < len(sizes)-1; j++ {
 				if sizes[j] < sizes[i] {
@@ -536,6 +587,7 @@ func concentrationFilter(urls []string, k float64) []string {
 		limit = max(10, int(float64(total)*0.6))
 	}

+	// 从每组中按计算的上限采样
 	var result []string
 	for _, g := range domainGroups {
 		sn := 1 + min(limit, int(math.Pow(float64(len(g)), k)))
@@ -548,6 +600,7 @@ func concentrationFilter(urls []string, k float64) []string {
 	return result
 }

+// truncateMap 将 map 裁剪到最多 n 条（取前 n 条，无特定顺序）。
 func truncateMap(m map[string]string, n int) map[string]string {
 	if len(m) <= n {
 		return m
@@ -564,6 +617,7 @@ func truncateMap(m map[string]string, n int) map[string]string {
 	return out
 }

+// min 返回两个整数中的较小值。
 func min(a, b int) int {
 	if a < b {
 		return a
@@ -571,6 +625,7 @@ func min(a, b int) int {
 	return b
 }

+// max 返回两个整数中的较大值。
 func max(a, b int) int {
 	if a > b {
 		return a
@@ -578,7 +633,7 @@ func max(a, b int) int {
 	return b
 }

-// Expose Stats for monitoring.
+// GetStats 返回当前爬虫统计快照（用于监控）。
 func (c *Crawler) GetStats() Stats {
 	return Stats{
 		VisitedURLs:     atomic.LoadInt64(&c.stats.VisitedURLs),
@@ -1,64 +1,71 @@
 // Package crawler implements the HTTP fetching layer with robots.txt compliance,
 // per-host rate limiting, redirect tracking, and encoding detection.
+// crawler 包负责 HTTP 请求层：遵守 robots.txt、主机限流、追踪重定向、自动检测字符集。
 package crawler

 import (
-	"fmt"
-	"io"
-	"net/http"
-	"net/url"
-	"strings"
-	"sync"
-	"time"
+	"fmt"       // 字符串格式化（构建 robots.txt URL、错误信息）
+	"io"        // IO 接口（读取响应体）
+	"net/http"  // HTTP 客户端
+	"net/url"   // URL 解析
+	"strings"   // 字符串操作
+	"sync"      // 互斥锁（保护限流表和 robots.txt 缓存）
+	"time"      // 时间（限流间隔计算、robots.txt 缓存过期）

-	"golang.org/x/net/html/charset"
+	"golang.org/x/net/html/charset" // HTML 字符集自动检测（将各种编码转为 UTF-8）
 )

-// ErrCrawl is returned for expected crawl failures (404, disallowed, wrong content type…).
+// ErrCrawl 表示爬取过程中的预期错误（404、被 robots.txt 禁止、非 HTML 类型等）。
+// 此类错误由 FetchSafe 静默丢弃（返回 nil, nil）。
 type ErrCrawl struct {
-	Msg string
+	Msg string // 错误描述文本
 }

+// Error 实现 error 接口，返回错误描述。
 func (e *ErrCrawl) Error() string { return e.Msg }

-// FetchResult bundles the result of a successful fetch.
+// FetchResult 封装一次成功抓取的完整结果。
 type FetchResult struct {
-	Body        string            // decoded HTML body
-	FinalURL    string            // URL after redirects
-	Redirects   map[string]string // permanent redirects: from → to
-	ServerType  string
+	Body        string            // 解码后的 HTML 正文（UTF-8）
+	FinalURL    string            // 经过所有重定向后的最终 URL
+	Redirects   map[string]string // 永久重定向（301/308）映射：跳转前 URL → 该次跳转的目标 URL
+	ServerType  string            // HTTP Server 响应头（如 "nginx/1.18"）
 }

-// Fetcher is a reusable HTTP client with robots.txt awareness and rate limiting.
+// Fetcher 是一个可复用的 HTTP 客户端，内置 robots.txt 合规检查和按主机限流。
 type Fetcher struct {
-	client    *http.Client
-	userAgent string
-	cooldown  time.Duration
+	client    *http.Client // HTTP 客户端（包含重定向和超时控制）
+	userAgent string       // HTTP 请求的 User-Agent 头
+	cooldown  time.Duration // 同一主机相邻两次请求的最小间隔

-	rateMu   sync.Mutex
-	lastHit  map[string]time.Time // host → last request time
+	rateMu   sync.Mutex          // 保护 lastHit 限流表的互斥锁
+	lastHit  map[string]time.Time // 主机名 → 上次请求时间（用于计算限流等待）

-	robotsMu sync.Mutex
-	robots   map[string]*robotsEntry // host → parsed robots
+	robotsMu sync.Mutex                      // 保护 robots 缓存的互斥锁
+	robots   map[string]*robotsEntry // 主机名 → 该主机的 robots.txt 解析结果（含缓存时间）
 }

+// robotsEntry 缓存单台主机的 robots.txt 解析结果。
 type robotsEntry struct {
-	rules     []robotsRule
-	fetchedAt time.Time
+	rules     []robotsRule // 解析后的规则列表
+	fetchedAt time.Time    // 缓存时间（用于判断是否过期，24h 后重新抓取）
 }

+// robotsRule 一条 robots.txt 规则，对应一个 User-Agent 块。
 type robotsRule struct {
-	userAgent string
-	disallow  []string
-	allow     []string
+	userAgent string   // 适用的爬虫名称（"*" 表示全部）
+	disallow  []string // Disallow 路径列表
+	allow     []string // Allow 路径列表（优先于 disallow）
 }

-// NewFetcher creates a Fetcher with the given user-agent and per-host cooldown.
+// NewFetcher 创建一个新的 Fetcher 实例。
+// userAgent：发出的 HTTP 请求的 User-Agent；cooldown：同一主机相邻请求的最小间隔。
 func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
 	return &Fetcher{
 		client: &http.Client{
-			Timeout: 30 * time.Second,
+			Timeout: 30 * time.Second, // 默认单次请求超时 30 秒
 			CheckRedirect: func(req *http.Request, via []*http.Request) error {
+				// 跟随重定向最多 10 次，防止重定向循环
 				if len(via) >= 10 {
 					return fmt.Errorf("too many redirects")
 				}
@@ -67,34 +74,37 @@ func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
 		},
 		userAgent: userAgent,
 		cooldown:  cooldown,
-		lastHit:   make(map[string]time.Time),
-		robots:    make(map[string]*robotsEntry),
+		lastHit:   make(map[string]time.Time), // 限流表初始化
+		robots:    make(map[string]*robotsEntry), // robots.txt 缓存初始化
 	}
 }

-// Fetch fetches url, respecting robots.txt and rate limits.
-// polite=false skips both checks (used by search server snippet fetcher).
+// Fetch 抓取指定 URL，遵守 robots.txt 和主机限流。
+// polite=false 时跳过 robots.txt 检查和限流（用于搜索服务在线抓摘要）。
 func (f *Fetcher) Fetch(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
 	return f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
 }

-// FetchSafe wraps Fetch and returns (nil, nil) on expected errors.
+// FetchSafe 封装 Fetch，在遇到预期爬取错误（404/disallowed/非 HTML）时返回 (nil, nil)。
+// 调用方无需区分错误类型，直接跳过即可。
 func (f *Fetcher) FetchSafe(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
 	res, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
 	if _, ok := err.(*ErrCrawl); ok {
-		return nil, nil
+		return nil, nil // 预期错误，静默丢弃
 	}
 	return res, err
 }

-// fetchWithHistory does the actual request and populates redirect history.
+// fetchWithHistory 执行实际 HTTP 请求，追踪永久重定向。
 func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
+	// 解析 URL 提取主机名
 	parsed, err := url.Parse(rawURL)
 	if err != nil {
 		return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
 	}
 	host := parsed.Host

+	// polite 模式：先限流，再检查 robots.txt
 	if polite {
 		f.rateLimit(host)
 		if !f.robotsAllowed(rawURL, host) {
@@ -102,6 +112,7 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura
 		}
 	}

+	// 追踪永久重定向（301/308）
 	redirects := make(map[string]string)
 	client := &http.Client{
 		Timeout: timeout,
@@ -109,6 +120,7 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura
 			if len(via) >= 10 {
 				return fmt.Errorf("too many redirects")
 			}
+			// 记录永久重定向
 			if req.Response != nil && (req.Response.StatusCode == 301 || req.Response.StatusCode == 308) {
 				from := via[len(via)-1].URL.String()
 				to := req.URL.String()
@@ -118,26 +130,32 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura
 		},
 	}

+	// 构造 GET 请求
 	req, _ := http.NewRequest("GET", rawURL, nil)
 	req.Header.Set("User-Agent", f.userAgent)

+	// 发送请求
 	resp, err := client.Do(req)
 	if err != nil {
 		return nil, err
 	}
-	defer resp.Body.Close()
+	defer resp.Body.Close() // 读取完毕后关闭响应体

+	// 检查 HTTP 状态码
 	if resp.StatusCode == 404 {
 		return nil, &ErrCrawl{Msg: "404 not found"}
 	}
 	if resp.StatusCode >= 400 {
 		return nil, &ErrCrawl{Msg: fmt.Sprintf("HTTP %d", resp.StatusCode)}
 	}
+
+	// 检查 Content-Type，必须是 HTML 才继续
 	ct := resp.Header.Get("Content-Type")
 	if !strings.Contains(ct, "text/html") {
 		return nil, &ErrCrawl{Msg: "not html: " + ct}
 	}

+	// 解码响应体（自动检测字符集转为 UTF-8）
 	body, err := decodeBody(resp.Body, ct, sizeLimit)
 	if err != nil {
 		return nil, err
@@ -145,19 +163,20 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura

 	return &FetchResult{
 		Body:       body,
-		FinalURL:   resp.Request.URL.String(),
-		Redirects:  redirects,
+		FinalURL:   resp.Request.URL.String(), // 重定向后的最终 URL
+		Redirects:   redirects,
 		ServerType: resp.Header.Get("Server"),
 	}, nil
 }

-// rateLimit sleeps if the last request to host was too recent.
+// rateLimit 检查并强制执行主机限流：若距上次请求不足 cooldown 秒则 sleep 等待。
 func (f *Fetcher) rateLimit(host string) {
 	f.rateMu.Lock()
 	last, ok := f.lastHit[host]
 	now := time.Now()
 	f.lastHit[host] = now
-	// Periodically prune the map
+
+	// 限流表超过 10000 条时清理两倍 cooldown 时间之前的过期项，防止内存泄漏
 	if len(f.lastHit) > 10000 {
 		cutoff := now.Add(-f.cooldown * 2)
 		for k, v := range f.lastHit {
@@ -168,6 +187,7 @@ func (f *Fetcher) rateLimit(host string) {
 	}
 	f.rateMu.Unlock()

+	// 计算需要等待的时间
 	if ok {
 		elapsed := now.Sub(last)
 		if elapsed < f.cooldown {
@@ -176,12 +196,14 @@ func (f *Fetcher) rateLimit(host string) {
 	}
 }

-// robotsAllowed returns true if rawURL is crawlable.
+// robotsAllowed 根据 robots.txt 规则判断某 URL 是否允许爬取。
 func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
+	// 尝试从缓存读取（加锁保护）
 	f.robotsMu.Lock()
 	entry, ok := f.robots[host]
 	f.robotsMu.Unlock()

+	// 缓存不存在或已过期（超过 24 小时）则重新抓取并解析
 	if !ok || time.Since(entry.fetchedAt) > 24*time.Hour {
 		entry = f.fetchRobots(host, rawURL)
 		f.robotsMu.Lock()
@@ -189,6 +211,7 @@ func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
 		f.robotsMu.Unlock()
 	}

+	// 解析 URL 路径
 	parsed, err := url.Parse(rawURL)
 	if err != nil {
 		return false
@@ -198,43 +221,47 @@ func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
 		path = "/"
 	}

+	// 遍历所有规则，找到适用的 User-Agent
 	for _, rule := range entry.rules {
 		if rule.userAgent != "*" && !strings.EqualFold(rule.userAgent, f.userAgent) {
 			continue
 		}
-		// Check allow first (higher priority)
+		// Allow 优先检查（更高优先级）
 		for _, a := range rule.allow {
 			if strings.HasPrefix(path, a) {
 				return true
 			}
 		}
+		// 再检查 Disallow
 		for _, dis := range rule.disallow {
 			if dis != "" && strings.HasPrefix(path, dis) {
 				return false
 			}
 		}
 	}
-	return true
+	return true // 默认允许
 }

-// fetchRobots downloads and parses robots.txt for a host.
+// fetchRobots 抓取并解析某主机的 robots.txt 文件。
 func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
-	entry := &robotsEntry{fetchedAt: time.Now()}
+	entry := &robotsEntry{fetchedAt: time.Now()} // 初始化空条目（抓取失败时默认允许全部）
 	scheme := "https"
 	if strings.HasPrefix(exampleURL, "http://") {
 		scheme = "http"
 	}
 	robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)

+	// robots.txt 单独请求，超时 5 秒
 	client := &http.Client{Timeout: 5 * time.Second}
 	req, _ := http.NewRequest("GET", robotsURL, nil)
 	req.Header.Set("User-Agent", f.userAgent)
 	resp, err := client.Do(req)
 	if err != nil || resp.StatusCode != 200 {
-		return entry // allow all if robots.txt unavailable
+		return entry // robots.txt 不可用时默认允许爬取
 	}
 	defer resp.Body.Close()

+	// 最多读取 256KB（大部分 robots.txt 远小于此大小）
 	body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
 	if err != nil {
 		return entry
@@ -243,16 +270,19 @@ func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
 	return entry
 }

-// parseRobots is a minimal robots.txt parser.
+// parseRobots 最小化 robots.txt 解析器。
+// 支持 User-agent、Disallow、Allow 三种指令，忽略注释和空行。
 func parseRobots(content string) []robotsRule {
 	var rules []robotsRule
 	var current *robotsRule
 	for _, line := range strings.Split(content, "\n") {
 		line = strings.TrimSpace(line)
+		// 去除行内注释
 		if idx := strings.Index(line, "#"); idx >= 0 {
 			line = line[:idx]
 		}
 		if line == "" {
+			// 空行结束当前块
 			if current != nil {
 				rules = append(rules, *current)
 				current = nil
@@ -267,6 +297,7 @@ func parseRobots(content string) []robotsRule {
 		val := strings.TrimSpace(parts[1])
 		switch key {
 		case "user-agent":
+			// 新建一个 User-Agent 块
 			if current == nil {
 				current = &robotsRule{userAgent: val}
 			} else {
@@ -282,23 +313,25 @@ func parseRobots(content string) []robotsRule {
 			}
 		}
 	}
+	// 最后一个块
 	if current != nil {
 		rules = append(rules, *current)
 	}
 	return rules
 }

-// decodeBody reads at most sizeLimit bytes from r, auto-detecting charset.
+// decodeBody 从响应体读取最多 sizeLimit 字节，自动检测字符集并转为 UTF-8 字符串。
+// sizeLimit <= 0 时不限制大小。
 func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error) {
 	var reader io.Reader = r
 	if sizeLimit > 0 {
-		reader = io.LimitReader(r, int64(sizeLimit))
+		reader = io.LimitReader(r, int64(sizeLimit)) // 限制读取字节数，防止大文件撑爆内存
 	}

-	// Use golang.org/x/net/html/charset for auto-detection
+	// 使用 golang.org/x/net/html/charset 自动检测 HTML 编码并转为 UTF-8
 	utf8Reader, err := charset.NewReader(reader, contentType)
 	if err != nil {
-		// Fall back to reading raw and hoping for UTF-8
+		// 备选方案：直接以 UTF-8 读取（可能乱码但不崩溃）
 		data, readErr := io.ReadAll(reader)
 		if readErr != nil {
 			return "", readErr