加上中文注释

2026-04-08 17:48:05 +08:00
parent 6c2f5ad978
commit c154abf410
11 changed files with 830 additions and 560 deletions
@@ -1,64 +1,71 @@
 // Package crawler implements the HTTP fetching layer with robots.txt compliance,
 // per-host rate limiting, redirect tracking, and encoding detection.
+// crawler 包负责 HTTP 请求层：遵守 robots.txt、主机限流、追踪重定向、自动检测字符集。
 package crawler

 import (
-	"fmt"
-	"io"
-	"net/http"
-	"net/url"
-	"strings"
-	"sync"
-	"time"
+	"fmt"       // 字符串格式化（构建 robots.txt URL、错误信息）
+	"io"        // IO 接口（读取响应体）
+	"net/http"  // HTTP 客户端
+	"net/url"   // URL 解析
+	"strings"   // 字符串操作
+	"sync"      // 互斥锁（保护限流表和 robots.txt 缓存）
+	"time"      // 时间（限流间隔计算、robots.txt 缓存过期）

-	"golang.org/x/net/html/charset"
+	"golang.org/x/net/html/charset" // HTML 字符集自动检测（将各种编码转为 UTF-8）
 )

-// ErrCrawl is returned for expected crawl failures (404, disallowed, wrong content type…).
+// ErrCrawl 表示爬取过程中的预期错误（404、被 robots.txt 禁止、非 HTML 类型等）。
+// 此类错误由 FetchSafe 静默丢弃（返回 nil, nil）。
 type ErrCrawl struct {
-	Msg string
+	Msg string // 错误描述文本
 }

+// Error 实现 error 接口，返回错误描述。
 func (e *ErrCrawl) Error() string { return e.Msg }

-// FetchResult bundles the result of a successful fetch.
+// FetchResult 封装一次成功抓取的完整结果。
 type FetchResult struct {
-	Body        string            // decoded HTML body
-	FinalURL    string            // URL after redirects
-	Redirects   map[string]string // permanent redirects: from → to
-	ServerType  string
+	Body        string            // 解码后的 HTML 正文（UTF-8）
+	FinalURL    string            // 经过所有重定向后的最终 URL
+	Redirects   map[string]string // 永久重定向（301/308）映射：跳转前 URL → 跳转目标 URL
+	ServerType  string            // HTTP Server 响应头（如 "nginx/1.18"）
 }

-// Fetcher is a reusable HTTP client with robots.txt awareness and rate limiting.
+// Fetcher 是一个可复用的 HTTP 客户端，内置 robots.txt 合规检查和按主机限流。
 type Fetcher struct {
-	client    *http.Client
-	userAgent string
-	cooldown  time.Duration
+	client    *http.Client // HTTP 客户端（包含重定向和超时控制）
+	userAgent string       // HTTP 请求的 User-Agent 头
+	cooldown  time.Duration // 同一主机相邻两次请求的最小间隔

-	rateMu   sync.Mutex
-	lastHit  map[string]time.Time // host → last request time
+	rateMu   sync.Mutex          // 保护 lastHit 限流表的互斥锁
+	lastHit  map[string]time.Time // 主机名 → 上次请求时间（用于计算限流等待）

-	robotsMu sync.Mutex
-	robots   map[string]*robotsEntry // host → parsed robots
+	robotsMu sync.Mutex                      // 保护 robots 缓存的互斥锁
+	robots   map[string]*robotsEntry // 主机名 → 该主机的 robots.txt 解析结果（含缓存时间）
 }

+// robotsEntry 缓存单台主机的 robots.txt 解析结果。
 type robotsEntry struct {
-	rules     []robotsRule
-	fetchedAt time.Time
+	rules     []robotsRule // 解析后的规则列表
+	fetchedAt time.Time    // 缓存时间（用于判断是否过期，24h 后重新抓取）
 }

+// robotsRule 表示一条 robots.txt 规则，对应一个 User-Agent 块。
 type robotsRule struct {
-	userAgent string
-	disallow  []string
-	allow     []string
+	userAgent string   // 适用的爬虫名称（"*" 表示全部）
+	disallow  []string // Disallow 路径列表
+	allow     []string // Allow 路径列表（优先于 disallow）
 }

-// NewFetcher creates a Fetcher with the given user-agent and per-host cooldown.
+// NewFetcher 创建一个新的 Fetcher 实例。
+// userAgent：发出的 HTTP 请求的 User-Agent；cooldown：同一主机相邻请求的最小间隔。
 func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
 	return &Fetcher{
 		client: &http.Client{
-			Timeout: 30 * time.Second,
+			Timeout: 30 * time.Second, // 默认单次请求超时 30 秒
 			CheckRedirect: func(req *http.Request, via []*http.Request) error {
+				// 跟随重定向最多 10 次，防止重定向循环
 				if len(via) >= 10 {
 					return fmt.Errorf("too many redirects")
 				}
@@ -67,34 +74,37 @@ func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
 		},
 		userAgent: userAgent,
 		cooldown:  cooldown,
-		lastHit:   make(map[string]time.Time),
-		robots:    make(map[string]*robotsEntry),
+		lastHit:   make(map[string]time.Time), // 限流表初始化
+		robots:    make(map[string]*robotsEntry), // robots.txt 缓存初始化
 	}
 }

-// Fetch fetches url, respecting robots.txt and rate limits.
-// polite=false skips both checks (used by search server snippet fetcher).
+// Fetch 抓取指定 URL，遵守 robots.txt 和主机限流。
+// polite=false 时跳过 robots.txt 检查和限流（用于搜索服务在线抓摘要）。
 func (f *Fetcher) Fetch(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
 	return f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
 }

-// FetchSafe wraps Fetch and returns (nil, nil) on expected errors.
+// FetchSafe 封装 Fetch，在遇到预期爬取错误（404/disallowed/非 HTML）时返回 (nil, nil)。
+// 仅 *ErrCrawl 会被吞掉，其他错误（如网络请求失败）原样返回；调用方需自行处理 nil 结果。
 func (f *Fetcher) FetchSafe(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
 	res, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
 	if _, ok := err.(*ErrCrawl); ok {
-		return nil, nil
+		return nil, nil // 预期错误，静默丢弃
 	}
 	return res, err
 }

-// fetchWithHistory does the actual request and populates redirect history.
+// fetchWithHistory 执行实际 HTTP 请求，追踪永久重定向。
 func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
+	// 解析 URL 提取主机名
 	parsed, err := url.Parse(rawURL)
 	if err != nil {
 		return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
 	}
 	host := parsed.Host

+	// polite 模式：先限流，再检查 robots.txt
 	if polite {
 		f.rateLimit(host)
 		if !f.robotsAllowed(rawURL, host) {
@@ -102,6 +112,7 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura
 		}
 	}

+	// 追踪永久重定向（301/308）
 	redirects := make(map[string]string)
 	client := &http.Client{
 		Timeout: timeout,
@@ -109,6 +120,7 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura
 			if len(via) >= 10 {
 				return fmt.Errorf("too many redirects")
 			}
+			// 记录永久重定向
 			if req.Response != nil && (req.Response.StatusCode == 301 || req.Response.StatusCode == 308) {
 				from := via[len(via)-1].URL.String()
 				to := req.URL.String()
@@ -118,26 +130,32 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura
 		},
 	}

+	// 构造 GET 请求
 	req, _ := http.NewRequest("GET", rawURL, nil)
 	req.Header.Set("User-Agent", f.userAgent)

+	// 发送请求
 	resp, err := client.Do(req)
 	if err != nil {
 		return nil, err
 	}
-	defer resp.Body.Close()
+	defer resp.Body.Close() // 读取完毕后关闭响应体

+	// 检查 HTTP 状态码
 	if resp.StatusCode == 404 {
 		return nil, &ErrCrawl{Msg: "404 not found"}
 	}
 	if resp.StatusCode >= 400 {
 		return nil, &ErrCrawl{Msg: fmt.Sprintf("HTTP %d", resp.StatusCode)}
 	}
+
+	// 检查 Content-Type，必须是 HTML 才继续
 	ct := resp.Header.Get("Content-Type")
 	if !strings.Contains(ct, "text/html") {
 		return nil, &ErrCrawl{Msg: "not html: " + ct}
 	}

+	// 解码响应体（自动检测字符集转为 UTF-8）
 	body, err := decodeBody(resp.Body, ct, sizeLimit)
 	if err != nil {
 		return nil, err
@@ -145,19 +163,20 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura

 	return &FetchResult{
 		Body:       body,
-		FinalURL:   resp.Request.URL.String(),
-		Redirects:  redirects,
+		FinalURL:   resp.Request.URL.String(), // 重定向后的最终 URL
+		Redirects:   redirects,
 		ServerType: resp.Header.Get("Server"),
 	}, nil
 }

-// rateLimit sleeps if the last request to host was too recent.
+// rateLimit 检查并强制执行主机限流：若距上次请求的间隔小于 cooldown 则 sleep 等待。
 func (f *Fetcher) rateLimit(host string) {
 	f.rateMu.Lock()
 	last, ok := f.lastHit[host]
 	now := time.Now()
 	f.lastHit[host] = now
-	// Periodically prune the map
+
+	// 限流表超过 10000 条时清理两倍 cooldown 时间之前的过期项，防止内存泄漏
 	if len(f.lastHit) > 10000 {
 		cutoff := now.Add(-f.cooldown * 2)
 		for k, v := range f.lastHit {
@@ -168,6 +187,7 @@ func (f *Fetcher) rateLimit(host string) {
 	}
 	f.rateMu.Unlock()

+	// 计算需要等待的时间
 	if ok {
 		elapsed := now.Sub(last)
 		if elapsed < f.cooldown {
@@ -176,12 +196,14 @@ func (f *Fetcher) rateLimit(host string) {
 	}
 }

-// robotsAllowed returns true if rawURL is crawlable.
+// robotsAllowed 根据 robots.txt 规则判断某 URL 是否允许爬取。
 func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
+	// 尝试从缓存读取（加锁保护）
 	f.robotsMu.Lock()
 	entry, ok := f.robots[host]
 	f.robotsMu.Unlock()

+	// 缓存不存在或已过期（超过 24 小时）则重新抓取并解析
 	if !ok || time.Since(entry.fetchedAt) > 24*time.Hour {
 		entry = f.fetchRobots(host, rawURL)
 		f.robotsMu.Lock()
@@ -189,6 +211,7 @@ func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
 		f.robotsMu.Unlock()
 	}

+	// 解析 URL 路径
 	parsed, err := url.Parse(rawURL)
 	if err != nil {
 		return false
@@ -198,43 +221,47 @@ func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
 		path = "/"
 	}

+	// 遍历所有规则，找到适用的 User-Agent
 	for _, rule := range entry.rules {
 		if rule.userAgent != "*" && !strings.EqualFold(rule.userAgent, f.userAgent) {
 			continue
 		}
-		// Check allow first (higher priority)
+		// Allow 优先检查（更高优先级）
 		for _, a := range rule.allow {
 			if strings.HasPrefix(path, a) {
 				return true
 			}
 		}
+		// 再检查 Disallow
 		for _, dis := range rule.disallow {
 			if dis != "" && strings.HasPrefix(path, dis) {
 				return false
 			}
 		}
 	}
-	return true
+	return true // 默认允许
 }

-// fetchRobots downloads and parses robots.txt for a host.
+// fetchRobots 抓取并解析某主机的 robots.txt 文件。
 func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
-	entry := &robotsEntry{fetchedAt: time.Now()}
+	entry := &robotsEntry{fetchedAt: time.Now()} // 初始化空条目（抓取失败时默认允许全部）
 	scheme := "https"
 	if strings.HasPrefix(exampleURL, "http://") {
 		scheme = "http"
 	}
 	robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)

+	// robots.txt 单独请求，超时 5 秒
 	client := &http.Client{Timeout: 5 * time.Second}
 	req, _ := http.NewRequest("GET", robotsURL, nil)
 	req.Header.Set("User-Agent", f.userAgent)
 	resp, err := client.Do(req)
 	if err != nil || resp.StatusCode != 200 {
-		return entry // allow all if robots.txt unavailable
+		return entry // robots.txt 不可用时默认允许爬取
 	}
 	defer resp.Body.Close()

+	// 最多读取 256KB（大部分 robots.txt 远小于此大小）
 	body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
 	if err != nil {
 		return entry
@@ -243,16 +270,19 @@ func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
 	return entry
 }

-// parseRobots is a minimal robots.txt parser.
+// parseRobots 最小化 robots.txt 解析器。
+// 支持 User-agent、Disallow、Allow 三种指令；会去除 # 注释，空行（含整行注释）表示当前块结束。
 func parseRobots(content string) []robotsRule {
 	var rules []robotsRule
 	var current *robotsRule
 	for _, line := range strings.Split(content, "\n") {
 		line = strings.TrimSpace(line)
+		// 去除行内注释
 		if idx := strings.Index(line, "#"); idx >= 0 {
 			line = line[:idx]
 		}
 		if line == "" {
+			// 空行结束当前块
 			if current != nil {
 				rules = append(rules, *current)
 				current = nil
@@ -267,6 +297,7 @@ func parseRobots(content string) []robotsRule {
 		val := strings.TrimSpace(parts[1])
 		switch key {
 		case "user-agent":
+			// 新建一个 User-Agent 块
 			if current == nil {
 				current = &robotsRule{userAgent: val}
 			} else {
@@ -282,23 +313,25 @@ func parseRobots(content string) []robotsRule {
 			}
 		}
 	}
+	// 最后一个块
 	if current != nil {
 		rules = append(rules, *current)
 	}
 	return rules
 }

-// decodeBody reads at most sizeLimit bytes from r, auto-detecting charset.
+// decodeBody 从响应体读取最多 sizeLimit 字节，自动检测字符集并转为 UTF-8 字符串。
+// sizeLimit <= 0 时不限制大小。
 func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error) {
 	var reader io.Reader = r
 	if sizeLimit > 0 {
-		reader = io.LimitReader(r, int64(sizeLimit))
+		reader = io.LimitReader(r, int64(sizeLimit)) // 限制读取字节数，防止大文件撑爆内存
 	}

-	// Use golang.org/x/net/html/charset for auto-detection
+	// 使用 golang.org/x/net/html/charset 自动检测 HTML 编码并转为 UTF-8
 	utf8Reader, err := charset.NewReader(reader, contentType)
 	if err != nil {
-		// Fall back to reading raw and hoping for UTF-8
+		// 备选方案：直接以 UTF-8 读取（可能乱码但不崩溃）
 		data, readErr := io.ReadAll(reader)
 		if readErr != nil {
 			return "", readErr