Signed-off-by: 吴文峰 <kevin@lmve.net>

2026-04-08 17:29:39 +08:00
commit 6c2f5ad978
15 changed files with 3651 additions and 0 deletions
@@ -0,0 +1,313 @@
+// Package crawler implements the HTTP fetching layer with robots.txt compliance,
+// per-host rate limiting, redirect tracking, and encoding detection.
+package crawler
+
+import (
+	"errors"
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"strings"
+	"sync"
+	"time"
+
+	"golang.org/x/net/html/charset"
+)
+
+// ErrCrawl is returned for expected crawl failures (404, disallowed, wrong content type…).
+// It is always handed out as *ErrCrawl; FetchSafe recognises that type and
+// suppresses it, so unexpected errors should not be wrapped in it.
+type ErrCrawl struct {
+	Msg string // human-readable reason for the failure; returned verbatim by Error
+}
+
+// Error implements the error interface by returning Msg unchanged.
+func (e *ErrCrawl) Error() string { return e.Msg }
+
+// FetchResult bundles the result of a successful fetch.
+type FetchResult struct {
+	Body        string            // decoded HTML body
+	FinalURL    string            // URL of the request that produced the final response, i.e. after redirects
+	Redirects   map[string]string // permanent (301/308) redirects followed: from → to
+	ServerType  string            // value of the response's Server header (empty if absent)
+}
+
+// Fetcher is a reusable HTTP client with robots.txt awareness and rate limiting.
+// Its two maps are each guarded by their own mutex. Construct it with
+// NewFetcher, which allocates both maps.
+type Fetcher struct {
+	client    *http.Client  // built by NewFetcher; the fetch paths in this file create their own per-call clients
+	userAgent string        // sent as the User-Agent header and matched against robots.txt groups
+	cooldown  time.Duration // per-host delay applied to polite fetches
+
+	rateMu   sync.Mutex           // guards lastHit
+	lastHit  map[string]time.Time // host → time recorded by rateLimit for the most recent request
+
+	robotsMu sync.Mutex              // guards robots
+	robots   map[string]*robotsEntry // host → parsed robots
+}
+
+// robotsEntry is the cached robots.txt state for one host.
+type robotsEntry struct {
+	rules     []robotsRule // parsed groups; empty when robots.txt was unavailable (treated as allow-all)
+	fetchedAt time.Time    // when the download was attempted; robotsAllowed refreshes entries older than 24h
+}
+
+// robotsRule holds the directives of one robots.txt group as produced by parseRobots.
+type robotsRule struct {
+	userAgent string   // value of the group's User-agent line ("*" addresses every crawler)
+	disallow  []string // Disallow values, compared to the URL path as prefixes
+	allow     []string // Allow values, compared to the URL path as prefixes
+}
+
+// NewFetcher builds a Fetcher that identifies itself as userAgent and uses
+// cooldown as the per-host delay for polite fetches. The embedded client has a
+// 30-second timeout and refuses to follow more than ten redirects; both
+// bookkeeping maps start out empty.
+func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
+	// Redirect policy for the embedded client: stop once ten hops were made.
+	limitRedirects := func(req *http.Request, via []*http.Request) error {
+		if len(via) < 10 {
+			return nil
+		}
+		return fmt.Errorf("too many redirects")
+	}
+
+	f := new(Fetcher)
+	f.client = &http.Client{Timeout: 30 * time.Second, CheckRedirect: limitRedirects}
+	f.userAgent = userAgent
+	f.cooldown = cooldown
+	f.lastHit = map[string]time.Time{}
+	f.robots = map[string]*robotsEntry{}
+	return f
+}
+
+// Fetch retrieves rawURL and returns the decoded body together with redirect
+// and server metadata.
+//
+// When polite is true the per-host rate limit and robots.txt are honoured;
+// polite=false skips both checks (used by search server snippet fetcher).
+// timeout is set on the HTTP client used for this request, and sizeLimit is
+// forwarded to the body decoder as the maximum number of bytes to read
+// (values <= 0 disable the limit). Expected crawl failures are returned as
+// *ErrCrawl; use FetchSafe to have those suppressed.
+func (f *Fetcher) Fetch(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
+	return f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
+}
+
+// FetchSafe wraps Fetch and returns (nil, nil) on expected errors.
+func (f *Fetcher) FetchSafe(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
+	res, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
+	if _, ok := err.(*ErrCrawl); ok {
+		return nil, nil
+	}
+	return res, err
+}
+
+// fetchWithHistory performs the GET request for rawURL and records the
+// permanent redirects followed on the way.
+//
+// When polite is true the call first waits for the per-host rate limit and
+// then consults robots.txt; a disallowed URL yields an *ErrCrawl. timeout is
+// set as the http.Client timeout for this request (covering redirects and
+// reading the body) and sizeLimit is passed to decodeBody as the maximum
+// number of raw bytes to read.
+//
+// Expected failures (URL that cannot be parsed or turned into a request,
+// robots.txt denial, HTTP status >= 400, non-HTML content type) are reported
+// as *ErrCrawl; transport and body-read errors are returned unchanged.
+func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
+	parsed, err := url.Parse(rawURL)
+	if err != nil {
+		return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
+	}
+	host := parsed.Host
+
+	if polite {
+		f.rateLimit(host)
+		if !f.robotsAllowed(rawURL, host) {
+			return nil, &ErrCrawl{Msg: "disallowed by robots.txt"}
+		}
+	}
+
+	// Permanent redirects seen while following the chain, keyed by source URL.
+	redirects := make(map[string]string)
+	client := &http.Client{
+		Timeout: timeout,
+		CheckRedirect: func(req *http.Request, via []*http.Request) error {
+			if len(via) >= 10 {
+				return fmt.Errorf("too many redirects")
+			}
+			// req.Response is the response that triggered this redirect; only
+			// 301 and 308 are permanent and therefore worth remembering.
+			if req.Response != nil {
+				switch req.Response.StatusCode {
+				case http.StatusMovedPermanently, http.StatusPermanentRedirect:
+					redirects[via[len(via)-1].URL.String()] = req.URL.String()
+				}
+			}
+			return nil
+		},
+	}
+
+	// Do not discard this error: a nil request would panic on Header.Set.
+	req, err := http.NewRequest(http.MethodGet, rawURL, nil)
+	if err != nil {
+		return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
+	}
+	req.Header.Set("User-Agent", f.userAgent)
+
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode == http.StatusNotFound {
+		return nil, &ErrCrawl{Msg: "404 not found"}
+	}
+	if resp.StatusCode >= 400 {
+		return nil, &ErrCrawl{Msg: fmt.Sprintf("HTTP %d", resp.StatusCode)}
+	}
+	ct := resp.Header.Get("Content-Type")
+	// Media type names are case-insensitive, so "Text/HTML" must be accepted
+	// too; the original header value is kept for the error message.
+	if !strings.Contains(strings.ToLower(ct), "text/html") {
+		return nil, &ErrCrawl{Msg: "not html: " + ct}
+	}
+
+	body, err := decodeBody(resp.Body, ct, sizeLimit)
+	if err != nil {
+		return nil, err
+	}
+
+	return &FetchResult{
+		Body:       body,
+		FinalURL:   resp.Request.URL.String(),
+		Redirects:  redirects,
+		ServerType: resp.Header.Get("Server"),
+	}, nil
+}
+
+// rateLimit blocks until a request to host is permitted by the per-host cooldown.
+//
+// While holding rateMu each caller reserves the next free slot for host: the
+// later of "now" and the previously reserved slot plus the cooldown. It then
+// sleeps outside the lock until that slot arrives. Recording the reserved
+// slot rather than the call time keeps consecutive requests a full cooldown
+// apart even when a caller arrives while an earlier one is still sleeping.
+// The first request to a host never waits.
+func (f *Fetcher) rateLimit(host string) {
+	f.rateMu.Lock()
+	now := time.Now()
+	slot := now
+	if last, ok := f.lastHit[host]; ok {
+		if next := last.Add(f.cooldown); next.After(slot) {
+			slot = next
+		}
+	}
+	f.lastHit[host] = slot
+	// Once the map is large, drop hosts whose last slot is older than two
+	// cooldowns; they can no longer delay anyone, so forgetting them is safe.
+	if len(f.lastHit) > 10000 {
+		cutoff := now.Add(-f.cooldown * 2)
+		for k, v := range f.lastHit {
+			if v.Before(cutoff) {
+				delete(f.lastHit, k)
+			}
+		}
+	}
+	f.rateMu.Unlock()
+
+	if wait := slot.Sub(now); wait > 0 {
+		time.Sleep(wait)
+	}
+}
+
+// robotsAllowed returns true if rawURL is crawlable.
+func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
+	f.robotsMu.Lock()
+	entry, ok := f.robots[host]
+	f.robotsMu.Unlock()
+
+	if !ok || time.Since(entry.fetchedAt) > 24*time.Hour {
+		entry = f.fetchRobots(host, rawURL)
+		f.robotsMu.Lock()
+		f.robots[host] = entry
+		f.robotsMu.Unlock()
+	}
+
+	parsed, err := url.Parse(rawURL)
+	if err != nil {
+		return false
+	}
+	path := parsed.Path
+	if path == "" {
+		path = "/"
+	}
+
+	for _, rule := range entry.rules {
+		if rule.userAgent != "*" && !strings.EqualFold(rule.userAgent, f.userAgent) {
+			continue
+		}
+		// Check allow first (higher priority)
+		for _, a := range rule.allow {
+			if strings.HasPrefix(path, a) {
+				return true
+			}
+		}
+		for _, dis := range rule.disallow {
+			if dis != "" && strings.HasPrefix(path, dis) {
+				return false
+			}
+		}
+	}
+	return true
+}
+
+// fetchRobots downloads and parses robots.txt for host.
+//
+// exampleURL is only used to pick the scheme: plain "http" when exampleURL
+// uses it (in any letter case), "https" otherwise. The request has a
+// 5-second timeout and at most 256 KiB of the response are read.
+//
+// The returned entry is never nil and always carries the time of the attempt.
+// Any failure (request cannot be built or sent, status other than 200, read
+// error) yields an entry without rules, which robotsAllowed treats as
+// "everything allowed".
+func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
+	entry := &robotsEntry{fetchedAt: time.Now()}
+	scheme := "https"
+	// url.Parse lower-cases the scheme, so "HTTP://..." is recognised too.
+	if u, err := url.Parse(exampleURL); err == nil && u.Scheme == "http" {
+		scheme = "http"
+	}
+	robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)
+
+	client := &http.Client{Timeout: 5 * time.Second}
+	req, err := http.NewRequest(http.MethodGet, robotsURL, nil)
+	if err != nil {
+		return entry // host could not form a valid URL; allow all
+	}
+	req.Header.Set("User-Agent", f.userAgent)
+	resp, err := client.Do(req)
+	if err != nil {
+		return entry // allow all if robots.txt unavailable
+	}
+	// Register the close before looking at the status so that non-200
+	// responses do not leak their body and connection.
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		return entry // allow all if robots.txt unavailable
+	}
+
+	body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
+	if err != nil {
+		return entry
+	}
+	entry.rules = parseRobots(string(body))
+	return entry
+}
+
+// parseRobots is a minimal robots.txt parser.
+//
+// It understands the User-agent, Disallow and Allow keys (case-insensitive);
+// every other key is ignored. "#" starts a comment that runs to the end of
+// the line. Blank and comment-only lines are skipped and do not end a group.
+//
+// A group starts with one or more consecutive User-agent lines and continues
+// until a User-agent line follows at least one Allow/Disallow line. A group
+// that names several agents is returned as one robotsRule per agent, all
+// carrying the same directives. Allow/Disallow lines that appear before any
+// User-agent line are dropped. Values are stored as written, including empty
+// ones.
+func parseRobots(content string) []robotsRule {
+	var (
+		rules    []robotsRule
+		agents   []string // User-agent values of the group being read
+		disallow []string
+		allow    []string
+		inRules  bool // the current group has seen an Allow/Disallow line
+	)
+	// flush emits the group collected so far and resets the accumulators.
+	flush := func() {
+		for _, ua := range agents {
+			rules = append(rules, robotsRule{userAgent: ua, disallow: disallow, allow: allow})
+		}
+		agents, disallow, allow, inRules = nil, nil, nil, false
+	}
+
+	for _, line := range strings.Split(content, "\n") {
+		// Strip the comment before trimming so that a comment-only line is
+		// simply skipped instead of being mistaken for a group separator.
+		if idx := strings.Index(line, "#"); idx >= 0 {
+			line = line[:idx]
+		}
+		line = strings.TrimSpace(line)
+		if line == "" {
+			continue
+		}
+		parts := strings.SplitN(line, ":", 2)
+		if len(parts) != 2 {
+			continue
+		}
+		key := strings.ToLower(strings.TrimSpace(parts[0]))
+		val := strings.TrimSpace(parts[1])
+		switch key {
+		case "user-agent":
+			// A User-agent line after directives opens a new group;
+			// directly consecutive ones share the upcoming directives.
+			if inRules {
+				flush()
+			}
+			agents = append(agents, val)
+		case "disallow":
+			if len(agents) > 0 {
+				disallow = append(disallow, val)
+				inRules = true
+			}
+		case "allow":
+			if len(agents) > 0 {
+				allow = append(allow, val)
+				inRules = true
+			}
+		}
+	}
+	flush()
+	return rules
+}
+
+// decodeBody reads at most sizeLimit bytes from r, auto-detecting charset.
+func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error) {
+	var reader io.Reader = r
+	if sizeLimit > 0 {
+		reader = io.LimitReader(r, int64(sizeLimit))
+	}
+
+	// Use golang.org/x/net/html/charset for auto-detection
+	utf8Reader, err := charset.NewReader(reader, contentType)
+	if err != nil {
+		// Fall back to reading raw and hoping for UTF-8
+		data, readErr := io.ReadAll(reader)
+		if readErr != nil {
+			return "", readErr
+		}
+		return string(data), nil
+	}
+	data, err := io.ReadAll(utf8Reader)
+	if err != nil {
+		return "", err
+	}
+	return string(data), nil
+}