// sese-engine-go/crawler/fetcher.go

// Package crawler implements the HTTP fetching layer with robots.txt compliance,
// per-host rate limiting, redirect tracking, and encoding detection.
package crawler

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/html/charset"
)

// ErrCrawl marks an expected, non-fatal crawl outcome such as a 404 response,
// a robots.txt refusal, or a non-HTML content type. Callers can tell these
// apart from transport failures by checking for *ErrCrawl.
type ErrCrawl struct {
	// Msg is the human-readable reason the crawl was abandoned.
	Msg string
}

// Error implements the error interface by returning Msg verbatim.
func (e *ErrCrawl) Error() string {
	return e.Msg
}

// FetchResult bundles the result of a successful fetch: the decoded page plus
// metadata about how the request was resolved.
type FetchResult struct {
	Body        string            // response body after charset decoding
	FinalURL    string            // URL of the request that produced the response, i.e. after redirects
	Redirects   map[string]string // permanent (301/308) redirects followed: from → to
	ServerType  string            // value of the response's Server header ("" if absent)
}

// Fetcher is a reusable HTTP client with robots.txt awareness and rate limiting.
// Its maps are guarded by the mutex declared directly above each of them.
type Fetcher struct {
	// client is created by NewFetcher.
	// NOTE(review): fetchWithHistory and fetchRobots build their own
	// per-request clients rather than using this one — confirm whether the
	// field is still needed.
	client    *http.Client
	// userAgent is sent as the User-Agent header and compared against
	// robots.txt User-agent groups.
	userAgent string
	// cooldown is the minimum spacing enforced between polite requests to
	// the same host.
	cooldown  time.Duration

	// rateMu guards lastHit.
	rateMu   sync.Mutex
	lastHit  map[string]time.Time // host → time recorded by rateLimit for the latest request

	// robotsMu guards robots.
	robotsMu sync.Mutex
	robots   map[string]*robotsEntry // host → cached robots.txt rules
}

// robotsEntry is the cached robots.txt state for one host.
type robotsEntry struct {
	// rules holds the parsed groups; it is left nil when robots.txt could
	// not be retrieved, which robotsAllowed treats as "everything allowed".
	rules     []robotsRule
	// fetchedAt is when the entry was created; robotsAllowed refreshes
	// entries older than 24 hours.
	fetchedAt time.Time
}

// robotsRule is one robots.txt group: a user-agent value and the path
// prefixes listed for it.
type robotsRule struct {
	userAgent string   // value of the User-agent line ("*" matches every crawler)
	disallow  []string // Disallow values, matched as path prefixes
	allow     []string // Allow values, matched as path prefixes
}

// NewFetcher builds a Fetcher that identifies itself as userAgent and keeps
// polite requests to the same host at least cooldown apart. The rate-limit and
// robots.txt caches start out empty.
func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
	// Abort a redirect chain once ten requests have already been made.
	limitRedirects := func(req *http.Request, via []*http.Request) error {
		if len(via) < 10 {
			return nil
		}
		return fmt.Errorf("too many redirects")
	}

	f := new(Fetcher)
	f.userAgent = userAgent
	f.cooldown = cooldown
	f.lastHit = make(map[string]time.Time)
	f.robots = make(map[string]*robotsEntry)
	f.client = &http.Client{
		Timeout:       30 * time.Second,
		CheckRedirect: limitRedirects,
	}
	return f
}

// Fetch retrieves rawURL and returns the decoded page together with redirect
// metadata.
//
// With polite set, the per-host rate limit and robots.txt are honoured; with
// polite unset both checks are skipped (used by the search server snippet
// fetcher). timeout bounds the HTTP exchange and sizeLimit caps how many body
// bytes are read.
func (f *Fetcher) Fetch(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
	result, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
	return result, err
}

// FetchSafe behaves like Fetch but swallows expected crawl failures: when the
// fetch fails with an *ErrCrawl it returns (nil, nil). Any other error is
// passed through together with whatever result accompanied it.
func (f *Fetcher) FetchSafe(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
	result, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
	switch err.(type) {
	case *ErrCrawl:
		// Expected outcome (404, robots.txt, non-HTML…): nothing to report.
		return nil, nil
	default:
		return result, err
	}
}

// fetchWithHistory performs the GET request for rawURL and records every
// permanent (301/308) redirect followed on the way to the final response.
//
// When polite is true the call first waits for the per-host rate limit and
// then consults robots.txt for rawURL; redirect targets are not re-checked.
// timeout bounds the whole exchange (zero means no timeout, as for
// http.Client) and sizeLimit caps the number of body bytes read (a
// non-positive value means unlimited).
//
// Expected failures — an unusable URL, a robots.txt refusal, an HTTP status
// of 400 or above, or a non-HTML content type — are returned as *ErrCrawl.
// Transport and body-decoding errors are returned unchanged.
func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
	}
	host := parsed.Host

	if polite {
		f.rateLimit(host)
		if !f.robotsAllowed(rawURL, host) {
			return nil, &ErrCrawl{Msg: "disallowed by robots.txt"}
		}
	}

	// Build the request before anything dereferences it: a failed
	// NewRequest returns a nil *http.Request.
	req, err := http.NewRequest("GET", rawURL, nil)
	if err != nil {
		return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
	}
	req.Header.Set("User-Agent", f.userAgent)

	// A fresh client per call lets CheckRedirect capture this call's own
	// redirect map; with no Transport set it still uses the shared default.
	redirects := make(map[string]string)
	client := &http.Client{
		Timeout: timeout,
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			if len(via) >= 10 {
				return fmt.Errorf("too many redirects")
			}
			// req.Response is the response that triggered this hop.
			if prev := req.Response; prev != nil &&
				(prev.StatusCode == http.StatusMovedPermanently || prev.StatusCode == http.StatusPermanentRedirect) {
				from := via[len(via)-1].URL.String()
				redirects[from] = req.URL.String()
			}
			return nil
		},
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode == http.StatusNotFound {
		return nil, &ErrCrawl{Msg: "404 not found"}
	}
	if resp.StatusCode >= http.StatusBadRequest {
		return nil, &ErrCrawl{Msg: fmt.Sprintf("HTTP %d", resp.StatusCode)}
	}

	// Media types are case-insensitive, so compare in lower case; the
	// original header value is kept for charset detection and messages.
	ct := resp.Header.Get("Content-Type")
	if !strings.Contains(strings.ToLower(ct), "text/html") {
		return nil, &ErrCrawl{Msg: "not html: " + ct}
	}

	body, err := decodeBody(resp.Body, ct, sizeLimit)
	if err != nil {
		return nil, err
	}

	return &FetchResult{
		Body:       body,
		FinalURL:   resp.Request.URL.String(),
		Redirects:  redirects,
		ServerType: resp.Header.Get("Server"),
	}, nil
}

// rateLimit blocks until a request to host may be sent without coming closer
// than f.cooldown to the previous request for that host.
//
// Each call reserves a send slot under the lock: either "now", or the previous
// slot plus cooldown when that lies in the future. Recording the slot — rather
// than the time of the call — keeps requests spaced correctly even when the
// previous caller is still sleeping towards its own slot.
func (f *Fetcher) rateLimit(host string) {
	f.rateMu.Lock()
	now := time.Now()
	slot := now
	if last, seen := f.lastHit[host]; seen {
		if earliest := last.Add(f.cooldown); earliest.After(now) {
			slot = earliest
		}
	}
	f.lastHit[host] = slot

	// Once the map grows large, drop hosts whose slot is older than two
	// cooldowns; they can no longer delay anything.
	if len(f.lastHit) > 10000 {
		cutoff := now.Add(-f.cooldown * 2)
		for h, t := range f.lastHit {
			if t.Before(cutoff) {
				delete(f.lastHit, h)
			}
		}
	}
	f.rateMu.Unlock()

	// Sleep outside the lock so other hosts are not held up.
	if wait := slot.Sub(now); wait > 0 {
		time.Sleep(wait)
	}
}

// robotsAllowed reports whether rawURL may be crawled according to the
// robots.txt of host. The parsed rules are cached per host and refreshed once
// they are more than 24 hours old. An unparsable rawURL is reported as not
// allowed.
//
// Rule selection follows RFC 9309:
//   - groups naming this fetcher's user agent (case-insensitive) replace the
//     "*" groups; the "*" groups apply only when no named group matches;
//   - among the applicable Allow and Disallow prefixes the longest match
//     decides, with Allow winning a tie;
//   - empty patterns match nothing, and a path no rule matches is allowed.
//
// Only the URL path is compared; wildcards ("*", "$") are not interpreted.
func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
	f.robotsMu.Lock()
	entry, ok := f.robots[host]
	f.robotsMu.Unlock()

	if !ok || time.Since(entry.fetchedAt) > 24*time.Hour {
		// Fetch outside the lock so a slow host does not block lookups for
		// other hosts; concurrent callers may fetch twice, last write wins.
		entry = f.fetchRobots(host, rawURL)
		f.robotsMu.Lock()
		f.robots[host] = entry
		f.robotsMu.Unlock()
	}

	parsed, err := url.Parse(rawURL)
	if err != nil {
		return false
	}
	path := parsed.Path
	if path == "" {
		path = "/"
	}

	// Does any group address this crawler by name?
	hasSpecific := false
	for _, rule := range entry.rules {
		if rule.userAgent != "*" && strings.EqualFold(rule.userAgent, f.userAgent) {
			hasSpecific = true
			break
		}
	}

	// longest returns the length of the longest non-empty pattern that is a
	// prefix of path, or -1 when none matches.
	longest := func(patterns []string) int {
		best := -1
		for _, p := range patterns {
			if p != "" && len(p) > best && strings.HasPrefix(path, p) {
				best = len(p)
			}
		}
		return best
	}

	bestAllow, bestDisallow := -1, -1
	for _, rule := range entry.rules {
		if rule.userAgent == "*" {
			if hasSpecific {
				continue // a named group overrides the catch-all
			}
		} else if !strings.EqualFold(rule.userAgent, f.userAgent) {
			continue
		}
		if n := longest(rule.allow); n > bestAllow {
			bestAllow = n
		}
		if n := longest(rule.disallow); n > bestDisallow {
			bestDisallow = n
		}
	}

	// No disallow match leaves bestDisallow at -1, so the path is allowed.
	return bestAllow >= bestDisallow
}

// fetchRobots downloads and parses robots.txt for host and returns a fresh
// cache entry stamped with the current time.
//
// The scheme is taken from exampleURL: "http" when it starts with "http://",
// otherwise "https". The download is best-effort: if the request cannot be
// built or sent, the status is not 200, or the body cannot be read, the
// returned entry has no rules, which callers treat as "everything allowed".
// At most 256 KiB of the file is read.
func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
	entry := &robotsEntry{fetchedAt: time.Now()}
	scheme := "https"
	if strings.HasPrefix(exampleURL, "http://") {
		scheme = "http"
	}
	robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)

	req, err := http.NewRequest("GET", robotsURL, nil)
	if err != nil {
		return entry // cannot even form the request: allow all
	}
	req.Header.Set("User-Agent", f.userAgent)

	client := &http.Client{Timeout: 5 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return entry // allow all if robots.txt unavailable
	}
	// Close the body on every path, including non-200 responses, so the
	// underlying connection is released.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return entry // allow all if robots.txt unavailable
	}

	body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
	if err != nil {
		return entry
	}
	entry.rules = parseRobots(string(body))
	return entry
}

// parseRobots is a minimal robots.txt parser. It returns one robotsRule per
// User-agent line, each carrying the Allow/Disallow values of its group.
//
// Grouping follows RFC 9309:
//   - a group starts with one or more consecutive User-agent lines, all of
//     which share the rules that follow;
//   - a User-agent line that appears after an Allow/Disallow line starts a
//     new group;
//   - blank lines and comments ("#" to end of line) are ignored and never end
//     a group;
//   - Allow/Disallow lines before the first User-agent line are dropped.
//
// Empty Allow/Disallow values are not stored because they match nothing.
// Keys are case-insensitive; other directives (Sitemap, Crawl-delay, …) are
// skipped.
func parseRobots(content string) []robotsRule {
	var rules []robotsRule
	groupStart := 0 // index in rules of the first user agent of the open group
	sawRule := false // whether the open group already has an Allow/Disallow line

	// addToGroup appends val to the chosen list of every rule in the open group.
	addToGroup := func(val string, allow bool) {
		sawRule = true
		if val == "" {
			return
		}
		for i := groupStart; i < len(rules); i++ {
			if allow {
				rules[i].allow = append(rules[i].allow, val)
			} else {
				rules[i].disallow = append(rules[i].disallow, val)
			}
		}
	}

	for _, line := range strings.Split(content, "\n") {
		// Strip the comment first so a comment-only line is simply skipped.
		if idx := strings.Index(line, "#"); idx >= 0 {
			line = line[:idx]
		}
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}
		key := strings.ToLower(strings.TrimSpace(parts[0]))
		val := strings.TrimSpace(parts[1])

		switch key {
		case "user-agent":
			if sawRule {
				// Rules were already seen, so this line opens a new group.
				groupStart = len(rules)
				sawRule = false
			}
			rules = append(rules, robotsRule{userAgent: val})
		case "disallow":
			addToGroup(val, false)
		case "allow":
			addToGroup(val, true)
		}
	}
	return rules
}

// decodeBody reads at most sizeLimit bytes from r (everything when sizeLimit
// is not positive) and returns them converted to UTF-8.
//
// The source encoding is chosen by charset.DetermineEncoding from the
// Content-Type header value and the leading bytes of the document. The body is
// read in full before decoding so that, should decoding fail, the untouched
// bytes can still be returned as-is. A read error is returned to the caller
// with an empty string.
func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error) {
	if sizeLimit > 0 {
		r = io.LimitReader(r, int64(sizeLimit))
	}

	raw, err := io.ReadAll(r)
	if err != nil {
		return "", err
	}

	// Use golang.org/x/net/html/charset for auto-detection.
	enc, _, _ := charset.DetermineEncoding(raw, contentType)
	decoded, err := enc.NewDecoder().Bytes(raw)
	if err != nil {
		// Fall back to the raw bytes and hope they are UTF-8.
		return string(raw), nil
	}
	return string(decoded), nil
}