Signed-off-by: 吴文峰 <kevin@lmve.net>

2026-04-08 17:29:39 +08:00
commit 6c2f5ad978
15 changed files with 3651 additions and 0 deletions
@@ -0,0 +1,588 @@
+// crawler.go — BFS crawl loop, URL scheduling, and site-info updating.
+package crawler
+
+import (
+	"bytes"
+	"encoding/json"
+	"log"
+	"math"
+	"math/rand"
+	"net/http"
+	"net/url"
+	"sort"
+	"strings"
+	"sync"
+	"sync/atomic"
+	"time"
+
+	"sese-engine/analyzer"
+	"sese-engine/config"
+	"sese-engine/parser"
+	"sese-engine/storage"
+)
+
+
+// Stats holds real-time crawl counters.
+//
+// Within this file the fields are updated with atomic.AddInt64 and read with
+// atomic.LoadInt64 (see GetStats); access them the same way elsewhere.
+type Stats struct {
+	VisitedURLs    int64 // incremented once per visitURL call, before the fetch
+	SuccessURLs    int64 // incremented when a fetch returns a result without error
+	KeywordsFetched int64 // keywords handed to the harvester, after per-page capping
+}
+
+// Crawler orchestrates the BFS crawl: it fetches pages, stores snippets and
+// site info through db, extracts keywords with analyzer, and schedules the
+// next round of URLs.
+type Crawler struct {
+	fetcher    *Fetcher           // HTTP layer used by visitURL
+	db         *storage.DB        // snippet and site-info storage
+	analyzer   *analyzer.Analyzer // keyword extraction and language detection
+	prosperMap map[string]float64 // domain → backlink score (loaded from info)
+	stats      Stats              // counters; accessed atomically
+}
+
+// New builds a Crawler around the given database handle, analyzer and
+// per-domain backlink-score map. Its Fetcher identifies itself as
+// config.SpiderName and uses config.CrawlerCooldown seconds as the per-host
+// cooldown.
+func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
+	c := new(Crawler)
+	c.fetcher = NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second)
+	c.db = db
+	c.analyzer = a
+	c.prosperMap = prosperMap
+	return c
+}
+
+// URLWeight pairs a discovered URL with its discovery weight.
+// Run assigns each link found on a page the weight 1/(number of links
+// returned for that page).
+type URLWeight struct {
+	URL    string
+	Weight float64
+}
+
+// Run performs a breadth-first crawl starting at entryURL for at most
+// maxEpoch rounds and blocks until it is finished.
+//
+// Each round marks the whole queue as visited, visits every queued URL with
+// at most config.CrawlerWorkers concurrent workers, collects the links that
+// have not been visited yet (each weighted by 1/len(links of its page)) and
+// lets schedule pick the next queue. The crawl stops early when a round
+// discovers no new links.
+func (c *Crawler) Run(entryURL string, maxEpoch int) {
+	seen := map[string]bool{}
+	frontier := []string{entryURL}
+
+	for epoch := 1; epoch <= maxEpoch; epoch++ {
+		log.Printf("[crawler] epoch %d/%d  queue=%d", epoch, maxEpoch, len(frontier))
+
+		// Mark the whole round up front: the workers below only read the map.
+		for _, queued := range frontier {
+			seen[queued] = true
+		}
+
+		var (
+			discovered []URLWeight
+			guard      sync.Mutex
+			pending    sync.WaitGroup
+		)
+
+		// slots bounds the number of in-flight visits.
+		slots := make(chan struct{}, config.CrawlerWorkers)
+		for _, queued := range frontier {
+			pending.Add(1)
+			slots <- struct{}{}
+			go func(target string) {
+				defer pending.Done()
+				defer func() { <-slots }()
+
+				links := c.visitURL(target)
+				if len(links) == 0 {
+					return
+				}
+				share := 1.0 / float64(len(links))
+
+				guard.Lock()
+				defer guard.Unlock()
+				for _, link := range links {
+					if seen[link] {
+						continue
+					}
+					discovered = append(discovered, URLWeight{URL: link, Weight: share})
+				}
+			}(queued)
+		}
+		pending.Wait()
+
+		if len(discovered) == 0 {
+			log.Println("[crawler] empty queue — stopping")
+			return
+		}
+
+		frontier = c.schedule(discovered)
+	}
+}
+
+// visitURL fetches rawURL politely (10s timeout, no size limit) and processes
+// the page.
+//
+// On a fetch error or nil result it records a failure for the URL's host and
+// returns nil. On success it:
+//   - caches a snippet (description and text truncated to 256 bytes) when the
+//     final URL is shorter than 250 bytes,
+//   - extracts keywords, caps them at config.MaxKeywordsPerPage and posts them
+//     to the harvester from a separate goroutine,
+//   - updates the site info of the final host,
+//   - records every permanent redirect in the site info of its source host,
+//   - returns the discovered hrefs, randomly sampled down to at most 100.
+//
+// Storage errors are deliberately ignored: all bookkeeping is best effort.
+func (c *Crawler) visitURL(rawURL string) []string {
+	atomic.AddInt64(&c.stats.VisitedURLs, 1)
+
+	res, err := c.fetcher.fetchWithHistory(rawURL, true, 10*time.Second, 0)
+	if err != nil || res == nil {
+		c.updateSiteFailure(rawURL)
+		return nil
+	}
+
+	atomic.AddInt64(&c.stats.SuccessURLs, 1)
+
+	title, desc, text, hrefs := parser.ParseHTML(res.Body, res.FinalURL)
+
+	// Cache snippet
+	if len(res.FinalURL) < 250 {
+		_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
+			Title:       title,
+			Description: truncate(desc, 256),
+			Text:        truncate(text, 256),
+			Timestamp:   time.Now().Unix(),
+		})
+	}
+
+	// Keyword extraction → send to harvester
+	kws := c.analyzer.Analyze(title, desc, text)
+	if len(kws) > 0 {
+		if len(kws) > config.MaxKeywordsPerPage {
+			kws = kws[:config.MaxKeywordsPerPage]
+		}
+		atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
+		go c.sendToHarvester(res.FinalURL, kws)
+	}
+
+	// Update site info
+	host := netloc(res.FinalURL)
+	c.updateSiteSuccess(host, res, title, desc, text, hrefs)
+
+	// Handle permanent redirects in site info
+	for from, to := range res.Redirects {
+		fromHost := netloc(from)
+		if fromHost == "" {
+			continue
+		}
+		info, _ := c.db.GetSiteInfo(fromHost)
+		if info == nil {
+			// The lookup error is ignored, so guard against a nil record
+			// instead of dereferencing it below.
+			info = &storage.SiteInfo{}
+		}
+		if info.Redirects == nil {
+			info.Redirects = make(map[string]string)
+		}
+		info.Redirects[from] = to
+		if len(info.Redirects) > 50 {
+			// keep most important (just truncate randomly for now)
+			info.Redirects = truncateMap(info.Redirects, 40)
+		}
+		_ = c.db.SetSiteInfo(fromHost, info)
+	}
+
+	// Trim hrefs
+	if len(hrefs) > 100 {
+		hrefs = sampleStrings(hrefs, 100)
+	}
+	return hrefs
+}
+
+// updateSiteFailure records a failed fetch of rawURL against its host by
+// multiplying the host's success rate by 0.99 (a host without a recorded
+// rate starts at 0). URLs without a host are ignored. Storage errors are
+// ignored: the update is best effort.
+func (c *Crawler) updateSiteFailure(rawURL string) {
+	host := netloc(rawURL)
+	if host == "" {
+		return
+	}
+	info, _ := c.db.GetSiteInfo(host)
+	if info == nil {
+		// The lookup error is ignored, so guard against a nil record.
+		info = &storage.SiteInfo{}
+	}
+	if info.SuccessRate == nil {
+		zero := 0.0
+		info.SuccessRate = &zero
+	}
+	*info.SuccessRate *= 0.99
+	_ = c.db.SetSiteInfo(host, info)
+}
+
+// updateSiteSuccess folds a successful fetch into the stored site info of
+// host: visit count and time, success rate (moved 1% towards 1), HTTPS
+// availability, the last five distinct Server header values and, on a sample
+// of visits, detected languages and external out-links.
+//
+// An empty host is ignored, matching updateSiteFailure. Storage errors are
+// ignored: the update is best effort.
+func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc, text string, hrefs []string) {
+	if host == "" {
+		return
+	}
+	info, _ := c.db.GetSiteInfo(host)
+	if info == nil {
+		// The lookup error is ignored, so guard against a nil record.
+		info = &storage.SiteInfo{}
+	}
+
+	info.VisitCount++
+	info.LastVisitTime = time.Now().Unix()
+
+	one := 1.0
+	if info.SuccessRate == nil {
+		info.SuccessRate = &one
+	}
+	*info.SuccessRate = *info.SuccessRate*0.99 + 0.01
+
+	if strings.HasPrefix(res.FinalURL, "https://") {
+		t := true
+		info.HTTPSAvailable = &t
+	}
+
+	if res.ServerType != "" {
+		found := false
+		for _, s := range info.ServerTypes {
+			if s == res.ServerType {
+				found = true
+				break
+			}
+		}
+		if !found {
+			// Remember only the five most recently seen server types.
+			info.ServerTypes = append(info.ServerTypes, res.ServerType)
+			if len(info.ServerTypes) > 5 {
+				info.ServerTypes = info.ServerTypes[len(info.ServerTypes)-5:]
+			}
+		}
+	}
+
+	// Language detection and out-link collection run while the (already
+	// incremented) visit count is below 10, and on a random 10% afterwards.
+	if info.VisitCount < 10 || rand.Float64() < 0.1 {
+		lang := c.analyzer.DetectLanguage(title + " " + desc + " " + text)
+		if lang != "" {
+			if info.Languages == nil {
+				info.Languages = make(map[string]float64)
+			}
+			// Decay every known language, then credit the detected one;
+			// the step size shrinks as the visit count grows (capped at 0.2).
+			intensity := math.Min(0.2, 1/math.Sqrt(float64(info.VisitCount+1)))
+			for k := range info.Languages {
+				info.Languages[k] *= (1 - intensity)
+			}
+			info.Languages[lang] += intensity
+		}
+		// Collect external links
+		superHost := superNetloc(res.FinalURL)
+		var external []string
+		for _, h := range hrefs {
+			if superNetloc(h) != superHost {
+				external = append(external, h)
+			}
+		}
+		sampled := sampleStrings(external, 10)
+		info.OutLinks = append(info.OutLinks, sampled...)
+		if len(info.OutLinks) > 250 {
+			info.OutLinks = sampleStrings(info.OutLinks, 200)
+		}
+	}
+
+	_ = c.db.SetSiteInfo(host, info)
+}
+
+// sendToHarvester POSTs the keywords extracted from finalURL as JSON to the
+// harvester service at config.HarvesterAddr + "/l".
+//
+// It is started as a fire-and-forget goroutine by visitURL, so the request
+// carries a timeout: a stalled harvester must not pile up goroutines.
+// Failures are logged and otherwise ignored.
+func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
+	type payload struct {
+		URL      string             `json:"url"`
+		Keywords []analyzer.Keyword `json:"keywords"`
+	}
+	data, err := json.Marshal(payload{URL: finalURL, Keywords: kws})
+	if err != nil {
+		log.Printf("[crawler] harvester payload encode failed: %v", err)
+		return
+	}
+
+	// A nil Transport means the shared default transport, so connections are
+	// still pooled across calls.
+	client := &http.Client{Timeout: 10 * time.Second}
+	resp, err := client.Post(config.HarvesterAddr+"/l", "application/json", bytes.NewReader(data))
+	if err != nil {
+		log.Printf("[crawler] harvester post failed: %v", err)
+		return
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode >= 300 {
+		log.Printf("[crawler] harvester post failed: status %s", resp.Status)
+	}
+}
+
+// schedule selects and prioritises the next BFS queue from raw discovered
+// links.
+//
+// Steps:
+//  1. cap the input at 100000 random links;
+//  2. load site info for every host and registrable domain involved, using at
+//     most config.CrawlerWorkers concurrent lookups;
+//  3. score every link and draw a weighted sample of
+//     min(45000, len/3+250) URLs;
+//  4. limit per-domain concentration (config.CrawlFocus);
+//  5. keep at most one non-https URL per four https URLs;
+//  6. thin out URLs of domains without a backlink score relative to
+//     config.ExpectedProsperRatio;
+//  7. shuffle the result.
+func (c *Crawler) schedule(links []URLWeight) []string {
+	if len(links) > 100000 {
+		links = sampleURLWeights(links, 100000)
+	}
+
+	// Pre-fetch site info for all involved domains
+	domains := make(map[string]bool)
+	for _, lw := range links {
+		if h := netloc(lw.URL); h != "" {
+			domains[h] = true
+		}
+		if h := superNetloc(lw.URL); h != "" {
+			domains[h] = true
+		}
+	}
+	siteCache := make(map[string]*storage.SiteInfo, len(domains))
+	var (
+		mu sync.Mutex
+		wg sync.WaitGroup
+	)
+	// Bound the fan-out: there can be up to two entries per link, and one
+	// goroutine per entry would hit the database with all of them at once.
+	sem := make(chan struct{}, config.CrawlerWorkers)
+	for d := range domains {
+		wg.Add(1)
+		sem <- struct{}{}
+		go func(host string) {
+			defer wg.Done()
+			defer func() { <-sem }()
+			info, _ := c.db.GetSiteInfo(host)
+			mu.Lock()
+			siteCache[host] = info
+			mu.Unlock()
+		}(d)
+	}
+	wg.Wait()
+
+	// Score each URL
+	scored := make([]scoredURL, len(links))
+	for i, lw := range links {
+		scored[i] = scoredURL{url: lw.URL, score: c.scoreURL(lw, siteCache)}
+	}
+
+	// Weighted random sample (45000 or 1/3+250 whichever smaller)
+	k := min(45000, len(scored)/3+250)
+	selected := weightedSample(scored, k)
+
+	// Domain concentration filtering
+	selected = concentrationFilter(selected, config.CrawlFocus)
+
+	// Separate https/http, cap http at 1/4 of https count
+	var httpsURLs, httpURLs []string
+	for _, s := range selected {
+		if strings.HasPrefix(s, "https://") {
+			httpsURLs = append(httpsURLs, s)
+		} else {
+			httpURLs = append(httpURLs, s)
+		}
+	}
+	maxHTTP := len(httpsURLs) / 4
+	if len(httpURLs) > maxHTTP {
+		httpURLs = sampleStrings(httpURLs, maxHTTP)
+	}
+
+	// Build the candidate list in a fresh slice so that appending cannot
+	// write into the backing array of httpsURLs.
+	candidates := make([]string, 0, len(httpsURLs)+len(httpURLs))
+	candidates = append(candidates, httpsURLs...)
+	candidates = append(candidates, httpURLs...)
+
+	// Separate prosperous / non-prosperous
+	var prosperURLs, otherURLs []string
+	for _, u := range candidates {
+		if c.prosperMap[netloc(u)] > 0 {
+			prosperURLs = append(prosperURLs, u)
+		} else {
+			otherURLs = append(otherURLs, u)
+		}
+	}
+	// n is the number of non-prosperous URLs that matches the expected ratio;
+	// at most len(selected)/10 URLs are dropped per round to approach it.
+	n := int(float64(len(prosperURLs)) * (1-config.ExpectedProsperRatio) / config.ExpectedProsperRatio)
+	if len(otherURLs) > n {
+		keep := max(len(otherURLs)-len(selected)/10, n)
+		if keep < len(otherURLs) {
+			otherURLs = sampleStrings(otherURLs, keep)
+		}
+	}
+
+	result := append(prosperURLs, otherURLs...)
+	rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] })
+	return result
+}
+
+// scoreURL computes the scheduling priority for a URL.
+func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo) float64 {
+	host := netloc(lw.URL)
+	super := superNetloc(lw.URL)
+
+	info := siteCache[host]
+	if info == nil {
+		info = &storage.SiteInfo{}
+	}
+
+	// Chinese-ness
+	var chineseness float64 = 0.5
+	if len(info.Languages) > 0 {
+		total := 0.0
+		for _, v := range info.Languages {
+			total += v
+		}
+		if total > 0 {
+			chineseness = info.Languages["zh"] / total
+		}
+	}
+
+	// Interest decay based on visit count
+	prosper := math.Min(62, c.prosperMap[host])
+	limit := prosper*500 + 50
+	b := math.Pow(0.1, 1/limit)
+	interest := math.Pow(b, float64(info.VisitCount))
+
+	var interest2 float64 = 1.0
+	if super != host {
+		superInfo := siteCache[super]
+		if superInfo != nil {
+			limit2 := math.Min(62, c.prosperMap[super])*500 + 50
+			b2 := math.Pow(0.1, 1/limit2)
+			interest2 = math.Pow(b2, float64(superInfo.VisitCount))
+		}
+	}
+
+	quality := 1.0
+	if info.Quality != nil {
+		quality = *info.Quality
+	}
+
+	prosperity := prosper
+	if prosperity > 0 {
+		prosperity += 0.5
+	}
+	prosperity = math.Log2(2+prosperity) + 1
+
+	bad := badURL(lw.URL)
+	return (0.1 + chineseness) * math.Min(0.05+interest, 0.05+interest2) * quality * (1 - bad) * lw.Weight * prosperity
+}
+
+// ---- helper functions ----
+
+// netloc returns the network location (host, including any port) of rawURL,
+// or "" when it cannot be determined.
+//
+// Absolute http/https URLs take a fast path that avoids full URL parsing;
+// everything else goes through url.Parse.
+func netloc(rawURL string) string {
+	parts := strings.SplitN(rawURL, "/", 4)
+	if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" {
+		host := parts[2]
+		// A URL without a path ("http://a.com?x=1", "http://a.com#top")
+		// leaves the query or fragment glued to the authority; cut it off.
+		if i := strings.IndexAny(host, "?#"); i >= 0 {
+			host = host[:i]
+		}
+		return host
+	}
+	u, err := url.Parse(rawURL)
+	if err != nil {
+		return ""
+	}
+	return u.Host
+}
+
+// superNetloc returns the last two dot-separated labels of rawURL's host
+// ("a.b.example.com" → "example.com"). Hosts with fewer than two labels are
+// returned unchanged.
+//
+// NOTE(review): this is purely textual — multi-label public suffixes such as
+// "co.uk", IP addresses and ports are not treated specially.
+func superNetloc(rawURL string) string {
+	host := netloc(rawURL)
+	last := strings.LastIndex(host, ".")
+	if last < 0 {
+		return host
+	}
+	// Start right after the dot that precedes the second-to-last label; with
+	// no such dot LastIndex yields -1 and the whole host is kept.
+	return host[strings.LastIndex(host[:last], ".")+1:]
+}
+
+// badURL rates how unattractive a URL looks for crawling. The result is
+// capped at 0.9.
+//
+// The base penalty grows with length beyond 30 bytes (1/200 per byte). Each
+// of the following then moves the penalty a fraction of the way towards 1:
+//   - contains ".htm" or ".php": 0.3
+//   - more than two "/" once trailing slashes are removed: 0.1
+//   - shorter than 5 bytes, or a ':' at index 4 (as in "http:"): 0.3
+func badURL(u string) float64 {
+	penalty := float64(len(u)-30) / 200.0
+	if penalty < 0 {
+		penalty = 0
+	}
+	// towardsOne moves the penalty the given fraction of the remaining
+	// distance to 1.
+	towardsOne := func(fraction float64) {
+		penalty += (1 - penalty) * fraction
+	}
+
+	looksDynamic := strings.Contains(u, ".htm") || strings.Contains(u, ".php")
+	isDeep := strings.Count(strings.TrimRight(u, "/"), "/") > 2
+	notHTTPS := len(u) < 5 || u[4] == ':'
+
+	if looksDynamic {
+		towardsOne(0.3)
+	}
+	if isDeep {
+		towardsOne(0.1)
+	}
+	if notHTTPS {
+		towardsOne(0.3)
+	}
+
+	if penalty > 0.9 {
+		return 0.9
+	}
+	return penalty
+}
+
+// truncate shortens s to at most n bytes without splitting a UTF-8 encoded
+// character: when the cut would land inside a multi-byte sequence it moves
+// back to the start of that sequence. A non-positive n yields "".
+func truncate(s string, n int) string {
+	if n <= 0 {
+		return ""
+	}
+	if len(s) <= n {
+		return s
+	}
+	// s[n] is the first byte that would be dropped. If it is a UTF-8
+	// continuation byte (10xxxxxx), the cut is inside a character.
+	for n > 0 && s[n]&0xC0 == 0x80 {
+		n--
+	}
+	return s[:n]
+}
+
+// sampleStrings returns n elements of s chosen uniformly at random without
+// replacement. When s has at most n elements it is returned as is (the same
+// slice, not a copy).
+func sampleStrings(s []string, n int) []string {
+	if n >= len(s) {
+		return s
+	}
+	order := rand.Perm(len(s))
+	picked := make([]string, 0, n)
+	for _, idx := range order[:n] {
+		picked = append(picked, s[idx])
+	}
+	return picked
+}
+
+// sampleURLWeights returns n elements of s chosen uniformly at random
+// without replacement. When s has at most n elements it is returned as is
+// (the same slice, not a copy).
+func sampleURLWeights(s []URLWeight, n int) []URLWeight {
+	if n >= len(s) {
+		return s
+	}
+	order := rand.Perm(len(s))
+	picked := make([]URLWeight, 0, n)
+	for _, idx := range order[:n] {
+		picked = append(picked, s[idx])
+	}
+	return picked
+}
+
+// scoredURL is a candidate URL together with its scheduling score.
+type scoredURL struct {
+	url   string
+	score float64
+}
+
+// weightedSample draws k URLs from items without replacement, each draw
+// picking among the remaining items with probability proportional to score.
+// When k >= len(items) every URL is returned in input order.
+//
+// Each item gets an exponentially distributed key with rate equal to its
+// score, and the k smallest keys win. The order of such keys is distributed
+// exactly like successive weighted draws, but costs one sort instead of a
+// scan of all items per drawn element.
+//
+// Items whose score is zero, negative or NaN are only picked after all
+// positively scored items, in input order. A non-positive k yields an empty
+// result.
+func weightedSample(items []scoredURL, k int) []string {
+	if k >= len(items) {
+		out := make([]string, len(items))
+		for i, s := range items {
+			out[i] = s.url
+		}
+		return out
+	}
+	if k <= 0 {
+		return []string{}
+	}
+
+	type keyed struct {
+		idx int
+		key float64
+	}
+	keys := make([]keyed, len(items))
+	for i, s := range items {
+		key := math.Inf(1)
+		if s.score > 0 { // false for NaN as well
+			key = rand.ExpFloat64() / s.score
+		}
+		keys[i] = keyed{idx: i, key: key}
+	}
+	// Stable, so items sharing the +Inf key keep their input order.
+	sort.SliceStable(keys, func(a, b int) bool { return keys[a].key < keys[b].key })
+
+	out := make([]string, k)
+	for i := range out {
+		out[i] = items[keys[i].idx].url
+	}
+	return out
+}
+
+// concentrationFilter limits how many URLs of one registrable domain
+// (superNetloc) survive into the next queue.
+//
+// A domain with g URLs gets the quota int(g^k). With more than one domain the
+// quota is additionally capped at max(10, 60% of the summed quotas of all
+// domains except the largest one), so a single dominant site cannot take
+// over the queue; with a single domain the cap is 10. Each domain keeps
+// 1 + min(cap, quota) randomly chosen URLs (never more than it has). The
+// result is shuffled.
+func concentrationFilter(urls []string, k float64) []string {
+	domainGroups := make(map[string][]string)
+	shuffled := make([]string, len(urls))
+	copy(shuffled, urls)
+	rand.Shuffle(len(shuffled), func(i, j int) { shuffled[i], shuffled[j] = shuffled[j], shuffled[i] })
+
+	// Grouping the shuffled list makes the prefix kept below a random subset.
+	for _, u := range shuffled {
+		d := superNetloc(u)
+		domainGroups[d] = append(domainGroups[d], u)
+	}
+
+	quota := func(group []string) int {
+		return int(math.Pow(float64(len(group)), k))
+	}
+
+	limit := 10
+	if len(domainGroups) > 1 {
+		// Sum every quota except the largest one. Tracking the maximum
+		// replaces sorting, which is not needed for a sum.
+		total, largest := 0, 0
+		for _, g := range domainGroups {
+			q := quota(g)
+			total += q
+			if q > largest {
+				largest = q
+			}
+		}
+		limit = max(10, int(float64(total-largest)*0.6))
+	}
+
+	var result []string
+	for _, g := range domainGroups {
+		sn := 1 + min(limit, quota(g))
+		if sn > len(g) {
+			sn = len(g)
+		}
+		result = append(result, g[:sn]...)
+	}
+	rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] })
+	return result
+}
+
+// truncateMap returns a map with at most n entries of m. When m already has
+// at most n entries it is returned itself; otherwise a new map is filled
+// with the first n entries met while ranging over m, so which entries
+// survive is unspecified.
+func truncateMap(m map[string]string, n int) map[string]string {
+	if n >= len(m) {
+		return m
+	}
+	kept := make(map[string]string, n)
+	for key, val := range m {
+		if len(kept) >= n {
+			break
+		}
+		kept[key] = val
+	}
+	return kept
+}
+
+// min returns the smaller of two ints.
+func min(a, b int) int {
+	if b <= a {
+		return b
+	}
+	return a
+}
+
+// max returns the larger of two ints.
+func max(a, b int) int {
+	if b >= a {
+		return b
+	}
+	return a
+}
+
+// Expose Stats for monitoring.
+func (c *Crawler) GetStats() Stats {
+	return Stats{
+		VisitedURLs:     atomic.LoadInt64(&c.stats.VisitedURLs),
+		SuccessURLs:     atomic.LoadInt64(&c.stats.SuccessURLs),
+		KeywordsFetched: atomic.LoadInt64(&c.stats.KeywordsFetched),
+	}
+}
@@ -0,0 +1,313 @@
+// Package crawler implements the HTTP fetching layer with robots.txt compliance,
+// per-host rate limiting, redirect tracking, and encoding detection.
+package crawler
+
+import (
+	"fmt"
+	"io"
+	"net/http"
+	"net/url"
+	"strings"
+	"sync"
+	"time"
+
+	"golang.org/x/net/html/charset"
+)
+
+// ErrCrawl is returned for expected crawl failures (404, disallowed, wrong content type…).
+// In this file it is produced for unparsable URLs, robots.txt denials, HTTP
+// status codes >= 400 and non-HTML content types; FetchSafe turns it into a
+// (nil, nil) result.
+type ErrCrawl struct {
+	Msg string // human-readable reason, returned verbatim by Error
+}
+
+// Error implements the error interface by returning Msg.
+func (e *ErrCrawl) Error() string { return e.Msg }
+
+// FetchResult bundles the result of a successful fetch.
+type FetchResult struct {
+	Body        string            // decoded HTML body
+	FinalURL    string            // URL after redirects
+	Redirects   map[string]string // permanent redirects: from → to (301 and 308 hops only)
+	ServerType  string            // value of the response's Server header, "" if absent
+}
+
+// Fetcher is a reusable HTTP client with robots.txt awareness and rate limiting.
+// The two maps are guarded by their own mutexes, so a Fetcher must not be
+// copied after first use.
+type Fetcher struct {
+	// client is built by NewFetcher. NOTE(review): fetchWithHistory and
+	// fetchRobots in this file construct their own http.Client values and do
+	// not read this field.
+	client    *http.Client
+	userAgent string        // sent as User-Agent and matched against robots.txt groups
+	cooldown  time.Duration // minimum spacing between polite requests to one host
+
+	rateMu   sync.Mutex           // guards lastHit
+	lastHit  map[string]time.Time // host → last request time
+
+	robotsMu sync.Mutex              // guards robots
+	robots   map[string]*robotsEntry // host → parsed robots
+}
+
+// robotsEntry is the cached robots.txt state of one host.
+type robotsEntry struct {
+	rules     []robotsRule // parsed groups; empty means nothing is disallowed
+	fetchedAt time.Time    // robotsAllowed refetches entries older than 24 hours
+}
+
+// robotsRule is one robots.txt group: the user agent it addresses and the
+// path prefixes from its Disallow and Allow lines.
+type robotsRule struct {
+	userAgent string   // value of the User-agent line; "*" addresses every crawler
+	disallow  []string // Disallow values, in file order (may contain "")
+	allow     []string // Allow values, in file order (may contain "")
+}
+
+// NewFetcher returns a Fetcher that identifies itself as userAgent and
+// enforces cooldown between polite requests to the same host. Its embedded
+// client has a 30-second timeout and gives up after 10 redirects.
+func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
+	limitRedirects := func(req *http.Request, via []*http.Request) error {
+		if len(via) < 10 {
+			return nil
+		}
+		return fmt.Errorf("too many redirects")
+	}
+
+	f := &Fetcher{
+		userAgent: userAgent,
+		cooldown:  cooldown,
+		lastHit:   map[string]time.Time{},
+		robots:    map[string]*robotsEntry{},
+	}
+	f.client = &http.Client{
+		Timeout:       30 * time.Second,
+		CheckRedirect: limitRedirects,
+	}
+	return f
+}
+
+// Fetch downloads rawURL and returns the decoded page.
+//
+// With polite=true the per-host rate limit and robots.txt are honoured;
+// polite=false skips both checks (used by search server snippet fetcher).
+// timeout bounds the whole request; a sizeLimit > 0 caps the number of body
+// bytes read.
+func (f *Fetcher) Fetch(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
+	result, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
+	return result, err
+}
+
+// FetchSafe wraps Fetch and returns (nil, nil) on expected errors.
+func (f *Fetcher) FetchSafe(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
+	res, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
+	if _, ok := err.(*ErrCrawl); ok {
+		return nil, nil
+	}
+	return res, err
+}
+
+// fetchWithHistory performs the GET request for rawURL and records the
+// permanent redirects followed on the way.
+//
+// With polite=true it first waits for the host's rate limit and then checks
+// robots.txt. timeout bounds the whole request (0 means no timeout); a
+// sizeLimit > 0 caps the number of body bytes read.
+//
+// It returns *ErrCrawl for expected failures (unparsable URL, robots.txt
+// denial, status >= 400, non-HTML content type) and the underlying error for
+// transport or decoding failures.
+func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
+	parsed, err := url.Parse(rawURL)
+	if err != nil {
+		return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
+	}
+	host := parsed.Host
+
+	if polite {
+		f.rateLimit(host)
+		if !f.robotsAllowed(rawURL, host) {
+			return nil, &ErrCrawl{Msg: "disallowed by robots.txt"}
+		}
+	}
+
+	// The map is only written from CheckRedirect, which the client calls
+	// while Do is running below.
+	redirects := make(map[string]string)
+	client := &http.Client{
+		Timeout: timeout,
+		CheckRedirect: func(req *http.Request, via []*http.Request) error {
+			if len(via) >= 10 {
+				return fmt.Errorf("too many redirects")
+			}
+			// req.Response is the redirect response that led to req; only
+			// permanent redirects are worth remembering.
+			if req.Response != nil && (req.Response.StatusCode == 301 || req.Response.StatusCode == 308) {
+				from := via[len(via)-1].URL.String()
+				to := req.URL.String()
+				redirects[from] = to
+			}
+			return nil
+		},
+	}
+
+	req, err := http.NewRequest("GET", rawURL, nil)
+	if err != nil {
+		// Do not dereference a nil request; report it like a parse failure.
+		return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
+	}
+	req.Header.Set("User-Agent", f.userAgent)
+
+	resp, err := client.Do(req)
+	if err != nil {
+		return nil, err
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode == 404 {
+		return nil, &ErrCrawl{Msg: "404 not found"}
+	}
+	if resp.StatusCode >= 400 {
+		return nil, &ErrCrawl{Msg: fmt.Sprintf("HTTP %d", resp.StatusCode)}
+	}
+	ct := resp.Header.Get("Content-Type")
+	if !strings.Contains(ct, "text/html") {
+		return nil, &ErrCrawl{Msg: "not html: " + ct}
+	}
+
+	body, err := decodeBody(resp.Body, ct, sizeLimit)
+	if err != nil {
+		return nil, err
+	}
+
+	return &FetchResult{
+		Body:       body,
+		FinalURL:   resp.Request.URL.String(),
+		Redirects:  redirects,
+		ServerType: resp.Header.Get("Server"),
+	}, nil
+}
+
+// rateLimit blocks until the caller may send a request to host.
+//
+// Every caller reserves the next free slot for the host — the later of "now"
+// and the previous slot plus the cooldown — and sleeps until it. Because the
+// reservation is stored, concurrent callers for one host are spaced a full
+// cooldown apart instead of all waking up together.
+func (f *Fetcher) rateLimit(host string) {
+	f.rateMu.Lock()
+	now := time.Now()
+	slot := now
+	if last, ok := f.lastHit[host]; ok {
+		if next := last.Add(f.cooldown); next.After(now) {
+			slot = next
+		}
+	}
+	f.lastHit[host] = slot
+	// Periodically prune the map. Reserved slots lie at or after "now" and
+	// are therefore never older than the cutoff.
+	if len(f.lastHit) > 10000 {
+		cutoff := now.Add(-f.cooldown * 2)
+		for k, v := range f.lastHit {
+			if v.Before(cutoff) {
+				delete(f.lastHit, k)
+			}
+		}
+	}
+	f.rateMu.Unlock()
+
+	// Sleep outside the lock so other hosts are not held up.
+	if wait := slot.Sub(now); wait > 0 {
+		time.Sleep(wait)
+	}
+}
+
+// robotsAllowed reports whether rawURL may be crawled according to the
+// host's robots.txt, which is fetched on first use and refreshed after 24
+// hours.
+//
+// Group selection and rule precedence follow RFC 9309:
+//   - groups addressed to this crawler's user agent (case-insensitive match)
+//     take precedence; the "*" groups apply only when there are none;
+//   - within the selected groups the longest matching path prefix wins, and
+//     Allow wins a tie;
+//   - empty Allow/Disallow values match nothing.
+//
+// A URL that cannot be parsed is not allowed. Only the path is matched; the
+// query string is ignored and wildcards are not interpreted.
+func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
+	f.robotsMu.Lock()
+	entry, ok := f.robots[host]
+	f.robotsMu.Unlock()
+
+	if !ok || time.Since(entry.fetchedAt) > 24*time.Hour {
+		// Fetch outside the lock so a slow host does not stall the others.
+		entry = f.fetchRobots(host, rawURL)
+		f.robotsMu.Lock()
+		f.robots[host] = entry
+		f.robotsMu.Unlock()
+	}
+
+	parsed, err := url.Parse(rawURL)
+	if err != nil {
+		return false
+	}
+	path := parsed.Path
+	if path == "" {
+		path = "/"
+	}
+
+	addressedToUs := func(rule robotsRule) bool {
+		return rule.userAgent != "*" && strings.EqualFold(rule.userAgent, f.userAgent)
+	}
+	hasOwnGroup := false
+	for _, rule := range entry.rules {
+		if addressedToUs(rule) {
+			hasOwnGroup = true
+			break
+		}
+	}
+
+	// Length of the longest matching pattern of each kind; -1 means no match.
+	longestAllow, longestDisallow := -1, -1
+	for _, rule := range entry.rules {
+		if hasOwnGroup {
+			if !addressedToUs(rule) {
+				continue
+			}
+		} else if rule.userAgent != "*" {
+			continue
+		}
+		for _, a := range rule.allow {
+			if a != "" && len(a) > longestAllow && strings.HasPrefix(path, a) {
+				longestAllow = len(a)
+			}
+		}
+		for _, dis := range rule.disallow {
+			if dis != "" && len(dis) > longestDisallow && strings.HasPrefix(path, dis) {
+				longestDisallow = len(dis)
+			}
+		}
+	}
+	return longestAllow >= longestDisallow
+}
+
+// fetchRobots downloads and parses robots.txt for host, using the scheme of
+// exampleURL (https unless it starts with "http://").
+//
+// It always returns a non-nil entry stamped with the current time. When the
+// file cannot be requested, is not answered with status 200 or cannot be
+// read, the entry has no rules, which allows everything. At most 256 KiB of
+// the file are read.
+func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
+	entry := &robotsEntry{fetchedAt: time.Now()}
+	scheme := "https"
+	if strings.HasPrefix(exampleURL, "http://") {
+		scheme = "http"
+	}
+	robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)
+
+	client := &http.Client{Timeout: 5 * time.Second}
+	req, err := http.NewRequest("GET", robotsURL, nil)
+	if err != nil {
+		return entry // malformed host: treat like an unavailable robots.txt
+	}
+	req.Header.Set("User-Agent", f.userAgent)
+	resp, err := client.Do(req)
+	if err != nil {
+		return entry // allow all if robots.txt unavailable
+	}
+	// Close the body on every path, including non-200 responses.
+	defer resp.Body.Close()
+	if resp.StatusCode != 200 {
+		return entry // allow all if robots.txt unavailable
+	}
+
+	body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
+	if err != nil {
+		return entry
+	}
+	entry.rules = parseRobots(string(body))
+	return entry
+}
+
+// parseRobots is a minimal robots.txt parser.
+//
+// It understands the "user-agent", "allow" and "disallow" fields (names are
+// case-insensitive) and ignores everything else. Grouping follows RFC 9309:
+// a group starts with one or more consecutive User-agent lines and collects
+// the rule lines after them; a User-agent line that comes after a rule line
+// starts a new group. Comments ("#" to end of line) and blank lines are
+// skipped and do not end a group. Rule lines before the first User-agent
+// line are dropped.
+//
+// One robotsRule is returned per user agent named in a group; agents that
+// share a group share its allow/disallow lists.
+func parseRobots(content string) []robotsRule {
+	var (
+		rules    []robotsRule
+		agents   []string // user agents of the group being read
+		disallow []string
+		allow    []string
+		inRules  bool // the current group has already seen a rule line
+	)
+	// flush emits the group read so far (one rule per agent) and resets it.
+	flush := func() {
+		for _, ua := range agents {
+			rules = append(rules, robotsRule{userAgent: ua, disallow: disallow, allow: allow})
+		}
+		agents, disallow, allow, inRules = nil, nil, nil, false
+	}
+
+	for _, line := range strings.Split(content, "\n") {
+		if idx := strings.Index(line, "#"); idx >= 0 {
+			line = line[:idx]
+		}
+		line = strings.TrimSpace(line)
+		if line == "" {
+			continue
+		}
+		parts := strings.SplitN(line, ":", 2)
+		if len(parts) != 2 {
+			continue
+		}
+		key := strings.ToLower(strings.TrimSpace(parts[0]))
+		val := strings.TrimSpace(parts[1])
+		switch key {
+		case "user-agent":
+			if inRules {
+				flush()
+			}
+			agents = append(agents, val)
+		case "disallow":
+			if len(agents) > 0 {
+				disallow = append(disallow, val)
+				inRules = true
+			}
+		case "allow":
+			if len(agents) > 0 {
+				allow = append(allow, val)
+				inRules = true
+			}
+		}
+	}
+	flush()
+	return rules
+}
+
+// decodeBody reads the body from r and returns it as a string, converting it
+// to UTF-8 based on charset detection (charset.NewReader is given the
+// Content-Type value). A sizeLimit > 0 caps the number of raw bytes read;
+// otherwise the whole body is read.
+//
+// If the charset reader cannot be created, whatever is still readable is
+// returned undecoded.
+func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error) {
+	src := r
+	if sizeLimit > 0 {
+		src = io.LimitReader(r, int64(sizeLimit))
+	}
+
+	var raw []byte
+	decoded, err := charset.NewReader(src, contentType)
+	if err == nil {
+		raw, err = io.ReadAll(decoded)
+	} else {
+		// Fall back to reading raw and hoping for UTF-8
+		raw, err = io.ReadAll(src)
+	}
+	if err != nil {
+		return "", err
+	}
+	return string(raw), nil
+}