// Package crawler implements the HTTP fetching layer with robots.txt compliance,
// per-host rate limiting, redirect tracking, and encoding detection.
package crawler

import (
	"fmt"
	"io"
	"net/http"
	"net/url"
	"strings"
	"sync"
	"time"

	"golang.org/x/net/html/charset"
)

// ErrCrawl is returned for expected crawl failures (404, disallowed, wrong content type…).
// FetchSafe recognizes this type and converts it into a (nil, nil) result.
type ErrCrawl struct {
	Msg string // human-readable description; returned verbatim by Error
}

// Error implements the error interface by returning Msg.
func (e *ErrCrawl) Error() string { return e.Msg }

// FetchResult bundles the result of a successful fetch.
type FetchResult struct {
	Body       string            // decoded HTML body
	FinalURL   string            // URL after redirects
	Redirects  map[string]string // permanent redirects (HTTP 301/308): from → to
	ServerType string            // value of the response's Server header
}

// Fetcher is a reusable HTTP client with robots.txt awareness and rate limiting.
// Its maps are guarded by the mutex declared directly above each of them.
type Fetcher struct {
	client    *http.Client  // built by NewFetcher; NOTE(review): not read by the fetch path in this file — confirm whether it is still needed
	userAgent string        // sent as the User-Agent header and matched against robots.txt groups
	cooldown  time.Duration // minimum spacing between polite requests to one host

	rateMu  sync.Mutex
	lastHit map[string]time.Time // host → timestamp rateLimit uses to space requests

	robotsMu sync.Mutex
	robots   map[string]*robotsEntry // host → parsed robots
}

// robotsEntry is the cached robots.txt state for one host.
type robotsEntry struct {
	rules     []robotsRule // parsed groups; empty means nothing is disallowed
	fetchedAt time.Time    // when the entry was created, used to expire the cache
}

// robotsRule is one robots.txt group: a user-agent value plus its path prefixes.
type robotsRule struct {
	userAgent string   // value of the User-agent line ("*" applies to every crawler)
	disallow  []string // Disallow path prefixes
	allow     []string // Allow path prefixes
}

// NewFetcher creates a Fetcher with the given user-agent and per-host cooldown.
// The embedded client has a 30-second timeout and stops following redirects
// after 10 hops; both maps start empty.
func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
	return &Fetcher{
		client: &http.Client{
			Timeout: 30 * time.Second,
			CheckRedirect: func(req *http.Request, via []*http.Request) error {
				if len(via) >= 10 {
					return fmt.Errorf("too many redirects")
				}
				return nil
			},
		},
		userAgent: userAgent,
		cooldown:  cooldown,
		lastHit:   make(map[string]time.Time),
		robots:    make(map[string]*robotsEntry),
	}
}

// Fetch fetches rawURL, respecting robots.txt and rate limits.
// polite=false skips both checks (used by search server snippet fetcher).
// timeout bounds the whole request, and sizeLimit caps the number of body
// bytes read (values <= 0 mean no cap). Expected failures are reported as
// *ErrCrawl; other errors come from the HTTP client or from reading the body.
func (f *Fetcher) Fetch(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) { return f.fetchWithHistory(rawURL, polite, timeout, sizeLimit) } // FetchSafe wraps Fetch and returns (nil, nil) on expected errors. func (f *Fetcher) FetchSafe(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) { res, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit) if _, ok := err.(*ErrCrawl); ok { return nil, nil } return res, err } // fetchWithHistory does the actual request and populates redirect history. func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) { parsed, err := url.Parse(rawURL) if err != nil { return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()} } host := parsed.Host if polite { f.rateLimit(host) if !f.robotsAllowed(rawURL, host) { return nil, &ErrCrawl{Msg: "disallowed by robots.txt"} } } redirects := make(map[string]string) client := &http.Client{ Timeout: timeout, CheckRedirect: func(req *http.Request, via []*http.Request) error { if len(via) >= 10 { return fmt.Errorf("too many redirects") } if req.Response != nil && (req.Response.StatusCode == 301 || req.Response.StatusCode == 308) { from := via[len(via)-1].URL.String() to := req.URL.String() redirects[from] = to } return nil }, } req, _ := http.NewRequest("GET", rawURL, nil) req.Header.Set("User-Agent", f.userAgent) resp, err := client.Do(req) if err != nil { return nil, err } defer resp.Body.Close() if resp.StatusCode == 404 { return nil, &ErrCrawl{Msg: "404 not found"} } if resp.StatusCode >= 400 { return nil, &ErrCrawl{Msg: fmt.Sprintf("HTTP %d", resp.StatusCode)} } ct := resp.Header.Get("Content-Type") if !strings.Contains(ct, "text/html") { return nil, &ErrCrawl{Msg: "not html: " + ct} } body, err := decodeBody(resp.Body, ct, sizeLimit) if err != nil { return nil, err } return &FetchResult{ Body: body, FinalURL: resp.Request.URL.String(), Redirects: 
redirects, ServerType: resp.Header.Get("Server"), }, nil } // rateLimit sleeps if the last request to host was too recent. func (f *Fetcher) rateLimit(host string) { f.rateMu.Lock() last, ok := f.lastHit[host] now := time.Now() f.lastHit[host] = now // Periodically prune the map if len(f.lastHit) > 10000 { cutoff := now.Add(-f.cooldown * 2) for k, v := range f.lastHit { if v.Before(cutoff) { delete(f.lastHit, k) } } } f.rateMu.Unlock() if ok { elapsed := now.Sub(last) if elapsed < f.cooldown { time.Sleep(f.cooldown - elapsed) } } } // robotsAllowed returns true if rawURL is crawlable. func (f *Fetcher) robotsAllowed(rawURL, host string) bool { f.robotsMu.Lock() entry, ok := f.robots[host] f.robotsMu.Unlock() if !ok || time.Since(entry.fetchedAt) > 24*time.Hour { entry = f.fetchRobots(host, rawURL) f.robotsMu.Lock() f.robots[host] = entry f.robotsMu.Unlock() } parsed, err := url.Parse(rawURL) if err != nil { return false } path := parsed.Path if path == "" { path = "/" } for _, rule := range entry.rules { if rule.userAgent != "*" && !strings.EqualFold(rule.userAgent, f.userAgent) { continue } // Check allow first (higher priority) for _, a := range rule.allow { if strings.HasPrefix(path, a) { return true } } for _, dis := range rule.disallow { if dis != "" && strings.HasPrefix(path, dis) { return false } } } return true } // fetchRobots downloads and parses robots.txt for a host. 
func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry { entry := &robotsEntry{fetchedAt: time.Now()} scheme := "https" if strings.HasPrefix(exampleURL, "http://") { scheme = "http" } robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host) client := &http.Client{Timeout: 5 * time.Second} req, _ := http.NewRequest("GET", robotsURL, nil) req.Header.Set("User-Agent", f.userAgent) resp, err := client.Do(req) if err != nil || resp.StatusCode != 200 { return entry // allow all if robots.txt unavailable } defer resp.Body.Close() body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024)) if err != nil { return entry } entry.rules = parseRobots(string(body)) return entry } // parseRobots is a minimal robots.txt parser. func parseRobots(content string) []robotsRule { var rules []robotsRule var current *robotsRule for _, line := range strings.Split(content, "\n") { line = strings.TrimSpace(line) if idx := strings.Index(line, "#"); idx >= 0 { line = line[:idx] } if line == "" { if current != nil { rules = append(rules, *current) current = nil } continue } parts := strings.SplitN(line, ":", 2) if len(parts) != 2 { continue } key := strings.TrimSpace(strings.ToLower(parts[0])) val := strings.TrimSpace(parts[1]) switch key { case "user-agent": if current == nil { current = &robotsRule{userAgent: val} } else { current.userAgent = val } case "disallow": if current != nil { current.disallow = append(current.disallow, val) } case "allow": if current != nil { current.allow = append(current.allow, val) } } } if current != nil { rules = append(rules, *current) } return rules } // decodeBody reads at most sizeLimit bytes from r, auto-detecting charset. 
func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error) { var reader io.Reader = r if sizeLimit > 0 { reader = io.LimitReader(r, int64(sizeLimit)) } // Use golang.org/x/net/html/charset for auto-detection utf8Reader, err := charset.NewReader(reader, contentType) if err != nil { // Fall back to reading raw and hoping for UTF-8 data, readErr := io.ReadAll(reader) if readErr != nil { return "", readErr } return string(data), nil } data, err := io.ReadAll(utf8Reader) if err != nil { return "", err } return string(data), nil }