// Source listing: sese-engine-go/crawler/fetcher.go (Go, 314 lines, 7.7 KiB)

// Package crawler implements the HTTP fetching layer with robots.txt compliance,
// per-host rate limiting, redirect tracking, and encoding detection.
package crawler
import (
"fmt"
"io"
"net/http"
"net/url"
"strings"
"sync"
"time"
"golang.org/x/net/html/charset"
)
// ErrCrawl describes an expected, non-fatal crawl outcome: a 404 response,
// a robots.txt disallow, a non-HTML content type, and similar conditions.
type ErrCrawl struct {
	// Msg is the human-readable description returned by Error.
	Msg string
}

// Error implements the error interface by returning Msg unchanged.
func (ec *ErrCrawl) Error() string {
	return ec.Msg
}
// FetchResult bundles the result of a successful fetch.
// fetchWithHistory fills it in only after the response has passed the status
// and content-type checks and the body has been decoded.
type FetchResult struct {
Body string // response body as returned by decodeBody
FinalURL string // URL of the final request, i.e. after any redirects were followed
Redirects map[string]string // permanent (301/308) redirect hops seen on the way: from → to
ServerType string // value of the response's "Server" header
}
// Fetcher is a reusable HTTP client with robots.txt awareness and rate limiting.
// It holds two independent pieces of shared state, each behind its own mutex:
// the per-host request timestamps and the per-host robots.txt cache.
// Construct it with NewFetcher so both maps are initialised.
type Fetcher struct {
client *http.Client // built by NewFetcher; NOTE(review): the fetch paths shown here create their own clients and do not read this field
userAgent string // sent as the User-Agent header and compared against robots.txt groups
cooldown time.Duration // minimum spacing between requests to the same host, enforced by rateLimit
rateMu sync.Mutex // guards lastHit
lastHit map[string]time.Time // host → time recorded by rateLimit for the most recent request
robotsMu sync.Mutex // guards robots
robots map[string]*robotsEntry // host → parsed robots
}
// robotsEntry is one cached robots.txt result for a host.
// robotsAllowed refetches an entry once it is older than 24 hours.
type robotsEntry struct {
rules []robotsRule // parsed groups; empty when robots.txt was unavailable (treated as allow-all)
fetchedAt time.Time // when fetchRobots created this entry
}
// robotsRule is one user-agent group from a robots.txt file as produced by
// parseRobots. Patterns are stored verbatim (whitespace-trimmed) and are
// compared against request paths as plain prefixes.
type robotsRule struct {
userAgent string // value of the group's User-agent line; "*" applies to every crawler
disallow []string // values of the group's Disallow lines, in file order
allow []string // values of the group's Allow lines, in file order
}
// NewFetcher returns a ready-to-use Fetcher that identifies itself with
// userAgent and keeps at least cooldown between polite requests to one host.
// The embedded HTTP client has a 30-second timeout and gives up after ten
// redirects; both caches start out empty.
func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
	// Cap redirect chains so a misbehaving server cannot bounce the client forever.
	limitRedirects := func(_ *http.Request, via []*http.Request) error {
		if len(via) < 10 {
			return nil
		}
		return fmt.Errorf("too many redirects")
	}

	fetcher := new(Fetcher)
	fetcher.client = &http.Client{
		Timeout:       30 * time.Second,
		CheckRedirect: limitRedirects,
	}
	fetcher.userAgent = userAgent
	fetcher.cooldown = cooldown
	fetcher.lastHit = map[string]time.Time{}
	fetcher.robots = map[string]*robotsEntry{}
	return fetcher
}
// Fetch downloads rawURL and returns the decoded page together with redirect
// information. With polite=true the per-host rate limit and robots.txt are
// honoured; polite=false skips both checks (used by the search server snippet
// fetcher). timeout bounds the whole request and sizeLimit caps how many body
// bytes are read. All failures, expected or not, are returned as errors.
func (f *Fetcher) Fetch(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
	result, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
	return result, err
}
// FetchSafe behaves like Fetch but treats expected crawl failures as "nothing
// to return": when the underlying error is an *ErrCrawl it yields (nil, nil).
// Any other error is passed through together with the result unchanged.
func (f *Fetcher) FetchSafe(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
	result, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
	switch err.(type) {
	case *ErrCrawl:
		// Expected outcome (404, robots disallow, non-HTML…): swallow it.
		return nil, nil
	default:
		return result, err
	}
}
// fetchWithHistory performs the GET request for rawURL and records the
// permanent redirects followed on the way.
//
// When polite is true the per-host rate limit is applied and robots.txt is
// consulted before the request is sent. timeout bounds the whole exchange and
// sizeLimit is handed to decodeBody to cap the number of body bytes read.
//
// Expected failures are reported as *ErrCrawl: an unparsable or host-less URL,
// a robots.txt disallow, any status >= 400, and a Content-Type that is not
// text/html. Transport errors and body decoding errors are returned as-is.
func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
	}
	host := parsed.Host
	if host == "" {
		// Relative or scheme-only URLs cannot be fetched; reject them before
		// they reach the rate limiter or trigger a robots.txt download.
		return nil, &ErrCrawl{Msg: "invalid url: missing host"}
	}

	// Build the request up front so a URL that net/http rejects is reported
	// as an error instead of dereferencing a nil request below.
	req, err := http.NewRequest(http.MethodGet, rawURL, nil)
	if err != nil {
		return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
	}
	req.Header.Set("User-Agent", f.userAgent)

	if polite {
		f.rateLimit(host)
		if !f.robotsAllowed(rawURL, host) {
			return nil, &ErrCrawl{Msg: "disallowed by robots.txt"}
		}
	}

	redirects := make(map[string]string)
	client := &http.Client{
		Timeout: timeout,
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			if len(via) >= 10 {
				return fmt.Errorf("too many redirects")
			}
			// req.Response is the response that caused this hop; only
			// permanent redirects (301/308) are remembered.
			if req.Response != nil && (req.Response.StatusCode == 301 || req.Response.StatusCode == 308) {
				from := via[len(via)-1].URL.String()
				redirects[from] = req.URL.String()
			}
			return nil
		},
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode == 404 {
		return nil, &ErrCrawl{Msg: "404 not found"}
	}
	if resp.StatusCode >= 400 {
		return nil, &ErrCrawl{Msg: fmt.Sprintf("HTTP %d", resp.StatusCode)}
	}

	// Media types are case-insensitive, so compare on a lower-cased copy but
	// keep the original header value for the error message and for decoding.
	ct := resp.Header.Get("Content-Type")
	if !strings.Contains(strings.ToLower(ct), "text/html") {
		return nil, &ErrCrawl{Msg: "not html: " + ct}
	}

	body, err := decodeBody(resp.Body, ct, sizeLimit)
	if err != nil {
		return nil, err
	}
	return &FetchResult{
		Body:       body,
		FinalURL:   resp.Request.URL.String(),
		Redirects:  redirects,
		ServerType: resp.Header.Get("Server"),
	}, nil
}
// rateLimit blocks until the caller may contact host without violating the
// per-host cooldown.
//
// Each call reserves the next free time slot for host (the later of "now" and
// "previous slot + cooldown") while holding rateMu, stores that slot in
// lastHit, and then sleeps until the slot arrives with the lock released.
// Because the stored value is the reserved slot rather than the call time,
// concurrent callers for the same host are spaced cooldown apart instead of
// all waking up together.
func (f *Fetcher) rateLimit(host string) {
	f.rateMu.Lock()
	now := time.Now()
	slot := now
	if last, seen := f.lastHit[host]; seen {
		if next := last.Add(f.cooldown); next.After(now) {
			slot = next
		}
	}
	f.lastHit[host] = slot

	// Keep the map bounded: once it grows large, drop hosts whose slot is
	// older than two cooldowns and therefore can no longer delay anyone.
	if len(f.lastHit) > 10000 {
		cutoff := now.Add(-2 * f.cooldown)
		for h, at := range f.lastHit {
			if at.Before(cutoff) {
				delete(f.lastHit, h)
			}
		}
	}
	f.rateMu.Unlock()

	// Sleep outside the lock so other hosts are not held up.
	if wait := slot.Sub(now); wait > 0 {
		time.Sleep(wait)
	}
}
// robotsAllowed reports whether rawURL may be crawled according to the
// robots.txt of host.
//
// The parsed robots.txt is cached per host and refreshed after 24 hours. A URL
// that cannot be parsed is never allowed. An empty path is treated as "/".
//
// Rule selection follows RFC 9309:
//   - groups whose user-agent equals this fetcher's user agent
//     (case-insensitively) are used; only when there is none do the "*"
//     groups apply;
//   - among the selected groups the longest matching pattern decides, and an
//     Allow wins a tie against a Disallow of equal length;
//   - empty Allow/Disallow values match nothing;
//   - when no pattern matches, the URL is allowed.
//
// Patterns are compared as plain path prefixes; the "*" and "$" wildcards are
// not interpreted, and only the URL path (not the query) is matched.
func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return false
	}
	path := parsed.Path
	if path == "" {
		path = "/"
	}

	f.robotsMu.Lock()
	entry, cached := f.robots[host]
	f.robotsMu.Unlock()
	if !cached || time.Since(entry.fetchedAt) > 24*time.Hour {
		// The download happens outside the lock so a slow host does not
		// block lookups for other hosts.
		entry = f.fetchRobots(host, rawURL)
		f.robotsMu.Lock()
		f.robots[host] = entry
		f.robotsMu.Unlock()
	}

	// Pick the groups that apply to this crawler: its own, else the wildcard.
	var specific, wildcard []robotsRule
	for _, rule := range entry.rules {
		switch {
		case strings.EqualFold(rule.userAgent, f.userAgent):
			specific = append(specific, rule)
		case rule.userAgent == "*":
			wildcard = append(wildcard, rule)
		}
	}
	applicable := specific
	if len(applicable) == 0 {
		applicable = wildcard
	}

	// Length of the longest matching pattern of each kind; -1 means no match.
	longest := func(patterns []string, best int) int {
		for _, p := range patterns {
			if p != "" && len(p) > best && strings.HasPrefix(path, p) {
				best = len(p)
			}
		}
		return best
	}
	bestAllow, bestDisallow := -1, -1
	for _, rule := range applicable {
		bestAllow = longest(rule.allow, bestAllow)
		bestDisallow = longest(rule.disallow, bestDisallow)
	}
	return bestAllow >= bestDisallow
}
// fetchRobots downloads and parses robots.txt for host.
//
// The scheme is taken from exampleURL: "http" when it starts with "http://"
// (in any letter case), otherwise "https". The request carries the fetcher's
// User-Agent, times out after 5 seconds, and at most 256 KiB of the body is
// read.
//
// The returned entry is never nil and is always stamped with the current
// time. Its rule list stays empty, which callers treat as "allow all", when
// the request cannot be built or sent, the status is not 200, or the body
// cannot be read.
func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
	entry := &robotsEntry{fetchedAt: time.Now()}

	scheme := "https"
	if strings.HasPrefix(strings.ToLower(exampleURL), "http://") {
		scheme = "http"
	}
	robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)

	req, err := http.NewRequest(http.MethodGet, robotsURL, nil)
	if err != nil {
		return entry // unusable host: nothing to fetch, allow all
	}
	req.Header.Set("User-Agent", f.userAgent)

	client := &http.Client{Timeout: 5 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return entry // allow all if robots.txt unavailable
	}
	// Close on every path from here on, including non-200 responses, so the
	// underlying connection is released.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return entry // allow all if robots.txt unavailable
	}
	body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
	if err != nil {
		return entry
	}
	entry.rules = parseRobots(string(body))
	return entry
}
// parseRobots is a minimal robots.txt parser. It returns one robotsRule per
// user-agent named in the file, in file order.
//
// Recognised lines are "User-agent", "Disallow" and "Allow" (keys are
// case-insensitive); every other line is ignored. Grouping works as follows:
//   - consecutive User-agent lines form one group and each named agent
//     receives a rule carrying the group's patterns;
//   - a User-agent line that follows Allow/Disallow lines starts a new group;
//   - a blank line ends the current group;
//   - "#" starts a comment; a line that holds only a comment is skipped and
//     does NOT end the group;
//   - Allow/Disallow lines outside any group are dropped.
//
// Pattern values are stored verbatim after trimming whitespace, so an empty
// "Disallow:" yields an empty string in the rule.
func parseRobots(content string) []robotsRule {
	var (
		rules    []robotsRule
		agents   []string // user-agents of the group being built
		disallow []string
		allow    []string
		hasRules bool // the current group has seen an Allow/Disallow line
	)
	// flush emits the pending group (one rule per agent) and starts afresh.
	// The pattern slices are shared between those rules; they are never
	// appended to again because the locals are reset to nil.
	flush := func() {
		for _, ua := range agents {
			rules = append(rules, robotsRule{userAgent: ua, disallow: disallow, allow: allow})
		}
		agents, disallow, allow, hasRules = nil, nil, nil, false
	}

	for _, line := range strings.Split(content, "\n") {
		line = strings.TrimSpace(line)
		if line == "" {
			flush()
			continue
		}
		if idx := strings.Index(line, "#"); idx >= 0 {
			line = strings.TrimSpace(line[:idx])
			if line == "" {
				continue // comment-only line: not a group separator
			}
		}
		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}
		key := strings.ToLower(strings.TrimSpace(parts[0]))
		val := strings.TrimSpace(parts[1])
		switch key {
		case "user-agent":
			if hasRules {
				flush()
			}
			agents = append(agents, val)
		case "disallow":
			if len(agents) > 0 {
				disallow = append(disallow, val)
				hasRules = true
			}
		case "allow":
			if len(agents) > 0 {
				allow = append(allow, val)
				hasRules = true
			}
		}
	}
	flush()
	return rules
}
// decodeBody reads r to the end and returns its content as a string, using
// golang.org/x/net/html/charset to pick the character encoding from
// contentType and the data itself.
//
// A positive sizeLimit caps the number of raw bytes consumed from r; zero or a
// negative value means no cap. If the charset reader cannot be created, the
// remaining bytes are returned undecoded on the assumption that they are
// already UTF-8. Read errors are returned with an empty string.
func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error) {
	src := r
	if sizeLimit > 0 {
		src = io.LimitReader(r, int64(sizeLimit))
	}

	decoded, err := charset.NewReader(src, contentType)
	if err != nil {
		// No decoder available: fall back to the raw (size-limited) stream.
		decoded = src
	}

	raw, err := io.ReadAll(decoded)
	if err != nil {
		return "", err
	}
	return string(raw), nil
}