Signed-off-by: 吴文峰 <kevin@lmve.net>

This commit is contained in:
2026-04-08 17:29:39 +08:00
commit 6c2f5ad978
15 changed files with 3651 additions and 0 deletions
+588
View File
@@ -0,0 +1,588 @@
// crawler.go — BFS crawl loop, URL scheduling, and site-info updating.
package crawler
import (
	"bytes"
	"encoding/json"
	"log"
	"math"
	"math/rand"
	"net/http"
	"net/url"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"sese-engine/analyzer"
	"sese-engine/config"
	"sese-engine/parser"
	"sese-engine/storage"
)
// Stats holds real-time crawl counters.
// The crawler updates the fields with sync/atomic, so concurrent readers
// should load them atomically as well.
type Stats struct {
// VisitedURLs is incremented once per visitURL call, before the fetch.
VisitedURLs int64
// SuccessURLs is incremented after a fetch returned a result without error.
SuccessURLs int64
// KeywordsFetched accumulates the number of keywords kept for each page.
KeywordsFetched int64
}
// Crawler orchestrates the BFS crawl: it fetches pages, stores snippets and
// per-host site info, forwards keywords to the harvester and schedules the
// next round of URLs.
type Crawler struct {
// fetcher performs the HTTP requests.
fetcher *Fetcher
// db stores page snippets and per-host site info.
db *storage.DB
// analyzer extracts keywords and detects the language of fetched pages.
analyzer *analyzer.Analyzer
prosperMap map[string]float64 // domain → backlink score (loaded from info)
// stats holds the live counters; its fields are updated with sync/atomic.
stats Stats
}
// New builds a Crawler on top of the given storage, analyzer and backlink
// score table. Its fetcher identifies itself as config.SpiderName and waits
// config.CrawlerCooldown seconds between requests to the same host.
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
	c := &Crawler{db: db, analyzer: a, prosperMap: prosperMap}
	c.fetcher = NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second)
	return c
}
// URLWeight pairs a URL with its discovery weight.
type URLWeight struct {
// URL is the discovered link.
URL string
// Weight is the share of the linking page's attention given to this link;
// Run assigns 1/n to each of the n links returned for a page.
Weight float64
}
// Run starts the BFS crawl from entryURL and runs for at most maxEpoch rounds.
// It blocks until the last round finishes or a round discovers no new links.
//
// Each round fetches every queued URL with up to config.CrawlerWorkers
// concurrent workers, collects the links found on those pages and lets
// schedule pick the next queue. A link found on a page with n links carries
// weight 1/n. When several pages link to the same URL their weights are summed
// into a single candidate, so one URL cannot be queued (and fetched) more than
// once per round.
func (c *Crawler) Run(entryURL string, maxEpoch int) {
	// A non-positive worker count would yield an unbuffered (or invalid)
	// semaphore and block forever, so fall back to a single worker.
	workers := config.CrawlerWorkers
	if workers < 1 {
		workers = 1
	}

	visited := make(map[string]bool)
	queue := []string{entryURL}
	for ep := 0; ep < maxEpoch; ep++ {
		log.Printf("[crawler] epoch %d/%d queue=%d", ep+1, maxEpoch, len(queue))

		// visited is only written here, before any worker of this round
		// starts, so the workers below can read it safely.
		for _, u := range queue {
			visited[u] = true
		}

		var (
			mu sync.Mutex
			wg sync.WaitGroup
		)
		discovered := make(map[string]float64) // URL → summed weight; guarded by mu
		sem := make(chan struct{}, workers)
		for _, u := range queue {
			wg.Add(1)
			sem <- struct{}{} // blocks while all workers are busy
			go func(rawURL string) {
				defer wg.Done()
				defer func() { <-sem }()

				hrefs := c.visitURL(rawURL)
				if len(hrefs) == 0 {
					return
				}
				w := 1.0 / float64(len(hrefs))

				mu.Lock()
				defer mu.Unlock()
				for _, h := range hrefs {
					if !visited[h] {
						discovered[h] += w
					}
				}
			}(u)
		}
		wg.Wait()

		if len(discovered) == 0 {
			log.Println("[crawler] empty queue — stopping")
			return
		}
		newLinks := make([]URLWeight, 0, len(discovered))
		for u, w := range discovered {
			newLinks = append(newLinks, URLWeight{URL: u, Weight: w})
		}
		queue = c.schedule(newLinks)
	}
}
// visitURL fetches rawURL politely (robots.txt and rate limit honoured, 10 s
// timeout, no body size limit), caches a snippet, hands the extracted
// keywords to the harvester, updates the site info of the final host and of
// every permanently redirected host, and returns at most 100 of the links
// found on the page. It returns nil when the fetch fails.
func (c *Crawler) visitURL(rawURL string) []string {
	atomic.AddInt64(&c.stats.VisitedURLs, 1)
	res, err := c.fetcher.fetchWithHistory(rawURL, true, 10*time.Second, 0)
	if err != nil || res == nil {
		c.updateSiteFailure(rawURL)
		return nil
	}
	atomic.AddInt64(&c.stats.SuccessURLs, 1)
	title, desc, text, hrefs := parser.ParseHTML(res.Body, res.FinalURL)

	// Cache a snippet; very long URLs are skipped.
	if len(res.FinalURL) < 250 {
		// Best effort: a failed snippet write must not abort the crawl.
		_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
			Title:       title,
			Description: truncate(desc, 256),
			Text:        truncate(text, 256),
			Timestamp:   time.Now().Unix(),
		})
	}

	// Keyword extraction → send to harvester (asynchronously).
	kws := c.analyzer.Analyze(title, desc, text)
	if len(kws) > 0 {
		if len(kws) > config.MaxKeywordsPerPage {
			kws = kws[:config.MaxKeywordsPerPage]
		}
		atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
		go c.sendToHarvester(res.FinalURL, kws)
	}

	// Update site info of the host that finally served the page.
	host := netloc(res.FinalURL)
	c.updateSiteSuccess(host, res, title, desc, text, hrefs)

	// Record permanent redirects in the site info of the redirecting host.
	for from, to := range res.Redirects {
		fromHost := netloc(from)
		if fromHost == "" {
			continue
		}
		info, _ := c.db.GetSiteInfo(fromHost)
		if info == nil {
			// The lookup error is ignored, so guard against a nil record
			// (presumably returned for unknown hosts or read failures —
			// confirm against storage) instead of panicking in a worker.
			info = &storage.SiteInfo{}
		}
		if info.Redirects == nil {
			info.Redirects = make(map[string]string)
		}
		info.Redirects[from] = to
		if len(info.Redirects) > 50 {
			// Keep an arbitrary subset of 40 entries for now.
			info.Redirects = truncateMap(info.Redirects, 40)
		}
		_ = c.db.SetSiteInfo(fromHost, info) // best effort
	}

	// Trim hrefs to a random sample of 100.
	if len(hrefs) > 100 {
		hrefs = sampleStrings(hrefs, 100)
	}
	return hrefs
}
// updateSiteFailure records a failed fetch of rawURL by decaying the success
// rate of its host by 1 %. A host without a recorded rate starts at 0.
// URLs without a recognisable host are ignored.
func (c *Crawler) updateSiteFailure(rawURL string) {
	host := netloc(rawURL)
	if host == "" {
		return
	}
	info, _ := c.db.GetSiteInfo(host)
	if info == nil {
		// The lookup error is ignored, so guard against a nil record
		// (presumably returned for unknown hosts or read failures —
		// confirm against storage) instead of panicking.
		info = &storage.SiteInfo{}
	}
	if info.SuccessRate == nil {
		info.SuccessRate = new(float64)
	}
	*info.SuccessRate *= 0.99
	_ = c.db.SetSiteInfo(host, info) // best effort
}
// updateSiteSuccess records a successful fetch in the site info of host:
// visit count and time, an exponentially smoothed success rate, HTTPS
// availability, the most recent distinct Server header values (at most 5)
// and — for the first visits and a 10 % sample afterwards — the detected
// language mix and a sample of external out-links.
func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc, text string, hrefs []string) {
	info, _ := c.db.GetSiteInfo(host)
	if info == nil {
		// The lookup error is ignored, so guard against a nil record
		// (presumably returned for unknown hosts or read failures —
		// confirm against storage) instead of panicking.
		info = &storage.SiteInfo{}
	}
	info.VisitCount++
	info.LastVisitTime = time.Now().Unix()

	// Success rate: moving average that moves 1 % towards 1 per success.
	if info.SuccessRate == nil {
		one := 1.0
		info.SuccessRate = &one
	}
	*info.SuccessRate = *info.SuccessRate*0.99 + 0.01

	if strings.HasPrefix(res.FinalURL, "https://") {
		t := true
		info.HTTPSAvailable = &t
	}

	if res.ServerType != "" {
		known := false
		for _, s := range info.ServerTypes {
			if s == res.ServerType {
				known = true
				break
			}
		}
		if !known {
			info.ServerTypes = append(info.ServerTypes, res.ServerType)
			if len(info.ServerTypes) > 5 {
				// Keep only the five most recently seen values.
				info.ServerTypes = info.ServerTypes[len(info.ServerTypes)-5:]
			}
		}
	}

	// Language detection and out-link collection — first 10 visits, then a
	// 10 % sample.
	if info.VisitCount < 10 || rand.Float64() < 0.1 {
		lang := c.analyzer.DetectLanguage(title + " " + desc + " " + text)
		if lang != "" {
			if info.Languages == nil {
				info.Languages = make(map[string]float64)
			}
			// Shift the language mix towards lang; the step shrinks with
			// the visit count and is capped at 0.2.
			intensity := math.Min(0.2, 1/math.Sqrt(float64(info.VisitCount+1)))
			for k := range info.Languages {
				info.Languages[k] *= (1 - intensity)
			}
			info.Languages[lang] += intensity
		}

		// Collect links that leave the page's parent domain.
		superHost := superNetloc(res.FinalURL)
		var external []string
		for _, h := range hrefs {
			if superNetloc(h) != superHost {
				external = append(external, h)
			}
		}
		info.OutLinks = append(info.OutLinks, sampleStrings(external, 10)...)
		if len(info.OutLinks) > 250 {
			info.OutLinks = sampleStrings(info.OutLinks, 200)
		}
	}
	_ = c.db.SetSiteInfo(host, info) // best effort
}
// sendToHarvester POSTs the keywords extracted from finalURL as JSON to the
// harvester service at config.HarvesterAddr + "/l".
//
// It is started as a fire-and-forget goroutine, so the request carries a
// timeout: without one an unresponsive harvester would leave one blocked
// goroutine behind for every crawled page. Failures are only logged.
func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
	const harvesterTimeout = 10 * time.Second

	type payload struct {
		URL      string             `json:"url"`
		Keywords []analyzer.Keyword `json:"keywords"`
	}
	data, err := json.Marshal(payload{URL: finalURL, Keywords: kws})
	if err != nil {
		log.Printf("[crawler] harvester payload not encodable: %v", err)
		return
	}

	// A Client with a nil Transport uses http.DefaultTransport, so
	// connections are still pooled across calls.
	client := &http.Client{Timeout: harvesterTimeout}
	resp, err := client.Post(config.HarvesterAddr+"/l", "application/json", bytes.NewReader(data))
	if err != nil {
		log.Printf("[crawler] harvester post failed: %v", err)
		return
	}
	defer resp.Body.Close()
	if resp.StatusCode >= 300 {
		log.Printf("[crawler] harvester post rejected: %s", resp.Status)
	}
}
// schedule selects the next BFS queue from the raw discovered links.
//
// Steps: cap the input at 100000 random links, load the site info of every
// host and parent domain involved, score each link, draw a score-weighted
// sample, limit the number of URLs per domain, cap plain-http URLs at a
// quarter of the https count, thin out URLs of domains without a backlink
// score, and return the result in random order.
func (c *Crawler) schedule(links []URLWeight) []string {
	if len(links) > 100000 {
		links = sampleURLWeights(links, 100000)
	}

	// Collect every host and parent domain whose site info scoring needs.
	domains := make(map[string]bool)
	for _, lw := range links {
		if h := netloc(lw.URL); h != "" {
			domains[h] = true
		}
		if h := superNetloc(lw.URL); h != "" {
			domains[h] = true
		}
	}

	// Pre-fetch site info concurrently. The semaphore bounds the number of
	// goroutines (and storage reads) in flight; one goroutine per domain
	// could mean hundreds of thousands at once.
	const prefetchWorkers = 64
	siteCache := make(map[string]*storage.SiteInfo, len(domains))
	var (
		mu sync.Mutex
		wg sync.WaitGroup
	)
	sem := make(chan struct{}, prefetchWorkers)
	for d := range domains {
		wg.Add(1)
		sem <- struct{}{}
		go func(host string) {
			defer wg.Done()
			defer func() { <-sem }()
			info, _ := c.db.GetSiteInfo(host) // scoreURL tolerates a nil record
			mu.Lock()
			siteCache[host] = info
			mu.Unlock()
		}(d)
	}
	wg.Wait()

	// Score each URL.
	scoredList := make([]scoredURL, len(links))
	for i, lw := range links {
		scoredList[i] = scoredURL{url: lw.URL, score: c.scoreURL(lw, siteCache)}
	}

	// Weighted random sample (45000 or 1/3+250, whichever is smaller).
	k := min(45000, len(scoredList)/3+250)
	selected := weightedSample(scoredList, k)

	// Domain concentration filtering.
	selected = concentrationFilter(selected, config.CrawlFocus)

	// Separate https/http, cap http at 1/4 of the https count.
	var httpsURLs, httpURLs []string
	for _, s := range selected {
		if strings.HasPrefix(s, "https://") {
			httpsURLs = append(httpsURLs, s)
		} else {
			httpURLs = append(httpURLs, s)
		}
	}
	maxHTTP := len(httpsURLs) / 4
	if len(httpURLs) > maxHTTP {
		httpURLs = sampleStrings(httpURLs, maxHTTP)
	}

	// Separate prosperous (positive backlink score) from other URLs.
	var prosperURLs, otherURLs []string
	for _, u := range append(httpsURLs, httpURLs...) {
		if c.prosperMap[netloc(u)] > 0 {
			prosperURLs = append(prosperURLs, u)
		} else {
			otherURLs = append(otherURLs, u)
		}
	}

	// n is the number of other URLs that would hit the expected ratio. The
	// reduction towards n is limited to a tenth of the selection per round.
	n := int(float64(len(prosperURLs)) * (1 - config.ExpectedProsperRatio) / config.ExpectedProsperRatio)
	if len(otherURLs) > n {
		keep := max(len(otherURLs)-len(selected)/10, n)
		if keep < len(otherURLs) {
			otherURLs = sampleStrings(otherURLs, keep)
		}
	}

	result := append(prosperURLs, otherURLs...)
	rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] })
	return result
}
// scoreURL computes the scheduling priority of a discovered link. The score
// is the product of: how Chinese the host's language mix is, how much
// interest is left in the host and its parent domain given their visit
// counts, the stored site quality, the URL shape penalty from badURL, the
// link's discovery weight and a logarithmic bonus for the backlink score.
func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo) float64 {
	host, parent := netloc(lw.URL), superNetloc(lw.URL)

	site := siteCache[host]
	if site == nil {
		site = &storage.SiteInfo{}
	}

	// Share of "zh" in the language mix; 0.5 when nothing is known.
	chineseness := 0.5
	if len(site.Languages) > 0 {
		var sum float64
		for _, share := range site.Languages {
			sum += share
		}
		if sum > 0 {
			chineseness = site.Languages["zh"] / sum
		}
	}

	// Interest drops to 10 % once the visit count reaches the limit, which
	// grows with the (capped) backlink score.
	backlinks := math.Min(62, c.prosperMap[host])
	hostLimit := backlinks*500 + 50
	hostInterest := math.Pow(math.Pow(0.1, 1/hostLimit), float64(site.VisitCount))

	parentInterest := 1.0
	if parent != host {
		if parentSite := siteCache[parent]; parentSite != nil {
			parentLimit := math.Min(62, c.prosperMap[parent])*500 + 50
			parentInterest = math.Pow(math.Pow(0.1, 1/parentLimit), float64(parentSite.VisitCount))
		}
	}

	quality := 1.0
	if site.Quality != nil {
		quality = *site.Quality
	}

	bonus := backlinks
	if bonus > 0 {
		bonus += 0.5
	}
	bonus = math.Log2(2+bonus) + 1

	penalty := badURL(lw.URL)
	return (0.1 + chineseness) * math.Min(0.05+hostInterest, 0.05+parentInterest) * quality * (1 - penalty) * lw.Weight * bonus
}
// ---- helper functions ----
// netloc returns the authority ("host" or "host:port") of rawURL.
//
// URLs that start with "http://" or "https://" take a fast path that cuts the
// authority out of the string directly; it ends at the first '/', '?' or '#',
// so a URL without a path but with a query or fragment still yields a clean
// host. Everything else goes through url.Parse; "" is returned when parsing
// fails or the URL has no host.
func netloc(rawURL string) string {
	for _, prefix := range [...]string{"https://", "http://"} {
		if !strings.HasPrefix(rawURL, prefix) {
			continue
		}
		authority := rawURL[len(prefix):]
		if end := strings.IndexAny(authority, "/?#"); end >= 0 {
			authority = authority[:end]
		}
		return authority
	}
	u, err := url.Parse(rawURL)
	if err != nil {
		return ""
	}
	return u.Host
}
// superNetloc returns the last two dot-separated labels of rawURL's host
// ("www.example.com" → "example.com"). Hosts with fewer than two labels are
// returned unchanged.
// NOTE(review): hosts under multi-label suffixes (e.g. "a.example.com.cn")
// collapse to the suffix itself ("com.cn").
func superNetloc(rawURL string) string {
	host := netloc(rawURL)
	labels := strings.Split(host, ".")
	if len(labels) < 2 {
		return host
	}
	return labels[len(labels)-2] + "." + labels[len(labels)-1]
}
// badURL rates how unattractive a URL looks, from 0 (fine) to 0.9 (worst).
// The base score grows with the length beyond 30 bytes; each of the following
// then closes part of the remaining gap to 1: a ".htm"/".php" substring
// (30 %), more than two slashes once trailing slashes are removed (10 %),
// and a URL that is shorter than 5 bytes or has ':' as its fifth byte, as in
// "http:" (30 %).
func badURL(u string) float64 {
	score := float64(len(u)-30) / 200.0
	if score < 0 {
		score = 0
	}
	penalise := func(fraction float64) {
		score += (1 - score) * fraction
	}

	if strings.Contains(u, ".htm") || strings.Contains(u, ".php") {
		penalise(0.3)
	}
	if strings.Count(strings.TrimRight(u, "/"), "/") > 2 {
		penalise(0.1)
	}
	if len(u) < 5 || u[4] == ':' {
		penalise(0.3)
	}

	if score > 0.9 {
		return 0.9
	}
	return score
}
// truncate shortens s to at most n bytes without splitting a multi-byte
// UTF-8 character: when the cut would land inside a character, the whole
// character is dropped. A non-positive n yields "".
func truncate(s string, n int) string {
	if len(s) <= n {
		return s
	}
	if n <= 0 {
		return ""
	}
	// s[n] is the first byte that would be dropped. A UTF-8 continuation
	// byte (bit pattern 10xxxxxx) there means the cut is inside a character,
	// so move back to the start of that character.
	for n > 0 && s[n]&0xC0 == 0x80 {
		n--
	}
	return s[:n]
}
// sampleStrings returns n elements of s chosen uniformly at random without
// replacement. When s has at most n elements, s itself is returned (same
// backing array, original order).
func sampleStrings(s []string, n int) []string {
	if n >= len(s) {
		return s
	}
	order := rand.Perm(len(s))
	picked := make([]string, 0, n)
	for _, idx := range order[:n] {
		picked = append(picked, s[idx])
	}
	return picked
}
// sampleURLWeights returns n elements of s chosen uniformly at random without
// replacement. When s has at most n elements, s itself is returned.
func sampleURLWeights(s []URLWeight, n int) []URLWeight {
	if n >= len(s) {
		return s
	}
	order := rand.Perm(len(s))
	picked := make([]URLWeight, 0, n)
	for _, idx := range order[:n] {
		picked = append(picked, s[idx])
	}
	return picked
}
// scoredURL pairs a candidate URL with its scheduling score.
type scoredURL struct {
	url   string
	score float64
}

// weightedSample draws k URLs from items without replacement, where the
// chance of an item being drawn next is proportional to its score among the
// items still available. When k covers all items, every URL is returned in
// input order.
//
// Each item gets the random key ln(u)/score with u uniform in [0,1), and the
// k largest keys win (Efraimidis–Spirakis sampling). This yields the same
// distribution as drawing one item at a time but costs one sort instead of a
// scan per draw. Items whose score is zero, negative or NaN are only used
// when the positively scored items run out, and then in input order.
func weightedSample(items []scoredURL, k int) []string {
	if k >= len(items) {
		out := make([]string, len(items))
		for i, s := range items {
			out[i] = s.url
		}
		return out
	}
	if k <= 0 {
		return []string{}
	}

	type keyed struct {
		idx int
		key float64
	}
	keys := make([]keyed, len(items))
	for i, s := range items {
		key := math.Inf(-1) // unusable score: ranked last
		switch {
		case math.IsInf(s.score, 1):
			key = 0 // larger than every ln(u)/score, which is negative
		case s.score > 0:
			key = math.Log(rand.Float64()) / s.score
		}
		keys[i] = keyed{idx: i, key: key}
	}
	// Stable, so items with equal keys (the unusable ones) keep input order.
	sort.SliceStable(keys, func(a, b int) bool { return keys[a].key > keys[b].key })

	out := make([]string, k)
	for i := range out {
		out[i] = items[keys[i].idx].url
	}
	return out
}
// concentrationFilter limits how many URLs a single parent domain may
// contribute. URLs are grouped by superNetloc; a group of size s receives the
// quota int(s^k), and every group may keep 1 + min(limit, quota) URLs, chosen
// at random. limit is 60 % of the summed quotas of all groups except the
// largest one, and at least 10. The result is returned in random order.
func concentrationFilter(urls []string, k float64) []string {
	// Shuffle a copy first so that the URLs kept per group are random.
	shuffled := make([]string, len(urls))
	copy(shuffled, urls)
	rand.Shuffle(len(shuffled), func(i, j int) { shuffled[i], shuffled[j] = shuffled[j], shuffled[i] })

	domainGroups := make(map[string][]string)
	for _, u := range shuffled {
		d := superNetloc(u)
		domainGroups[d] = append(domainGroups[d], u)
	}

	quota := func(group []string) int {
		return int(math.Pow(float64(len(group)), k))
	}

	limit := 10
	if len(domainGroups) > 1 {
		// Sum every quota except the largest, so that one dominant domain
		// cannot raise its own ceiling.
		total, largest := 0, 0
		for _, g := range domainGroups {
			q := quota(g)
			total += q
			if q > largest {
				largest = q
			}
		}
		limit = max(10, int(float64(total-largest)*0.6))
	}

	var result []string
	for _, g := range domainGroups {
		sn := 1 + min(limit, quota(g))
		if sn > len(g) {
			sn = len(g)
		}
		result = append(result, g[:sn]...)
	}
	rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] })
	return result
}
// truncateMap returns a map with at most n entries of m. When m already has
// at most n entries, m itself is returned; otherwise a new map is filled with
// the first n entries in map iteration order, i.e. an arbitrary subset.
func truncateMap(m map[string]string, n int) map[string]string {
	if n >= len(m) {
		return m
	}
	kept := make(map[string]string, n)
	for key, val := range m {
		if len(kept) >= n {
			break
		}
		kept[key] = val
	}
	return kept
}
// min returns the smaller of two ints.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
// max returns the larger of two ints.
func max(a, b int) int {
	if b > a {
		return b
	}
	return a
}
// GetStats returns a snapshot of the crawl counters for monitoring. Each
// counter is loaded atomically on its own.
func (c *Crawler) GetStats() Stats {
	var snap Stats
	snap.VisitedURLs = atomic.LoadInt64(&c.stats.VisitedURLs)
	snap.SuccessURLs = atomic.LoadInt64(&c.stats.SuccessURLs)
	snap.KeywordsFetched = atomic.LoadInt64(&c.stats.KeywordsFetched)
	return snap
}
+313
View File
@@ -0,0 +1,313 @@
// Package crawler implements the HTTP fetching layer with robots.txt compliance,
// per-host rate limiting, redirect tracking, and encoding detection.
package crawler
import (
"fmt"
"io"
"net/http"
"net/url"
"strings"
"sync"
"time"
"golang.org/x/net/html/charset"
)
// ErrCrawl marks an expected crawl failure, such as a 404, a robots.txt
// disallow or a non-HTML content type.
type ErrCrawl struct {
	Msg string // human-readable reason
}

// Error implements the error interface by returning the stored message.
func (e *ErrCrawl) Error() string {
	return e.Msg
}
// FetchResult bundles the result of a successful fetch.
type FetchResult struct {
Body string // decoded HTML body
FinalURL string // URL after redirects
Redirects map[string]string // permanent redirects: from → to
// ServerType is the value of the response's "Server" header ("" if absent).
ServerType string
}
// Fetcher is a reusable HTTP client with robots.txt awareness and rate limiting.
type Fetcher struct {
// client is configured by NewFetcher (30 s timeout, at most 10 redirects).
client *http.Client
// userAgent is sent as the User-Agent header and matched against
// robots.txt groups.
userAgent string
// cooldown is the minimum spacing between polite requests to one host.
cooldown time.Duration
// rateMu guards lastHit.
rateMu sync.Mutex
lastHit map[string]time.Time // host → last request time
// robotsMu guards robots.
robotsMu sync.Mutex
robots map[string]*robotsEntry // host → parsed robots
}
// robotsEntry is the cached robots.txt of one host.
type robotsEntry struct {
// rules holds the parsed groups; it is empty when no robots.txt could be
// fetched, which allows everything.
rules []robotsRule
// fetchedAt is when the download was attempted; robotsAllowed refreshes
// entries older than 24 hours.
fetchedAt time.Time
}
// robotsRule is one robots.txt group: a user-agent and its path rules.
type robotsRule struct {
// userAgent is the value of the group's User-agent line ("*" = everyone).
userAgent string
// disallow holds the Disallow values, matched as path prefixes.
disallow []string
// allow holds the Allow values, matched as path prefixes.
allow []string
}
// NewFetcher returns a Fetcher that identifies itself as userAgent and keeps
// at least cooldown between polite requests to the same host. Its HTTP client
// has a 30 s timeout and follows at most 10 redirects.
func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
	limitRedirects := func(req *http.Request, via []*http.Request) error {
		if len(via) < 10 {
			return nil
		}
		return fmt.Errorf("too many redirects")
	}

	f := &Fetcher{
		userAgent: userAgent,
		cooldown:  cooldown,
		lastHit:   map[string]time.Time{},
		robots:    map[string]*robotsEntry{},
	}
	f.client = &http.Client{
		Timeout:       30 * time.Second,
		CheckRedirect: limitRedirects,
	}
	return f
}
// Fetch fetches rawURL, respecting robots.txt and rate limits.
// polite=false skips both checks (used by search server snippet fetcher).
// timeout bounds the whole request; a sizeLimit greater than 0 caps the
// number of body bytes read, 0 means no cap. Expected crawl failures are
// returned as *ErrCrawl.
func (f *Fetcher) Fetch(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
// Thin public wrapper around fetchWithHistory.
return f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
}
// FetchSafe behaves like Fetch but hides expected crawl failures: when the
// fetch fails with an *ErrCrawl it returns (nil, nil). Any other error is
// passed through unchanged.
func (f *Fetcher) FetchSafe(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
	res, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
	switch err.(type) {
	case *ErrCrawl:
		return nil, nil
	default:
		return res, err
	}
}
// fetchWithHistory performs the GET request for rawURL and returns the
// decoded HTML body, the final URL, the permanent (301/308) redirects seen on
// the way and the Server header.
//
// With polite set, the per-host rate limit and robots.txt are applied to the
// host of rawURL first. timeout bounds the whole request; a sizeLimit greater
// than 0 caps the number of body bytes read, 0 means no cap.
//
// Expected failures are returned as *ErrCrawl: unparsable or non-HTTP(S)
// URLs, robots.txt disallow, status codes of 400 and above, and content
// types other than text/html. Transport errors are returned unchanged.
func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
	}
	// Reject links such as mailto:, javascript: or relative paths before
	// they reach the rate limiter and trigger a pointless robots.txt fetch.
	// url.Parse lower-cases the scheme.
	if (parsed.Scheme != "http" && parsed.Scheme != "https") || parsed.Host == "" {
		return nil, &ErrCrawl{Msg: "invalid url: not an absolute http(s) url"}
	}
	req, err := http.NewRequest("GET", rawURL, nil)
	if err != nil {
		return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
	}
	req.Header.Set("User-Agent", f.userAgent)

	host := parsed.Host
	if polite {
		f.rateLimit(host)
		if !f.robotsAllowed(rawURL, host) {
			return nil, &ErrCrawl{Msg: "disallowed by robots.txt"}
		}
	}

	redirects := make(map[string]string)
	// A per-call client is needed for the per-call timeout and redirect
	// recorder; with a nil Transport it still shares http.DefaultTransport.
	client := &http.Client{
		Timeout: timeout,
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			if len(via) >= 10 {
				return fmt.Errorf("too many redirects")
			}
			// req.Response is the response that caused this redirect.
			if req.Response != nil && (req.Response.StatusCode == 301 || req.Response.StatusCode == 308) {
				from := via[len(via)-1].URL.String()
				to := req.URL.String()
				redirects[from] = to
			}
			return nil
		},
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode == 404 {
		return nil, &ErrCrawl{Msg: "404 not found"}
	}
	if resp.StatusCode >= 400 {
		return nil, &ErrCrawl{Msg: fmt.Sprintf("HTTP %d", resp.StatusCode)}
	}
	ct := resp.Header.Get("Content-Type")
	// Media types are case-insensitive ("Text/HTML" is valid).
	if !strings.Contains(strings.ToLower(ct), "text/html") {
		return nil, &ErrCrawl{Msg: "not html: " + ct}
	}

	body, err := decodeBody(resp.Body, ct, sizeLimit)
	if err != nil {
		return nil, err
	}
	return &FetchResult{
		Body:       body,
		FinalURL:   resp.Request.URL.String(),
		Redirects:  redirects,
		ServerType: resp.Header.Get("Server"),
	}, nil
}
// rateLimit blocks until the caller may send a request to host so that
// consecutive requests to one host are at least f.cooldown apart.
//
// Each caller reserves the next free time slot for the host while holding the
// lock and then sleeps until that slot. Reserving the slot (instead of just
// storing the time of the call) keeps concurrent callers for the same host
// spaced out; otherwise they would all measure against nearly the same
// timestamp and wake up together.
func (f *Fetcher) rateLimit(host string) {
	f.rateMu.Lock()
	now := time.Now()
	slot := now
	if last, ok := f.lastHit[host]; ok {
		if next := last.Add(f.cooldown); next.After(now) {
			slot = next
		}
	}
	f.lastHit[host] = slot

	// Prune the map once it grows large. Only slots well in the past are
	// dropped, so pending reservations survive.
	if len(f.lastHit) > 10000 {
		cutoff := now.Add(-f.cooldown * 2)
		for k, v := range f.lastHit {
			if v.Before(cutoff) {
				delete(f.lastHit, k)
			}
		}
	}
	f.rateMu.Unlock()

	// Sleep outside the lock so other hosts are not held up.
	if wait := slot.Sub(now); wait > 0 {
		time.Sleep(wait)
	}
}
// robotsAllowed reports whether the robots.txt of host permits fetching
// rawURL. The parsed robots.txt is cached per host and refreshed after 24
// hours. URLs that cannot be parsed are reported as not allowed.
//
// Groups for "*" and for this fetcher's user-agent (case-insensitive) apply.
// Among all Allow/Disallow values that are a prefix of the URL path, the
// longest one decides and Allow wins a tie. Empty values match nothing.
// Wildcards ("*", "$") are not interpreted.
func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
	f.robotsMu.Lock()
	entry, ok := f.robots[host]
	f.robotsMu.Unlock()
	if !ok || time.Since(entry.fetchedAt) > 24*time.Hour {
		// Downloaded outside the lock so a slow host does not block
		// lookups for other hosts; concurrent callers may fetch twice.
		entry = f.fetchRobots(host, rawURL)
		f.robotsMu.Lock()
		f.robots[host] = entry
		f.robotsMu.Unlock()
	}

	parsed, err := url.Parse(rawURL)
	if err != nil {
		return false
	}
	path := parsed.Path
	if path == "" {
		path = "/"
	}

	allowed, bestLen := true, 0
	for _, rule := range entry.rules {
		if rule.userAgent != "*" && !strings.EqualFold(rule.userAgent, f.userAgent) {
			continue
		}
		for _, dis := range rule.disallow {
			if dis != "" && len(dis) > bestLen && strings.HasPrefix(path, dis) {
				allowed, bestLen = false, len(dis)
			}
		}
		for _, a := range rule.allow {
			// ">=" lets Allow win against an equally long Disallow.
			if a != "" && len(a) >= bestLen && strings.HasPrefix(path, a) {
				allowed, bestLen = true, len(a)
			}
		}
	}
	return allowed
}
// fetchRobots downloads and parses robots.txt for host. The scheme follows
// exampleURL ("http" only when it starts with "http://", otherwise "https").
// At most 256 KiB are read. When the file cannot be fetched (request error,
// non-200 status, read error), the returned entry has no rules, which allows
// everything.
func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
	entry := &robotsEntry{fetchedAt: time.Now()}

	scheme := "https"
	if strings.HasPrefix(exampleURL, "http://") {
		scheme = "http"
	}
	robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)

	req, err := http.NewRequest("GET", robotsURL, nil)
	if err != nil {
		return entry // malformed host: nothing to fetch
	}
	req.Header.Set("User-Agent", f.userAgent)

	client := &http.Client{Timeout: 5 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return entry // allow all if robots.txt unavailable
	}
	// Close the body on every path, including non-200 answers.
	defer resp.Body.Close()
	if resp.StatusCode != 200 {
		return entry
	}

	body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
	if err != nil {
		return entry
	}
	entry.rules = parseRobots(string(body))
	return entry
}
// parseRobots is a minimal robots.txt parser. It returns one robotsRule per
// User-agent line, carrying the Allow/Disallow values of its group.
//
// A group is a run of User-agent lines followed by their directives; the
// next User-agent line after a directive starts a new group. Consecutive
// User-agent lines share the directives that follow. Comments ("#" to end of
// line) and blank lines are skipped and never end a group. Directives before
// the first User-agent line and unknown keys are ignored.
func parseRobots(content string) []robotsRule {
	var (
		rules           []robotsRule
		agents          []string // user-agents of the group being read
		disallow, allow []string // directives of the group being read
		sawDirective    bool     // a directive was read for the group
	)
	// flush emits the current group, one rule per user-agent, and resets it.
	flush := func() {
		for _, ua := range agents {
			rules = append(rules, robotsRule{userAgent: ua, disallow: disallow, allow: allow})
		}
		agents, disallow, allow, sawDirective = nil, nil, nil, false
	}

	for _, line := range strings.Split(content, "\n") {
		if idx := strings.Index(line, "#"); idx >= 0 {
			line = line[:idx]
		}
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}
		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}
		key := strings.TrimSpace(strings.ToLower(parts[0]))
		val := strings.TrimSpace(parts[1])
		switch key {
		case "user-agent":
			if sawDirective {
				flush()
			}
			agents = append(agents, val)
		case "disallow":
			if len(agents) > 0 {
				disallow = append(disallow, val)
				sawDirective = true
			}
		case "allow":
			if len(agents) > 0 {
				allow = append(allow, val)
				sawDirective = true
			}
		}
	}
	flush()
	return rules
}
// decodeBody reads r to the end — or at most sizeLimit bytes when sizeLimit
// is greater than 0 — and returns the content as a string. The bytes pass
// through charset.NewReader, which is given the Content-Type value; if that
// reader cannot be created, the raw bytes are returned as they are.
func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error) {
	src := r
	if sizeLimit > 0 {
		src = io.LimitReader(r, int64(sizeLimit))
	}

	decoded, err := charset.NewReader(src, contentType)
	if err != nil {
		// Fall back to the raw stream and hope it is UTF-8.
		decoded = src
	}

	raw, err := io.ReadAll(decoded)
	if err != nil {
		return "", err
	}
	return string(raw), nil
}