加上中文注释
This commit is contained in:
+125
-70
@@ -1,43 +1,44 @@
|
||||
// crawler.go — BFS crawl loop, URL scheduling, and site-info updating.
|
||||
// crawler 包的主逻辑:BFS 爬取循环、URL 调度算法、网站元信息更新。
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"log"
|
||||
"math"
|
||||
"math/rand"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
"bytes" // 字节缓冲(构造 HTTP POST 请求体)
|
||||
"encoding/json" // JSON 序列化(发送关键词数据到 harvester)
|
||||
"log" // 日志输出
|
||||
"math" // 数学运算(指数衰减、质量评分)
|
||||
"math/rand" // 随机数(加权采样、队列打乱)
|
||||
"net/http" // HTTP 客户端(POST 数据到 harvester)
|
||||
"net/url" // URL 解析
|
||||
"strings" // 字符串操作
|
||||
"sync" // 互斥锁(保护并发收集结果)
|
||||
"sync/atomic" // 原子操作(计数器,无锁并发更新)
|
||||
"time" // 时间戳
|
||||
|
||||
"sese-engine/analyzer"
|
||||
"sese-engine/config"
|
||||
"sese-engine/parser"
|
||||
"sese-engine/storage"
|
||||
"sese-engine/analyzer" // 文本分析和关键词提取
|
||||
"sese-engine/config" // 全局配置常量
|
||||
"sese-engine/parser" // HTML 解析(提取标题、正文、链接)
|
||||
"sese-engine/storage" // 持久化存储
|
||||
)
|
||||
|
||||
|
||||
// Stats holds the crawler's real-time counters. The crawler updates the
// fields with sync/atomic operations, so concurrent readers should load
// them atomically as well.
type Stats struct {
	VisitedURLs     int64 // URLs attempted so far, including fetches that failed
	SuccessURLs     int64 // URLs whose fetch returned a result without error
	KeywordsFetched int64 // running total of keywords extracted from fetched pages
}
|
||||
|
||||
// Crawler orchestrates the BFS crawl.
|
||||
// Crawler 编排整个 BFS 爬取流程。
|
||||
type Crawler struct {
|
||||
fetcher *Fetcher
|
||||
db *storage.DB
|
||||
analyzer *analyzer.Analyzer
|
||||
prosperMap map[string]float64 // domain → backlink score (loaded from info)
|
||||
stats Stats
|
||||
fetcher *Fetcher // HTTP 抓取器(含 robots.txt 和限流)
|
||||
db *storage.DB // 持久化数据库
|
||||
analyzer *analyzer.Analyzer // 分词和关键词分析
|
||||
prosperMap map[string]float64 // 域名 → 反向链接繁荣值(来自 info 模块,越大越"有价值")
|
||||
stats Stats // 原子计数器
|
||||
}
|
||||
|
||||
// New creates a Crawler.
|
||||
// New 创建一个 Crawler 实例。
|
||||
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
||||
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||
return &Crawler{
|
||||
fetcher: NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second),
|
||||
@@ -47,40 +48,46 @@ func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *C
|
||||
}
|
||||
}
|
||||
|
||||
// URLWeight pairs a discovered URL with its discovery weight. The weight is
// one of the factors multiplied into the URL's scheduling score.
type URLWeight struct {
	URL    string  // URL that is a candidate for the next crawl round
	Weight float64 // discovery weight; Run computes 1/n for a parent page with n links
}
|
||||
|
||||
// Run starts the BFS crawl from entryURL, running for maxEpoch rounds.
|
||||
// It blocks until completion.
|
||||
// Run 启动 BFS 爬取,从 entryURL 开始,执行最多 maxEpoch 轮。
|
||||
// 各轮之间是串行的,每轮内并发抓取,按调度算法选择下一轮 URL。
|
||||
func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
||||
visited := make(map[string]bool)
|
||||
queue := []string{entryURL}
|
||||
visited := make(map[string]bool) // 已访问 URL 集合(防止重复抓取)
|
||||
queue := []string{entryURL} // 当前轮次的待抓取队列
|
||||
|
||||
for ep := 0; ep < maxEpoch; ep++ {
|
||||
log.Printf("[crawler] epoch %d/%d queue=%d", ep+1, maxEpoch, len(queue))
|
||||
// 将本轮所有 URL 标记为已访问(防止下一轮重复入队)
|
||||
for _, u := range queue {
|
||||
visited[u] = true
|
||||
}
|
||||
|
||||
// 并发抓取本轮所有 URL
|
||||
var (
|
||||
newLinks []URLWeight
|
||||
mu sync.Mutex
|
||||
newLinks []URLWeight // 收集下一轮候选 URL
|
||||
mu sync.Mutex // 保护 newLinks 的并发写入
|
||||
wg sync.WaitGroup
|
||||
)
|
||||
|
||||
// 信号量:限制同时并发数不超过配置的工作线程数
|
||||
sem := make(chan struct{}, config.CrawlerWorkers)
|
||||
for _, u := range queue {
|
||||
wg.Add(1)
|
||||
sem <- struct{}{}
|
||||
sem <- struct{}{} // 获取一个令牌(阻塞直到有空闲槽位)
|
||||
go func(rawURL string) {
|
||||
defer wg.Done()
|
||||
defer func() { <-sem }()
|
||||
defer func() { <-sem }() // 释放令牌
|
||||
|
||||
// 抓取单个 URL,返回发现的子链接
|
||||
hrefs := c.visitURL(rawURL)
|
||||
n := len(hrefs)
|
||||
if n > 0 {
|
||||
// 每个子链接分得 1/n 的父页面权重
|
||||
w := 1.0 / float64(n)
|
||||
mu.Lock()
|
||||
for _, h := range hrefs {
|
||||
@@ -94,30 +101,34 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
// 本轮没有发现新链接,爬取结束
|
||||
if len(newLinks) == 0 {
|
||||
log.Println("[crawler] empty queue — stopping")
|
||||
return
|
||||
}
|
||||
|
||||
// 调度算法:从候选 URL 中选出下一轮要抓取的队列
|
||||
queue = c.schedule(newLinks)
|
||||
}
|
||||
}
|
||||
|
||||
// visitURL fetches a URL, stores keywords, updates site info, returns discovered hrefs.
|
||||
// visitURL 抓取一个 URL,提取关键词、缓存摘要、更新网站元信息,返回页面中发现的子链接。
|
||||
func (c *Crawler) visitURL(rawURL string) []string {
|
||||
atomic.AddInt64(&c.stats.VisitedURLs, 1)
|
||||
atomic.AddInt64(&c.stats.VisitedURLs, 1) // 计数器 +1
|
||||
|
||||
// 礼貌模式抓取(遵守 robots.txt + 限流),超时 10 秒,不限制大小
|
||||
res, err := c.fetcher.fetchWithHistory(rawURL, true, 10*time.Second, 0)
|
||||
if err != nil || res == nil {
|
||||
c.updateSiteFailure(rawURL)
|
||||
c.updateSiteFailure(rawURL) // 记录失败,更新该网站成功率
|
||||
return nil
|
||||
}
|
||||
|
||||
atomic.AddInt64(&c.stats.SuccessURLs, 1)
|
||||
atomic.AddInt64(&c.stats.SuccessURLs, 1) // 成功计数器 +1
|
||||
|
||||
// 解析 HTML:提取标题、描述、正文和所有超链接
|
||||
title, desc, text, hrefs := parser.ParseHTML(res.Body, res.FinalURL)
|
||||
|
||||
// Cache snippet
|
||||
// 缓存 URL 摘要(仅对短 URL 缓存,防止超长 URL 浪费空间)
|
||||
if len(res.FinalURL) < 250 {
|
||||
_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
|
||||
Title: title,
|
||||
@@ -127,21 +138,23 @@ func (c *Crawler) visitURL(rawURL string) []string {
|
||||
})
|
||||
}
|
||||
|
||||
// Keyword extraction → send to harvester
|
||||
// 关键词提取:将标题/描述/正文交给 analyzer 计算关键词权重
|
||||
kws := c.analyzer.Analyze(title, desc, text)
|
||||
if len(kws) > 0 {
|
||||
// 限制每个页面最多发送的关键词数量
|
||||
if len(kws) > config.MaxKeywordsPerPage {
|
||||
kws = kws[:config.MaxKeywordsPerPage]
|
||||
}
|
||||
atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
|
||||
// 异步发送到收获服务器写入倒排索引(不阻塞爬取流程)
|
||||
go c.sendToHarvester(res.FinalURL, kws)
|
||||
}
|
||||
|
||||
// Update site info
|
||||
// 更新网站元信息(成功访问)
|
||||
host := netloc(res.FinalURL)
|
||||
c.updateSiteSuccess(host, res, title, desc, text, hrefs)
|
||||
|
||||
// Handle permanent redirects in site info
|
||||
// 处理永久重定向:更新源主机名下的重定向映射
|
||||
for from, to := range res.Redirects {
|
||||
fromHost := netloc(from)
|
||||
if fromHost == "" {
|
||||
@@ -152,20 +165,21 @@ func (c *Crawler) visitURL(rawURL string) []string {
|
||||
info.Redirects = make(map[string]string)
|
||||
}
|
||||
info.Redirects[from] = to
|
||||
// 重定向映射过多时裁剪到 40 条
|
||||
if len(info.Redirects) > 50 {
|
||||
// keep most important (just truncate randomly for now)
|
||||
info.Redirects = truncateMap(info.Redirects, 40)
|
||||
}
|
||||
_ = c.db.SetSiteInfo(fromHost, info)
|
||||
}
|
||||
|
||||
// Trim hrefs
|
||||
// 限制返回的链接数,防止下一轮队列爆炸
|
||||
if len(hrefs) > 100 {
|
||||
hrefs = sampleStrings(hrefs, 100)
|
||||
}
|
||||
return hrefs
|
||||
}
|
||||
|
||||
// updateSiteFailure 当某 URL 抓取失败时,更新该网站的访问成功率(指数衰减)。
|
||||
func (c *Crawler) updateSiteFailure(rawURL string) {
|
||||
host := netloc(rawURL)
|
||||
if host == "" {
|
||||
@@ -176,27 +190,33 @@ func (c *Crawler) updateSiteFailure(rawURL string) {
|
||||
zero := 0.0
|
||||
info.SuccessRate = &zero
|
||||
}
|
||||
// 成功率每次失败乘以 0.99(无限趋近 0)
|
||||
*info.SuccessRate *= 0.99
|
||||
_ = c.db.SetSiteInfo(host, info)
|
||||
}
|
||||
|
||||
// updateSiteSuccess 当某 URL 抓取成功时,更新网站的完整元信息。
|
||||
func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc, text string, hrefs []string) {
|
||||
info, _ := c.db.GetSiteInfo(host)
|
||||
|
||||
// 访问计数 +1,更新最后访问时间
|
||||
info.VisitCount++
|
||||
info.LastVisitTime = time.Now().Unix()
|
||||
|
||||
// Success-rate update: exponential moving average that moves 1% of the way toward 1.0 on each successful visit.
|
||||
one := 1.0
|
||||
if info.SuccessRate == nil {
|
||||
info.SuccessRate = &one
|
||||
}
|
||||
*info.SuccessRate = *info.SuccessRate*0.99 + 0.01
|
||||
|
||||
// 记录是否支持 HTTPS
|
||||
if strings.HasPrefix(res.FinalURL, "https://") {
|
||||
t := true
|
||||
info.HTTPSAvailable = &t
|
||||
}
|
||||
|
||||
// 记录 HTTP Server 类型(去重,保留最近 5 个)
|
||||
if res.ServerType != "" {
|
||||
found := false
|
||||
for _, s := range info.ServerTypes {
|
||||
@@ -213,20 +233,22 @@ func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc,
|
||||
}
|
||||
}
|
||||
|
||||
// Language detection — sample 10% or first 10 visits
|
||||
// 语言检测和出站链接收集(仅在前 10 次访问或 10% 概率下触发,减少开销)
|
||||
if info.VisitCount < 10 || rand.Float64() < 0.1 {
|
||||
lang := c.analyzer.DetectLanguage(title + " " + desc + " " + text)
|
||||
if lang != "" {
|
||||
if info.Languages == nil {
|
||||
info.Languages = make(map[string]float64)
|
||||
}
|
||||
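// Blend weight for the detected language: capped at 0.2, and shrinking as 1/sqrt(VisitCount+1) once the site has more than 24 recorded visits.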
|
||||
intensity := math.Min(0.2, 1/math.Sqrt(float64(info.VisitCount+1)))
|
||||
for k := range info.Languages {
|
||||
info.Languages[k] *= (1 - intensity)
|
||||
info.Languages[k] *= (1 - intensity) // 旧语种按 intensity 衰减
|
||||
}
|
||||
info.Languages[lang] += intensity
|
||||
info.Languages[lang] += intensity // 新语种增加
|
||||
}
|
||||
// Collect external links
|
||||
|
||||
// 收集外链(跨顶级域名的链接)
|
||||
superHost := superNetloc(res.FinalURL)
|
||||
var external []string
|
||||
for _, h := range hrefs {
|
||||
@@ -234,8 +256,10 @@ func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc,
|
||||
external = append(external, h)
|
||||
}
|
||||
}
|
||||
// 最多保留 10 条外链
|
||||
sampled := sampleStrings(external, 10)
|
||||
info.OutLinks = append(info.OutLinks, sampled...)
|
||||
// 外链超过 250 条时采样到 200 条
|
||||
if len(info.OutLinks) > 250 {
|
||||
info.OutLinks = sampleStrings(info.OutLinks, 200)
|
||||
}
|
||||
@@ -244,10 +268,10 @@ func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc,
|
||||
_ = c.db.SetSiteInfo(host, info)
|
||||
}
|
||||
|
||||
// sendToHarvester POSTs keyword data to the harvester service.
|
||||
// sendToHarvester 将关键词索引数据通过 HTTP POST 发送到收获服务器(:5000/l 端点)。
|
||||
func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
|
||||
type payload struct {
|
||||
URL string `json:"url"`
|
||||
URL string `json:"url"`
|
||||
Keywords []analyzer.Keyword `json:"keywords"`
|
||||
}
|
||||
p := payload{URL: finalURL, Keywords: kws}
|
||||
@@ -263,13 +287,15 @@ func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
// schedule selects and prioritises the next BFS queue from raw discovered links.
|
||||
// schedule 从候选 URL 集合中选出下一轮 BFS 队列。
|
||||
// 包含:域名集中度过滤、HTTP/HTTPS 比例控制、繁荣 URL 占比控制、加权随机采样。
|
||||
func (c *Crawler) schedule(links []URLWeight) []string {
|
||||
// 候选过多时先随机采样到 10 万条,防止内存爆炸
|
||||
if len(links) > 100000 {
|
||||
links = sampleURLWeights(links, 100000)
|
||||
}
|
||||
|
||||
// Pre-fetch site info for all involved domains
|
||||
// 预加载所有涉及的网站信息(加速后续评分计算)
|
||||
domains := make(map[string]bool)
|
||||
for _, lw := range links {
|
||||
if h := netloc(lw.URL); h != "" {
|
||||
@@ -294,20 +320,20 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
// Score each URL
|
||||
// 对所有候选 URL 逐一计算调度优先级分数
|
||||
scored_list := make([]scoredURL, len(links))
|
||||
for i, lw := range links {
|
||||
scored_list[i] = scoredURL{url: lw.URL, score: c.scoreURL(lw, siteCache)}
|
||||
}
|
||||
|
||||
// Weighted random sample (45000 or 1/3+250 whichever smaller)
|
||||
// Weighted random sampling without replacement: draw at most k URLs, each picked with probability proportional to its score.
|
||||
k := min(45000, len(scored_list)/3+250)
|
||||
selected := weightedSample(scored_list, k)
|
||||
|
||||
// Domain concentration filtering
|
||||
// 域名集中度过滤:限制每个域名被选中的数量,防止被少数网站垄断
|
||||
selected = concentrationFilter(selected, config.CrawlFocus)
|
||||
|
||||
// Separate https/http, cap http at 1/4 of https count
|
||||
// 分离 HTTPS 和 HTTP 链接,HTTP 最多占 HTTPS 的 1/4
|
||||
var httpsURLs, httpURLs []string
|
||||
for _, s := range selected {
|
||||
if strings.HasPrefix(s, "https://") {
|
||||
@@ -321,7 +347,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
||||
httpURLs = sampleStrings(httpURLs, maxHTTP)
|
||||
}
|
||||
|
||||
// Separate prosperous / non-prosperous
|
||||
// 分离繁荣(高反向链接)域名和普通域名,按比例控制繁荣 URL 占比
|
||||
var prosperURLs, otherURLs []string
|
||||
for _, u := range append(httpsURLs, httpURLs...) {
|
||||
if c.prosperMap[netloc(u)] > 0 {
|
||||
@@ -330,6 +356,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
||||
otherURLs = append(otherURLs, u)
|
||||
}
|
||||
}
|
||||
// 根据目标繁荣占比计算普通 URL 应保留数量
|
||||
n := int(float64(len(prosperURLs)) * (1-config.ExpectedProsperRatio) / config.ExpectedProsperRatio)
|
||||
if len(otherURLs) > n {
|
||||
keep := max(len(otherURLs)-len(selected)/10, n)
|
||||
@@ -338,12 +365,14 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
||||
}
|
||||
}
|
||||
|
||||
// 合并并随机打乱(使繁荣 URL 和普通 URL 混合)
|
||||
result := append(prosperURLs, otherURLs...)
|
||||
rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] })
|
||||
return result
|
||||
}
|
||||
|
||||
// scoreURL computes the scheduling priority for a URL.
|
||||
// scoreURL 计算单个 URL 的调度优先级分数。
|
||||
// 综合考虑:中文语种权重、域名访问历史衰减、网站质量评分、繁荣值、URL 本身质量。
|
||||
func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo) float64 {
|
||||
host := netloc(lw.URL)
|
||||
super := superNetloc(lw.URL)
|
||||
@@ -353,7 +382,7 @@ func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo)
|
||||
info = &storage.SiteInfo{}
|
||||
}
|
||||
|
||||
// Chinese-ness
|
||||
// 中文倾向性:该网站中文内容占比
|
||||
var chineseness float64 = 0.5
|
||||
if len(info.Languages) > 0 {
|
||||
total := 0.0
|
||||
@@ -365,12 +394,13 @@ func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo)
|
||||
}
|
||||
}
|
||||
|
||||
// Interest decay based on visit count
|
||||
// 兴趣衰减:基于访问次数的指数衰减,繁荣域名可访问更多次
|
||||
prosper := math.Min(62, c.prosperMap[host])
|
||||
limit := prosper*500 + 50
|
||||
b := math.Pow(0.1, 1/limit)
|
||||
interest := math.Pow(b, float64(info.VisitCount))
|
||||
|
||||
// 同理对顶级域名计算衰减(二级域名不够用时看顶级域名)
|
||||
var interest2 float64 = 1.0
|
||||
if super != host {
|
||||
superInfo := siteCache[super]
|
||||
@@ -381,23 +411,28 @@ func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo)
|
||||
}
|
||||
}
|
||||
|
||||
// 网站质量评分
|
||||
quality := 1.0
|
||||
if info.Quality != nil {
|
||||
quality = *info.Quality
|
||||
}
|
||||
|
||||
// 繁荣值加分(log 变换平滑)
|
||||
prosperity := prosper
|
||||
if prosperity > 0 {
|
||||
prosperity += 0.5
|
||||
}
|
||||
prosperity = math.Log2(2+prosperity) + 1
|
||||
|
||||
// URL 本身的质量惩罚(过长、路径过深、使用 .php/.htm 等)
|
||||
bad := badURL(lw.URL)
|
||||
return (0.1 + chineseness) * math.Min(0.05+interest, 0.05+interest2) * quality * (1 - bad) * lw.Weight * prosperity
|
||||
}
|
||||
|
||||
// ---- helper functions ----
|
||||
// ---- 辅助函数 ----
|
||||
|
||||
// netloc 从原始 URL 字符串提取主机名(不含路径)。
|
||||
// 支持 http:// 和 https:// 前缀,自动处理 URL 解析异常。
|
||||
func netloc(rawURL string) string {
|
||||
parts := strings.SplitN(rawURL, "/", 4)
|
||||
if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" {
|
||||
@@ -410,7 +445,8 @@ func netloc(rawURL string) string {
|
||||
return u.Host
|
||||
}
|
||||
|
||||
// superNetloc returns "domain.tld" (strips subdomains).
|
||||
// superNetloc 返回顶级域名(去除子域名),例如 "www.example.com" → "example.com"。
|
||||
// 用于识别跨子域名但同主站的情况。
|
||||
func superNetloc(rawURL string) string {
|
||||
host := netloc(rawURL)
|
||||
parts := strings.Split(host, ".")
|
||||
@@ -420,20 +456,26 @@ func superNetloc(rawURL string) string {
|
||||
return host
|
||||
}
|
||||
|
||||
// badURL returns a penalty in [0, 0.9] describing how unattractive a URL
// looks for crawling; a higher value means a worse URL. Each rule after the
// length penalty moves the score a fixed fraction of the remaining distance
// toward 1, so penalties compound instead of adding linearly.
func badURL(u string) float64 {
	// Length penalty: zero up to 30 bytes, then growing by 1/200 per extra byte.
	s := math.Max(0, float64(len(u)-30)/200.0)

	// URLs that contain ".htm" (which also covers ".html") or ".php".
	if strings.Contains(u, ".htm") || strings.Contains(u, ".php") {
		s += (1 - s) * 0.3
	}

	// Deep paths: more than two slashes after trailing slashes are trimmed.
	// The "//" after the scheme accounts for two, so any path segment counts.
	if strings.Count(strings.TrimRight(u, "/"), "/") > 2 {
		s += (1 - s) * 0.1
	}

	// Very short strings, or a ':' at byte index 4. For web URLs that index
	// matches the "http:" scheme, whereas "https:" has its colon at index 5,
	// so plain-HTTP URLs are penalized here.
	if len(u) < 5 || u[4] == ':' {
		s += (1 - s) * 0.3
	}

	// Cap the penalty so callers computing (1 - bad) never get zero or less.
	return math.Min(s, 0.9)
}
|
||||
|
||||
// truncate 将字符串截断到最多 n 个字符。
|
||||
func truncate(s string, n int) string {
|
||||
if len(s) <= n {
|
||||
return s
|
||||
@@ -441,6 +483,7 @@ func truncate(s string, n int) string {
|
||||
return s[:n]
|
||||
}
|
||||
|
||||
// sampleStrings 从字符串切片中随机不重复抽取 n 条。
|
||||
func sampleStrings(s []string, n int) []string {
|
||||
if len(s) <= n {
|
||||
return s
|
||||
@@ -453,6 +496,7 @@ func sampleStrings(s []string, n int) []string {
|
||||
return out
|
||||
}
|
||||
|
||||
// sampleURLWeights 与 sampleStrings 相同,但处理 URLWeight 切片。
|
||||
func sampleURLWeights(s []URLWeight, n int) []URLWeight {
|
||||
if len(s) <= n {
|
||||
return s
|
||||
@@ -465,11 +509,14 @@ func sampleURLWeights(s []URLWeight, n int) []URLWeight {
|
||||
return out
|
||||
}
|
||||
|
||||
// scoredURL is an internal pair of a candidate URL and the scheduling score
// computed for it; weightedSample uses the score as the sampling weight.
type scoredURL struct {
	url   string
	score float64
}
|
||||
|
||||
// weightedSample 加权随机采样(不放回):从 scoredURL 列表中按权重概率抽取最多 k 条。
|
||||
// 使用累积概率法近似 alias method(适合中等规模数据)。
|
||||
func weightedSample(items []scoredURL, k int) []string {
|
||||
if k >= len(items) {
|
||||
out := make([]string, len(items))
|
||||
@@ -478,7 +525,6 @@ func weightedSample(items []scoredURL, k int) []string {
|
||||
}
|
||||
return out
|
||||
}
|
||||
// Simple weighted sampling without replacement using alias method approximation
|
||||
totalWeight := 0.0
|
||||
for _, s := range items {
|
||||
totalWeight += s.score
|
||||
@@ -486,6 +532,7 @@ func weightedSample(items []scoredURL, k int) []string {
|
||||
selected := make(map[int]bool)
|
||||
out := make([]string, 0, k)
|
||||
for len(out) < k && len(selected) < len(items) {
|
||||
// 随机取 [0, totalWeight) 区间的一个点
|
||||
r := rand.Float64() * totalWeight
|
||||
cum := 0.0
|
||||
for i, s := range items {
|
||||
@@ -496,7 +543,7 @@ func weightedSample(items []scoredURL, k int) []string {
|
||||
if cum >= r {
|
||||
selected[i] = true
|
||||
out = append(out, s.url)
|
||||
totalWeight -= s.score
|
||||
totalWeight -= s.score // 被选中后从总权重中移除(不放回)
|
||||
break
|
||||
}
|
||||
}
|
||||
@@ -504,7 +551,10 @@ func weightedSample(items []scoredURL, k int) []string {
|
||||
return out
|
||||
}
|
||||
|
||||
// concentrationFilter 域名集中度过滤。
|
||||
// 按 CrawlFocus 因子限制每个顶级域名被选中的 URL 数量,防止爬取过于集中在少数网站。
|
||||
func concentrationFilter(urls []string, k float64) []string {
|
||||
// 按顶级域名分组
|
||||
domainGroups := make(map[string][]string)
|
||||
shuffled := make([]string, len(urls))
|
||||
copy(shuffled, urls)
|
||||
@@ -515,13 +565,14 @@ func concentrationFilter(urls []string, k float64) []string {
|
||||
domainGroups[d] = append(domainGroups[d], u)
|
||||
}
|
||||
|
||||
// 计算每组保留上限:域名规模越大允许越多,但按 k 次幂压制
|
||||
limit := 10
|
||||
if len(domainGroups) > 1 {
|
||||
sizes := make([]int, 0, len(domainGroups))
|
||||
for _, g := range domainGroups {
|
||||
sizes = append(sizes, int(math.Pow(float64(len(g)), k)))
|
||||
}
|
||||
// sort sizes ascending, drop last (largest)
|
||||
// 升序排列,去除最大一项,用其余项总和的 60% 作为全局上限
|
||||
for i := 0; i < len(sizes)-1; i++ {
|
||||
for j := i + 1; j < len(sizes)-1; j++ {
|
||||
if sizes[j] < sizes[i] {
|
||||
@@ -536,6 +587,7 @@ func concentrationFilter(urls []string, k float64) []string {
|
||||
limit = max(10, int(float64(total)*0.6))
|
||||
}
|
||||
|
||||
// 从每组中按计算的上限采样
|
||||
var result []string
|
||||
for _, g := range domainGroups {
|
||||
sn := 1 + min(limit, int(math.Pow(float64(len(g)), k)))
|
||||
@@ -548,6 +600,7 @@ func concentrationFilter(urls []string, k float64) []string {
|
||||
return result
|
||||
}
|
||||
|
||||
// truncateMap 将 map 裁剪到最多 n 条(取前 n 条,无特定顺序)。
|
||||
func truncateMap(m map[string]string, n int) map[string]string {
|
||||
if len(m) <= n {
|
||||
return m
|
||||
@@ -564,6 +617,7 @@ func truncateMap(m map[string]string, n int) map[string]string {
|
||||
return out
|
||||
}
|
||||
|
||||
// min 返回两个整数中的较小值。
|
||||
func min(a, b int) int {
|
||||
if a < b {
|
||||
return a
|
||||
@@ -571,6 +625,7 @@ func min(a, b int) int {
|
||||
return b
|
||||
}
|
||||
|
||||
// max 返回两个整数中的较大值。
|
||||
func max(a, b int) int {
|
||||
if a > b {
|
||||
return a
|
||||
@@ -578,7 +633,7 @@ func max(a, b int) int {
|
||||
return b
|
||||
}
|
||||
|
||||
// Expose Stats for monitoring.
|
||||
// GetStats 返回当前爬虫统计快照(用于监控)。
|
||||
func (c *Crawler) GetStats() Stats {
|
||||
return Stats{
|
||||
VisitedURLs: atomic.LoadInt64(&c.stats.VisitedURLs),
|
||||
|
||||
+86
-53
@@ -1,64 +1,71 @@
|
||||
// Package crawler implements the HTTP fetching layer with robots.txt compliance,
|
||||
// per-host rate limiting, redirect tracking, and encoding detection.
|
||||
// crawler 包负责 HTTP 请求层:遵守 robots.txt、主机限流、追踪重定向、自动检测字符集。
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
"fmt" // 字符串格式化(构建 robots.txt URL、错误信息)
|
||||
"io" // IO 接口(读取响应体)
|
||||
"net/http" // HTTP 客户端
|
||||
"net/url" // URL 解析
|
||||
"strings" // 字符串操作
|
||||
"sync" // 互斥锁(保护限流表和 robots.txt 缓存)
|
||||
"time" // 时间(限流间隔计算、robots.txt 缓存过期)
|
||||
|
||||
"golang.org/x/net/html/charset"
|
||||
"golang.org/x/net/html/charset" // HTML 字符集自动检测(将各种编码转为 UTF-8)
|
||||
)
|
||||
|
||||
// ErrCrawl is returned for expected crawl failures such as an invalid URL,
// a 404 or other HTTP error status, or a non-HTML content type.
// FetchSafe treats an error of this type as "nothing to fetch" and reports
// (nil, nil) to its caller.
type ErrCrawl struct {
	Msg string // human-readable description of the failure
}

// Error implements the error interface by returning Msg.
func (e *ErrCrawl) Error() string { return e.Msg }
|
||||
|
||||
// FetchResult bundles the data produced by a successful fetch.
type FetchResult struct {
	Body       string            // response body as returned by decodeBody
	FinalURL   string            // URL of the request that produced the response, after redirects
	Redirects  map[string]string // permanent (301/308) redirect hops: redirecting URL → redirect target
	ServerType string            // value of the response's Server header (may be empty)
}
|
||||
|
||||
// Fetcher is a reusable HTTP client with robots.txt awareness and rate limiting.
|
||||
// Fetcher 是一个可复用的 HTTP 客户端,内置 robots.txt 合规检查和按主机限流。
|
||||
type Fetcher struct {
|
||||
client *http.Client
|
||||
userAgent string
|
||||
cooldown time.Duration
|
||||
client *http.Client // HTTP 客户端(包含重定向和超时控制)
|
||||
userAgent string // HTTP 请求的 User-Agent 头
|
||||
cooldown time.Duration // 同一主机相邻两次请求的最小间隔
|
||||
|
||||
rateMu sync.Mutex
|
||||
lastHit map[string]time.Time // host → last request time
|
||||
rateMu sync.Mutex // 保护 lastHit 限流表的互斥锁
|
||||
lastHit map[string]time.Time // 主机名 → 上次请求时间(用于计算限流等待)
|
||||
|
||||
robotsMu sync.Mutex
|
||||
robots map[string]*robotsEntry // host → parsed robots
|
||||
robotsMu sync.Mutex // 保护 robots 缓存的互斥锁
|
||||
robots map[string]*robotsEntry // 主机名 → 该主机的 robots.txt 解析结果(含缓存时间)
|
||||
}
|
||||
|
||||
// robotsEntry 缓存单台主机的 robots.txt 解析结果。
|
||||
type robotsEntry struct {
|
||||
rules []robotsRule
|
||||
fetchedAt time.Time
|
||||
rules []robotsRule // 解析后的规则列表
|
||||
fetchedAt time.Time // 缓存时间(用于判断是否过期,24h 后重新抓取)
|
||||
}
|
||||
|
||||
// robotsRule holds the directives of one User-agent group in a robots.txt file.
type robotsRule struct {
	userAgent string   // agent the group applies to; "*" matches every crawler
	disallow  []string // Disallow path prefixes
	allow     []string // Allow path prefixes; robotsAllowed checks these before disallow
}
|
||||
|
||||
// NewFetcher creates a Fetcher with the given user-agent and per-host cooldown.
|
||||
// NewFetcher 创建一个新的 Fetcher 实例。
|
||||
// userAgent:发出的 HTTP 请求的 User-Agent;cooldown:同一主机相邻请求的最小间隔。
|
||||
func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
|
||||
return &Fetcher{
|
||||
client: &http.Client{
|
||||
Timeout: 30 * time.Second,
|
||||
Timeout: 30 * time.Second, // 默认单次请求超时 30 秒
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
// 跟随重定向最多 10 次,防止重定向循环
|
||||
if len(via) >= 10 {
|
||||
return fmt.Errorf("too many redirects")
|
||||
}
|
||||
@@ -67,34 +74,37 @@ func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
|
||||
},
|
||||
userAgent: userAgent,
|
||||
cooldown: cooldown,
|
||||
lastHit: make(map[string]time.Time),
|
||||
robots: make(map[string]*robotsEntry),
|
||||
lastHit: make(map[string]time.Time), // 限流表初始化
|
||||
robots: make(map[string]*robotsEntry), // robots.txt 缓存初始化
|
||||
}
|
||||
}
|
||||
|
||||
// Fetch fetches url, respecting robots.txt and rate limits.
|
||||
// polite=false skips both checks (used by search server snippet fetcher).
|
||||
// Fetch 抓取指定 URL,遵守 robots.txt 和主机限流。
|
||||
// polite=false 时跳过 robots.txt 检查和限流(用于搜索服务在线抓摘要)。
|
||||
func (f *Fetcher) Fetch(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
|
||||
return f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
|
||||
}
|
||||
|
||||
// FetchSafe wraps Fetch and returns (nil, nil) on expected errors.
|
||||
// FetchSafe 封装 Fetch,在遇到预期爬取错误(404/disallowed/非 HTML)时返回 (nil, nil)。
|
||||
// 调用方无需区分错误类型,直接跳过即可。
|
||||
func (f *Fetcher) FetchSafe(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
|
||||
res, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
|
||||
if _, ok := err.(*ErrCrawl); ok {
|
||||
return nil, nil
|
||||
return nil, nil // 预期错误,静默丢弃
|
||||
}
|
||||
return res, err
|
||||
}
|
||||
|
||||
// fetchWithHistory does the actual request and populates redirect history.
|
||||
// fetchWithHistory 执行实际 HTTP 请求,追踪永久重定向。
|
||||
func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
|
||||
// 解析 URL 提取主机名
|
||||
parsed, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
|
||||
}
|
||||
host := parsed.Host
|
||||
|
||||
// polite 模式:先限流,再检查 robots.txt
|
||||
if polite {
|
||||
f.rateLimit(host)
|
||||
if !f.robotsAllowed(rawURL, host) {
|
||||
@@ -102,6 +112,7 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura
|
||||
}
|
||||
}
|
||||
|
||||
// 追踪永久重定向(301/308)
|
||||
redirects := make(map[string]string)
|
||||
client := &http.Client{
|
||||
Timeout: timeout,
|
||||
@@ -109,6 +120,7 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura
|
||||
if len(via) >= 10 {
|
||||
return fmt.Errorf("too many redirects")
|
||||
}
|
||||
// 记录永久重定向
|
||||
if req.Response != nil && (req.Response.StatusCode == 301 || req.Response.StatusCode == 308) {
|
||||
from := via[len(via)-1].URL.String()
|
||||
to := req.URL.String()
|
||||
@@ -118,26 +130,32 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura
|
||||
},
|
||||
}
|
||||
|
||||
// 构造 GET 请求
|
||||
req, _ := http.NewRequest("GET", rawURL, nil)
|
||||
req.Header.Set("User-Agent", f.userAgent)
|
||||
|
||||
// 发送请求
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
defer resp.Body.Close() // 读取完毕后关闭响应体
|
||||
|
||||
// 检查 HTTP 状态码
|
||||
if resp.StatusCode == 404 {
|
||||
return nil, &ErrCrawl{Msg: "404 not found"}
|
||||
}
|
||||
if resp.StatusCode >= 400 {
|
||||
return nil, &ErrCrawl{Msg: fmt.Sprintf("HTTP %d", resp.StatusCode)}
|
||||
}
|
||||
|
||||
// 检查 Content-Type,必须是 HTML 才继续
|
||||
ct := resp.Header.Get("Content-Type")
|
||||
if !strings.Contains(ct, "text/html") {
|
||||
return nil, &ErrCrawl{Msg: "not html: " + ct}
|
||||
}
|
||||
|
||||
// 解码响应体(自动检测字符集转为 UTF-8)
|
||||
body, err := decodeBody(resp.Body, ct, sizeLimit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
@@ -145,19 +163,20 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura
|
||||
|
||||
return &FetchResult{
|
||||
Body: body,
|
||||
FinalURL: resp.Request.URL.String(),
|
||||
Redirects: redirects,
|
||||
FinalURL: resp.Request.URL.String(), // 重定向后的最终 URL
|
||||
Redirects: redirects,
|
||||
ServerType: resp.Header.Get("Server"),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// rateLimit sleeps if the last request to host was too recent.
|
||||
// rateLimit 检查并强制执行主机限流:若距上次请求不足 cooldown 秒则 sleep 等待。
|
||||
func (f *Fetcher) rateLimit(host string) {
|
||||
f.rateMu.Lock()
|
||||
last, ok := f.lastHit[host]
|
||||
now := time.Now()
|
||||
f.lastHit[host] = now
|
||||
// Periodically prune the map
|
||||
|
||||
// 限流表超过 10000 条时清理两倍 cooldown 时间之前的过期项,防止内存泄漏
|
||||
if len(f.lastHit) > 10000 {
|
||||
cutoff := now.Add(-f.cooldown * 2)
|
||||
for k, v := range f.lastHit {
|
||||
@@ -168,6 +187,7 @@ func (f *Fetcher) rateLimit(host string) {
|
||||
}
|
||||
f.rateMu.Unlock()
|
||||
|
||||
// 计算需要等待的时间
|
||||
if ok {
|
||||
elapsed := now.Sub(last)
|
||||
if elapsed < f.cooldown {
|
||||
@@ -176,12 +196,14 @@ func (f *Fetcher) rateLimit(host string) {
|
||||
}
|
||||
}
|
||||
|
||||
// robotsAllowed returns true if rawURL is crawlable.
|
||||
// robotsAllowed 根据 robots.txt 规则判断某 URL 是否允许爬取。
|
||||
func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
|
||||
// 尝试从缓存读取(加锁保护)
|
||||
f.robotsMu.Lock()
|
||||
entry, ok := f.robots[host]
|
||||
f.robotsMu.Unlock()
|
||||
|
||||
// 缓存不存在或已过期(超过 24 小时)则重新抓取并解析
|
||||
if !ok || time.Since(entry.fetchedAt) > 24*time.Hour {
|
||||
entry = f.fetchRobots(host, rawURL)
|
||||
f.robotsMu.Lock()
|
||||
@@ -189,6 +211,7 @@ func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
|
||||
f.robotsMu.Unlock()
|
||||
}
|
||||
|
||||
// 解析 URL 路径
|
||||
parsed, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return false
|
||||
@@ -198,43 +221,47 @@ func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
|
||||
path = "/"
|
||||
}
|
||||
|
||||
// 遍历所有规则,找到适用的 User-Agent
|
||||
for _, rule := range entry.rules {
|
||||
if rule.userAgent != "*" && !strings.EqualFold(rule.userAgent, f.userAgent) {
|
||||
continue
|
||||
}
|
||||
// Check allow first (higher priority)
|
||||
// Allow 优先检查(更高优先级)
|
||||
for _, a := range rule.allow {
|
||||
if strings.HasPrefix(path, a) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
// 再检查 Disallow
|
||||
for _, dis := range rule.disallow {
|
||||
if dis != "" && strings.HasPrefix(path, dis) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
return true // 默认允许
|
||||
}
|
||||
|
||||
// fetchRobots downloads and parses robots.txt for a host.
|
||||
// fetchRobots 抓取并解析某主机的 robots.txt 文件。
|
||||
func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
|
||||
entry := &robotsEntry{fetchedAt: time.Now()}
|
||||
entry := &robotsEntry{fetchedAt: time.Now()} // 初始化空条目(抓取失败时默认允许全部)
|
||||
scheme := "https"
|
||||
if strings.HasPrefix(exampleURL, "http://") {
|
||||
scheme = "http"
|
||||
}
|
||||
robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)
|
||||
|
||||
// robots.txt 单独请求,超时 5 秒
|
||||
client := &http.Client{Timeout: 5 * time.Second}
|
||||
req, _ := http.NewRequest("GET", robotsURL, nil)
|
||||
req.Header.Set("User-Agent", f.userAgent)
|
||||
resp, err := client.Do(req)
|
||||
if err != nil || resp.StatusCode != 200 {
|
||||
return entry // allow all if robots.txt unavailable
|
||||
return entry // robots.txt 不可用时默认允许爬取
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
// 最多读取 256KB(大部分 robots.txt 远小于此大小)
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
|
||||
if err != nil {
|
||||
return entry
|
||||
@@ -243,16 +270,19 @@ func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
|
||||
return entry
|
||||
}
|
||||
|
||||
// parseRobots is a minimal robots.txt parser.
|
||||
// parseRobots 最小化 robots.txt 解析器。
|
||||
// 支持 User-agent、Disallow、Allow 三种指令,忽略注释和空行。
|
||||
func parseRobots(content string) []robotsRule {
|
||||
var rules []robotsRule
|
||||
var current *robotsRule
|
||||
for _, line := range strings.Split(content, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
// 去除行内注释
|
||||
if idx := strings.Index(line, "#"); idx >= 0 {
|
||||
line = line[:idx]
|
||||
}
|
||||
if line == "" {
|
||||
// 空行结束当前块
|
||||
if current != nil {
|
||||
rules = append(rules, *current)
|
||||
current = nil
|
||||
@@ -267,6 +297,7 @@ func parseRobots(content string) []robotsRule {
|
||||
val := strings.TrimSpace(parts[1])
|
||||
switch key {
|
||||
case "user-agent":
|
||||
// 新建一个 User-Agent 块
|
||||
if current == nil {
|
||||
current = &robotsRule{userAgent: val}
|
||||
} else {
|
||||
@@ -282,23 +313,25 @@ func parseRobots(content string) []robotsRule {
|
||||
}
|
||||
}
|
||||
}
|
||||
// 最后一个块
|
||||
if current != nil {
|
||||
rules = append(rules, *current)
|
||||
}
|
||||
return rules
|
||||
}
|
||||
|
||||
// decodeBody reads at most sizeLimit bytes from r, auto-detecting charset.
|
||||
// decodeBody 从响应体读取最多 sizeLimit 字节,自动检测字符集并转为 UTF-8 字符串。
|
||||
// sizeLimit <= 0 时不限制大小。
|
||||
func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error) {
|
||||
var reader io.Reader = r
|
||||
if sizeLimit > 0 {
|
||||
reader = io.LimitReader(r, int64(sizeLimit))
|
||||
reader = io.LimitReader(r, int64(sizeLimit)) // 限制读取字节数,防止大文件撑爆内存
|
||||
}
|
||||
|
||||
// Use golang.org/x/net/html/charset for auto-detection
|
||||
// 使用 golang.org/x/net/html/charset 自动检测 HTML 编码并转为 UTF-8
|
||||
utf8Reader, err := charset.NewReader(reader, contentType)
|
||||
if err != nil {
|
||||
// Fall back to reading raw and hoping for UTF-8
|
||||
// 备选方案:直接以 UTF-8 读取(可能乱码但不崩溃)
|
||||
data, readErr := io.ReadAll(reader)
|
||||
if readErr != nil {
|
||||
return "", readErr
|
||||
|
||||
Reference in New Issue
Block a user