This commit is contained in:
2026-04-10 13:22:35 +08:00
parent 22d0b72ee5
commit 8e4cdaca47
+13 -13
View File
@@ -45,8 +45,8 @@ const (
// Priority Worker 配置 // Priority Worker 配置
const ( const (
priorityMaxWorkers = 50 // Priority 独立 goroutine 上限(突破主 workers) priorityMaxWorkers = 50 // Priority 独立 goroutine 上限(突破主 workers)
priorityQueueSize = 100 // Priority 任务队列缓冲大小 priorityQueueSize = 100 // Priority 任务队列缓冲大小
) )
// Crawler 编排整个 BFS 爬取流程。 // Crawler 编排整个 BFS 爬取流程。
@@ -70,10 +70,10 @@ type Crawler struct {
activeWorkers int64 activeWorkers int64
// ---- Priority Worker(独立 goroutine,不受主 workers 限制)---- // ---- Priority Worker(独立 goroutine,不受主 workers 限制)----
priorityCh chan string // Priority URL 任务队列 priorityCh chan string // Priority URL 任务队列
prioritySem chan struct{} // Priority 信号量(上限 priorityMaxWorkers) prioritySem chan struct{} // Priority 信号量(上限 priorityMaxWorkers)
priorityWg sync.WaitGroup // 等待所有 Priority goroutine 结束 priorityWg sync.WaitGroup // 等待所有 Priority goroutine 结束
priorityMu sync.RWMutex // 保护 priorityStats priorityMu sync.RWMutex // 保护 priorityStats
priorityStats struct { priorityStats struct {
pending int64 // 待处理的 Priority URL 数量(入队但未开始) pending int64 // 待处理的 Priority URL 数量(入队但未开始)
active int64 // 正在处理的 Priority URL 数量 active int64 // 正在处理的 Priority URL 数量
@@ -113,12 +113,12 @@ func GlobalPriorityStatus() map[string]interface{} {
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。 // prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler { func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
c := &Crawler{ c := &Crawler{
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second), fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
db: db, db: db,
analyzer: a, analyzer: a,
prosperMap: prosperMap, prosperMap: prosperMap,
visited: make(map[string]bool), visited: make(map[string]bool),
priorityCh: make(chan string, priorityQueueSize), priorityCh: make(chan string, priorityQueueSize),
prioritySem: make(chan struct{}, priorityMaxWorkers), prioritySem: make(chan struct{}, priorityMaxWorkers),
} }
// 启动 Priority Worker(独立 goroutine,不受主 workers 限制) // 启动 Priority Worker(独立 goroutine,不受主 workers 限制)
@@ -425,7 +425,7 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
oldEntry, _ := c.db.GetSnippet(res.FinalURL) oldEntry, _ := c.db.GetSnippet(res.FinalURL)
if oldEntry != nil && oldEntry.ContentHash != "" && oldEntry.ContentHash == contentHash { if oldEntry != nil && oldEntry.ContentHash != "" && oldEntry.ContentHash == contentHash {
isRecrawl = true isRecrawl = true
log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL) //log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL)
} }
// 缓存 URL 摘要(仅对短 URL 缓存,防止超长 URL 浪费空间) // 缓存 URL 摘要(仅对短 URL 缓存,防止超长 URL 浪费空间)