From 8e4cdaca47aa0814a83ffe2c5260c5371c0e3630 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=B4=E6=96=87=E5=B3=B0?= Date: Fri, 10 Apr 2026 13:22:35 +0800 Subject: [PATCH] up --- crawler/crawler.go | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/crawler/crawler.go b/crawler/crawler.go index 8d8e136..59a1ce6 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -45,8 +45,8 @@ const ( // Priority Worker 配置 const ( - priorityMaxWorkers = 50 // Priority 独立 goroutine 上限(突破主 workers) - priorityQueueSize = 100 // Priority 任务队列缓冲大小 + priorityMaxWorkers = 50 // Priority 独立 goroutine 上限(突破主 workers) + priorityQueueSize = 100 // Priority 任务队列缓冲大小 ) // Crawler 编排整个 BFS 爬取流程。 @@ -70,10 +70,10 @@ type Crawler struct { activeWorkers int64 // ---- Priority Worker(独立 goroutine,不受主 workers 限制)---- - priorityCh chan string // Priority URL 任务队列 - prioritySem chan struct{} // Priority 信号量(上限 priorityMaxWorkers) - priorityWg sync.WaitGroup // 等待所有 Priority goroutine 结束 - priorityMu sync.RWMutex // 保护 priorityStats + priorityCh chan string // Priority URL 任务队列 + prioritySem chan struct{} // Priority 信号量(上限 priorityMaxWorkers) + priorityWg sync.WaitGroup // 等待所有 Priority goroutine 结束 + priorityMu sync.RWMutex // 保护 priorityStats priorityStats struct { pending int64 // 待处理的 Priority URL 数量(入队但未开始) active int64 // 正在处理的 Priority URL 数量 @@ -113,12 +113,12 @@ func GlobalPriorityStatus() map[string]interface{} { // prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。 func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler { c := &Crawler{ - fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second), - db: db, - analyzer: a, - prosperMap: prosperMap, - visited: make(map[string]bool), - priorityCh: make(chan string, priorityQueueSize), + fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second), + db: db, + analyzer: a, + prosperMap: prosperMap, + visited: make(map[string]bool), + priorityCh: make(chan string, priorityQueueSize), prioritySem: make(chan struct{}, priorityMaxWorkers), } // 启动 Priority Worker(独立 goroutine,不受主 workers 限制) @@ -425,7 +425,7 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) { oldEntry, _ := c.db.GetSnippet(res.FinalURL) if oldEntry != nil && oldEntry.ContentHash != "" && oldEntry.ContentHash == contentHash { isRecrawl = true - log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL) + //log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL) } // 缓存 URL 摘要(仅对短 URL 缓存,防止超长 URL 浪费空间)