This commit is contained in:
2026-04-10 13:22:35 +08:00
parent 22d0b72ee5
commit 8e4cdaca47
+13 -13
View File
@@ -45,8 +45,8 @@ const (
// Priority worker configuration.
//
// NOTE(review): each constant appears twice below because this text is a
// rendered diff hunk (removed line followed by added line). The visible text
// of each pair is identical, so the change looks whitespace-only — confirm
// against the repository before treating this as real source.
const (
priorityMaxWorkers = 50 // Upper bound on dedicated Priority goroutines (these are not limited by the main workers).
priorityQueueSize = 100 // Buffer size of the Priority task queue.
priorityMaxWorkers = 50 // Upper bound on dedicated Priority goroutines (these are not limited by the main workers).
priorityQueueSize = 100 // Buffer size of the Priority task queue.
)
// Crawler 编排整个 BFS 爬取流程。
@@ -70,10 +70,10 @@ type Crawler struct {
activeWorkers int64
// ---- Priority Worker(独立 goroutine,不受主 workers 限制)----
priorityCh chan string // Priority URL 任务队列
prioritySem chan struct{} // Priority 信号量(上限 priorityMaxWorkers
priorityWg sync.WaitGroup // 等待所有 Priority goroutine 结束
priorityMu sync.RWMutex // 保护 priorityStats
priorityCh chan string // Priority URL 任务队列
prioritySem chan struct{} // Priority 信号量(上限 priorityMaxWorkers
priorityWg sync.WaitGroup // 等待所有 Priority goroutine 结束
priorityMu sync.RWMutex // 保护 priorityStats
priorityStats struct {
pending int64 // 待处理的 Priority URL 数量(入队但未开始)
active int64 // 正在处理的 Priority URL 数量
@@ -113,12 +113,12 @@ func GlobalPriorityStatus() map[string]interface{} {
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
c := &Crawler{
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
db: db,
analyzer: a,
prosperMap: prosperMap,
visited: make(map[string]bool),
priorityCh: make(chan string, priorityQueueSize),
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
db: db,
analyzer: a,
prosperMap: prosperMap,
visited: make(map[string]bool),
priorityCh: make(chan string, priorityQueueSize),
prioritySem: make(chan struct{}, priorityMaxWorkers),
}
// 启动 Priority Worker(独立 goroutine,不受主 workers 限制)
@@ -425,7 +425,7 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
oldEntry, _ := c.db.GetSnippet(res.FinalURL)
if oldEntry != nil && oldEntry.ContentHash != "" && oldEntry.ContentHash == contentHash {
isRecrawl = true
log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL)
//log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL)
}
// 缓存 URL 摘要(仅对短 URL 缓存,防止超长 URL 浪费空间)