up
This commit is contained in:
+13
-13
@@ -45,8 +45,8 @@ const (
|
||||
|
||||
// Priority Worker 配置
|
||||
const (
|
||||
priorityMaxWorkers = 50 // Priority 独立 goroutine 上限(突破主 workers)
|
||||
priorityQueueSize = 100 // Priority 任务队列缓冲大小
|
||||
priorityMaxWorkers = 50 // Priority 独立 goroutine 上限(突破主 workers)
|
||||
priorityQueueSize = 100 // Priority 任务队列缓冲大小
|
||||
)
|
||||
|
||||
// Crawler 编排整个 BFS 爬取流程。
|
||||
@@ -70,10 +70,10 @@ type Crawler struct {
|
||||
activeWorkers int64
|
||||
|
||||
// ---- Priority Worker(独立 goroutine,不受主 workers 限制)----
|
||||
priorityCh chan string // Priority URL 任务队列
|
||||
prioritySem chan struct{} // Priority 信号量(上限 priorityMaxWorkers)
|
||||
priorityWg sync.WaitGroup // 等待所有 Priority goroutine 结束
|
||||
priorityMu sync.RWMutex // 保护 priorityStats
|
||||
priorityCh chan string // Priority URL 任务队列
|
||||
prioritySem chan struct{} // Priority 信号量(上限 priorityMaxWorkers)
|
||||
priorityWg sync.WaitGroup // 等待所有 Priority goroutine 结束
|
||||
priorityMu sync.RWMutex // 保护 priorityStats
|
||||
priorityStats struct {
|
||||
pending int64 // 待处理的 Priority URL 数量(入队但未开始)
|
||||
active int64 // 正在处理的 Priority URL 数量
|
||||
@@ -113,12 +113,12 @@ func GlobalPriorityStatus() map[string]interface{} {
|
||||
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
||||
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||
c := &Crawler{
|
||||
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
|
||||
db: db,
|
||||
analyzer: a,
|
||||
prosperMap: prosperMap,
|
||||
visited: make(map[string]bool),
|
||||
priorityCh: make(chan string, priorityQueueSize),
|
||||
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
|
||||
db: db,
|
||||
analyzer: a,
|
||||
prosperMap: prosperMap,
|
||||
visited: make(map[string]bool),
|
||||
priorityCh: make(chan string, priorityQueueSize),
|
||||
prioritySem: make(chan struct{}, priorityMaxWorkers),
|
||||
}
|
||||
// 启动 Priority Worker(独立 goroutine,不受主 workers 限制)
|
||||
@@ -425,7 +425,7 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
|
||||
oldEntry, _ := c.db.GetSnippet(res.FinalURL)
|
||||
if oldEntry != nil && oldEntry.ContentHash != "" && oldEntry.ContentHash == contentHash {
|
||||
isRecrawl = true
|
||||
log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL)
|
||||
//log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL)
|
||||
}
|
||||
|
||||
// 缓存 URL 摘要(仅对短 URL 缓存,防止超长 URL 浪费空间)
|
||||
|
||||
Reference in New Issue
Block a user