up
This commit is contained in:
+13
-13
@@ -45,8 +45,8 @@ const (
|
|||||||
|
|
||||||
// Priority Worker 配置
|
// Priority Worker 配置
|
||||||
const (
|
const (
|
||||||
priorityMaxWorkers = 50 // Priority 独立 goroutine 上限(突破主 workers)
|
priorityMaxWorkers = 50 // Priority 独立 goroutine 上限(突破主 workers)
|
||||||
priorityQueueSize = 100 // Priority 任务队列缓冲大小
|
priorityQueueSize = 100 // Priority 任务队列缓冲大小
|
||||||
)
|
)
|
||||||
|
|
||||||
// Crawler 编排整个 BFS 爬取流程。
|
// Crawler 编排整个 BFS 爬取流程。
|
||||||
@@ -70,10 +70,10 @@ type Crawler struct {
|
|||||||
activeWorkers int64
|
activeWorkers int64
|
||||||
|
|
||||||
// ---- Priority Worker(独立 goroutine,不受主 workers 限制)----
|
// ---- Priority Worker(独立 goroutine,不受主 workers 限制)----
|
||||||
priorityCh chan string // Priority URL 任务队列
|
priorityCh chan string // Priority URL 任务队列
|
||||||
prioritySem chan struct{} // Priority 信号量(上限 priorityMaxWorkers)
|
prioritySem chan struct{} // Priority 信号量(上限 priorityMaxWorkers)
|
||||||
priorityWg sync.WaitGroup // 等待所有 Priority goroutine 结束
|
priorityWg sync.WaitGroup // 等待所有 Priority goroutine 结束
|
||||||
priorityMu sync.RWMutex // 保护 priorityStats
|
priorityMu sync.RWMutex // 保护 priorityStats
|
||||||
priorityStats struct {
|
priorityStats struct {
|
||||||
pending int64 // 待处理的 Priority URL 数量(入队但未开始)
|
pending int64 // 待处理的 Priority URL 数量(入队但未开始)
|
||||||
active int64 // 正在处理的 Priority URL 数量
|
active int64 // 正在处理的 Priority URL 数量
|
||||||
@@ -113,12 +113,12 @@ func GlobalPriorityStatus() map[string]interface{} {
|
|||||||
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
||||||
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||||
c := &Crawler{
|
c := &Crawler{
|
||||||
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
|
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
|
||||||
db: db,
|
db: db,
|
||||||
analyzer: a,
|
analyzer: a,
|
||||||
prosperMap: prosperMap,
|
prosperMap: prosperMap,
|
||||||
visited: make(map[string]bool),
|
visited: make(map[string]bool),
|
||||||
priorityCh: make(chan string, priorityQueueSize),
|
priorityCh: make(chan string, priorityQueueSize),
|
||||||
prioritySem: make(chan struct{}, priorityMaxWorkers),
|
prioritySem: make(chan struct{}, priorityMaxWorkers),
|
||||||
}
|
}
|
||||||
// 启动 Priority Worker(独立 goroutine,不受主 workers 限制)
|
// 启动 Priority Worker(独立 goroutine,不受主 workers 限制)
|
||||||
@@ -425,7 +425,7 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
|
|||||||
oldEntry, _ := c.db.GetSnippet(res.FinalURL)
|
oldEntry, _ := c.db.GetSnippet(res.FinalURL)
|
||||||
if oldEntry != nil && oldEntry.ContentHash != "" && oldEntry.ContentHash == contentHash {
|
if oldEntry != nil && oldEntry.ContentHash != "" && oldEntry.ContentHash == contentHash {
|
||||||
isRecrawl = true
|
isRecrawl = true
|
||||||
log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL)
|
//log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL)
|
||||||
}
|
}
|
||||||
|
|
||||||
// 缓存 URL 摘要(仅对短 URL 缓存,防止超长 URL 浪费空间)
|
// 缓存 URL 摘要(仅对短 URL 缓存,防止超长 URL 浪费空间)
|
||||||
|
|||||||
Reference in New Issue
Block a user