优化优先连接逻辑
This commit is contained in:
+78
-15
@@ -78,6 +78,10 @@ type Crawler struct {
|
||||
pending int64 // 待处理的 Priority URL 数量(入队但未开始)
|
||||
active int64 // 正在处理的 Priority URL 数量
|
||||
}
|
||||
|
||||
// ---- Priority 子链接优先队列(来自 priority worker 的子链接会优先爬取)----
|
||||
priorityChildrenMu sync.Mutex
|
||||
priorityChildren []string // Priority URL 产生的子链接(优先处理)
|
||||
}
|
||||
|
||||
// 全局活跃线程计数器(跨包可读,无需持有 Crawler 引用)
|
||||
@@ -103,12 +107,26 @@ var globalPriorityStatus struct {
|
||||
// GlobalPriorityStatus 返回当前全局 Priority Worker 状态。
|
||||
func GlobalPriorityStatus() map[string]interface{} {
|
||||
return map[string]interface{}{
|
||||
"pending": atomic.LoadInt64(&globalPriorityStatus.pending),
|
||||
"active": atomic.LoadInt64(&globalPriorityStatus.active),
|
||||
"max_workers": priorityMaxWorkers,
|
||||
"pending": atomic.LoadInt64(&globalPriorityStatus.pending),
|
||||
"active": atomic.LoadInt64(&globalPriorityStatus.active),
|
||||
"max_workers": priorityMaxWorkers,
|
||||
"children_queue": atomic.LoadInt64(&globalPriorityChildren),
|
||||
}
|
||||
}
|
||||
|
||||
// globalPriorityChildren is the package-level length counter for the
// priority child-link queue, shared across Crawler instances. It must only
// be read and written through sync/atomic.
var globalPriorityChildren int64

// IncrementPriorityChildren atomically adds n to the global priority
// child-link counter.
func IncrementPriorityChildren(n int64) {
	atomic.AddInt64(&globalPriorityChildren, n)
}

// DecrementPriorityChildren atomically subtracts n from the global priority
// child-link counter.
//
// No lower bound is enforced: subtracting more than was previously added
// leaves the counter negative, so callers should decrement by the same
// amounts they incremented.
func DecrementPriorityChildren(n int64) {
	atomic.AddInt64(&globalPriorityChildren, -n)
}
|
||||
|
||||
// New 创建一个 Crawler 实例。
|
||||
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
||||
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||
@@ -225,7 +243,18 @@ func (c *Crawler) runPriorityWorker() {
|
||||
// 直接调用 visitURL,绕过队列调度
|
||||
hrefs := c.visitURL(rawURL)
|
||||
|
||||
// 收集的子链接正常进入 BFS 队列(由调用方处理,这里只负责爬取本身)
|
||||
// 将子链接加入优先队列(保持优先级)
|
||||
if len(hrefs) > 0 {
|
||||
c.priorityChildrenMu.Lock()
|
||||
for _, child := range hrefs {
|
||||
if !c.isVisited(child) {
|
||||
c.priorityChildren = append(c.priorityChildren, child)
|
||||
}
|
||||
}
|
||||
IncrementPriorityChildren(int64(len(hrefs)))
|
||||
c.priorityChildrenMu.Unlock()
|
||||
}
|
||||
|
||||
log.Printf("[crawler] priority crawl done: %s (%d child links)", rawURL, len(hrefs))
|
||||
|
||||
// 清理已访问的 priority URL(防止重复爬取)
|
||||
@@ -312,6 +341,17 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
||||
// 每轮 epoch 从 config 读取最新 workers 值,支持运行时动态调整
|
||||
workers := config.CrawlerWorkers()
|
||||
|
||||
// ---- 优先处理 priorityChildren 队列(来自 priority worker 的子链接)----
|
||||
var priorityQueue []string
|
||||
c.priorityChildrenMu.Lock()
|
||||
if len(c.priorityChildren) > 0 {
|
||||
priorityQueue = c.priorityChildren
|
||||
// 更新全局计数器:这些 URL 即将被处理
|
||||
DecrementPriorityChildren(int64(len(priorityQueue)))
|
||||
log.Printf("[crawler] epoch %d/%d processing %d priority children first", ep+1, maxEpoch, len(priorityQueue))
|
||||
}
|
||||
c.priorityChildrenMu.Unlock()
|
||||
|
||||
// 每轮开始前:拉取 priority URLs,插入队列前端
|
||||
priorityAdded := c.fetchAndApplyPriorityURLs(&queue)
|
||||
if priorityAdded > 0 {
|
||||
@@ -324,7 +364,7 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
||||
c.markVisited(u)
|
||||
}
|
||||
|
||||
// 并发抓取本轮所有 URL
|
||||
// ---- 并发抓取本轮所有 URL ----
|
||||
var (
|
||||
newLinks []URLWeight // 收集下一轮候选 URL
|
||||
mu sync.Mutex // 保护 newLinks 的并发写入
|
||||
@@ -333,6 +373,7 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
||||
|
||||
// 信号量:限制同时并发数(使用上方读取的 workers 值)
|
||||
sem := make(chan struct{}, workers)
|
||||
|
||||
for _, u := range queue {
|
||||
wg.Add(1)
|
||||
sem <- struct{}{} // 获取一个令牌(阻塞直到有空闲槽位)
|
||||
@@ -347,21 +388,43 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
||||
// 抓取单个 URL,返回发现的子链接
|
||||
hrefs := c.visitURL(rawURL)
|
||||
n := len(hrefs)
|
||||
if n > 0 {
|
||||
// 每个子链接分得 1/n 的父页面权重
|
||||
w := 1.0 / float64(n)
|
||||
mu.Lock()
|
||||
for _, h := range hrefs {
|
||||
if !c.isVisited(h) {
|
||||
newLinks = append(newLinks, URLWeight{URL: h, Weight: w})
|
||||
}
|
||||
}
|
||||
mu.Unlock()
|
||||
if n == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// 收集未访问的子链接
|
||||
var children []string
|
||||
for _, h := range hrefs {
|
||||
if !c.isVisited(h) {
|
||||
children = append(children, h)
|
||||
}
|
||||
}
|
||||
if len(children) == 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// 分配权重
|
||||
w := 1.0 / float64(n)
|
||||
|
||||
// 孙链接(来自 priorityChildren)爬取后,子链接进入正常 BFS 队列(不再优先传递)
|
||||
// 所有子链接统一进入 newLinks,经过 schedule() 调度
|
||||
mu.Lock()
|
||||
for _, h := range children {
|
||||
newLinks = append(newLinks, URLWeight{URL: h, Weight: w})
|
||||
}
|
||||
mu.Unlock()
|
||||
}(u)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
// ---- 清空本轮已处理的 priorityQueue ----
|
||||
c.priorityChildrenMu.Lock()
|
||||
if len(priorityQueue) > 0 {
|
||||
c.priorityChildren = c.priorityChildren[len(priorityQueue):]
|
||||
// 计数器已在提取时扣除,这里不需要额外操作
|
||||
}
|
||||
c.priorityChildrenMu.Unlock()
|
||||
|
||||
// 本轮没有发现新链接,爬取结束
|
||||
if len(newLinks) == 0 {
|
||||
log.Println("[crawler] empty queue — stopping")
|
||||
|
||||
Reference in New Issue
Block a user