优化优先链接逻辑

2026-04-10 14:28:04 +08:00
parent 8e4cdaca47
commit 71f74dd85a
3 changed files with 80 additions and 17 deletions
@@ -78,6 +78,10 @@ type Crawler struct {
 		pending int64 // 待处理的 Priority URL 数量（入队但未开始）
 		active  int64 // 正在处理的 Priority URL 数量
 	}
+
+	// ---- Priority 子链接优先队列（来自 priority worker 的子链接会优先爬取）----
+	priorityChildrenMu sync.Mutex
+	priorityChildren   []string // Priority URL 产生的子链接（优先处理）
 }

 // 全局活跃线程计数器（跨包可读，无需持有 Crawler 引用）
@@ -103,12 +107,26 @@ var globalPriorityStatus struct {
 // GlobalPriorityStatus 返回当前全局 Priority Worker 状态。
 func GlobalPriorityStatus() map[string]interface{} {
 	return map[string]interface{}{
-		"pending":     atomic.LoadInt64(&globalPriorityStatus.pending),
-		"active":      atomic.LoadInt64(&globalPriorityStatus.active),
-		"max_workers": priorityMaxWorkers,
+		"pending":         atomic.LoadInt64(&globalPriorityStatus.pending),
+		"active":          atomic.LoadInt64(&globalPriorityStatus.active),
+		"max_workers":     priorityMaxWorkers,
+		"children_queue":  atomic.LoadInt64(&globalPriorityChildren),
 	}
 }

+// globalPriorityChildren is the global length of the priority child-link queue (shared across Crawler instances).
+var globalPriorityChildren int64
+
+// IncrementPriorityChildren atomically adds n to the global priorityChildren count.
+func IncrementPriorityChildren(n int64) {
+	atomic.AddInt64(&globalPriorityChildren, n)
+}
+
+// DecrementPriorityChildren atomically subtracts n from the global priorityChildren count.
+func DecrementPriorityChildren(n int64) {
+	atomic.AddInt64(&globalPriorityChildren, -n)
+}
+
 // New 创建一个 Crawler 实例。
 // prosperMap 由 info 模块加载，传入域名繁荣值用于调度优先级计算。
 func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
@@ -225,7 +243,18 @@ func (c *Crawler) runPriorityWorker() {
 			// 直接调用 visitURL，绕过队列调度
 			hrefs := c.visitURL(rawURL)

-			// 收集的子链接正常进入 BFS 队列（由调用方处理，这里只负责爬取本身）
+			// 将子链接加入优先队列（保持优先级）
+			if len(hrefs) > 0 {
+				c.priorityChildrenMu.Lock()
+				for _, child := range hrefs {
+					if !c.isVisited(child) {
+						c.priorityChildren = append(c.priorityChildren, child)
+					}
+				}
+				IncrementPriorityChildren(int64(len(hrefs)))
+				c.priorityChildrenMu.Unlock()
+			}
+
 			log.Printf("[crawler] priority crawl done: %s (%d child links)", rawURL, len(hrefs))

 			// 清理已访问的 priority URL（防止重复爬取）
@@ -312,6 +341,17 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
 		// 每轮 epoch 从 config 读取最新 workers 值，支持运行时动态调整
 		workers := config.CrawlerWorkers()

+		// ---- 优先处理 priorityChildren 队列（来自 priority worker 的子链接）----
+		var priorityQueue []string
+		c.priorityChildrenMu.Lock()
+		if len(c.priorityChildren) > 0 {
+			priorityQueue = c.priorityChildren
+			// 更新全局计数器：这些 URL 即将被处理
+			DecrementPriorityChildren(int64(len(priorityQueue)))
+			log.Printf("[crawler] epoch %d/%d  processing %d priority children first", ep+1, maxEpoch, len(priorityQueue))
+		}
+		c.priorityChildrenMu.Unlock()
+
 		// 每轮开始前：拉取 priority URLs，插入队列前端
 		priorityAdded := c.fetchAndApplyPriorityURLs(&queue)
 		if priorityAdded > 0 {
@@ -324,7 +364,7 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
 			c.markVisited(u)
 		}

-		// 并发抓取本轮所有 URL
+		// ---- 并发抓取本轮所有 URL ----
 		var (
 			newLinks []URLWeight // 收集下一轮候选 URL
 			mu       sync.Mutex  // 保护 newLinks 的并发写入
@@ -333,6 +373,7 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {

 		// 信号量：限制同时并发数（使用上方读取的 workers 值）
 		sem := make(chan struct{}, workers)
+
 		for _, u := range queue {
 			wg.Add(1)
 			sem <- struct{}{} // 获取一个令牌（阻塞直到有空闲槽位）
@@ -347,21 +388,43 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
 				// 抓取单个 URL，返回发现的子链接
 				hrefs := c.visitURL(rawURL)
 				n := len(hrefs)
-				if n > 0 {
-					// 每个子链接分得 1/n 的父页面权重
-					w := 1.0 / float64(n)
-					mu.Lock()
-					for _, h := range hrefs {
-						if !c.isVisited(h) {
-							newLinks = append(newLinks, URLWeight{URL: h, Weight: w})
-						}
-					}
-					mu.Unlock()
+				if n == 0 {
+					return
 				}
+
+				// 收集未访问的子链接
+				var children []string
+				for _, h := range hrefs {
+					if !c.isVisited(h) {
+						children = append(children, h)
+					}
+				}
+				if len(children) == 0 {
+					return
+				}
+
+				// 分配权重
+				w := 1.0 / float64(n)
+
+				// 孙链接（来自 priorityChildren）爬取后，子链接进入正常 BFS 队列（不再优先传递）
+				// 所有子链接统一进入 newLinks，经过 schedule() 调度
+				mu.Lock()
+				for _, h := range children {
+					newLinks = append(newLinks, URLWeight{URL: h, Weight: w})
+				}
+				mu.Unlock()
 			}(u)
 		}
 		wg.Wait()

+		// ---- 清空本轮已处理的 priorityQueue ----
+		c.priorityChildrenMu.Lock()
+		if len(priorityQueue) > 0 {
+			c.priorityChildren = c.priorityChildren[len(priorityQueue):]
+			// 计数器已在提取时扣除，这里不需要额外操作
+		}
+		c.priorityChildrenMu.Unlock()
+
 		// 本轮没有发现新链接，爬取结束
 		if len(newLinks) == 0 {
 			log.Println("[crawler] empty queue — stopping")