优化优先链接逻辑

2026-04-10 14:28:04 +08:00
parent 8e4cdaca47
commit 71f74dd85a
3 changed files with 80 additions and 17 deletions
@@ -78,6 +78,10 @@ type Crawler struct {
 		pending int64 // 待处理的 Priority URL 数量（入队但未开始）
 		active  int64 // 正在处理的 Priority URL 数量
 	}
 	// ---- Priority 子链接优先队列（来自 priority worker 的子链接会优先爬取）----
 	priorityChildrenMu sync.Mutex
 	priorityChildren   []string // Priority URL 产生的子链接（优先处理）
 }
 // 全局活跃线程计数器（跨包可读，无需持有 Crawler 引用）
@@ -103,12 +107,26 @@ var globalPriorityStatus struct {
 // GlobalPriorityStatus returns the current global Priority Worker status.
 //
 // The returned map carries the keys "pending", "active", "max_workers" and
 // "children_queue". Each counter is read with its own atomic load, so the
 // values are not captured together under a single lock.
 func GlobalPriorityStatus() map[string]interface{} {
 	return map[string]interface{}{
-		"pending":     atomic.LoadInt64(&globalPriorityStatus.pending),
+		"pending":         atomic.LoadInt64(&globalPriorityStatus.pending),
-		"active":      atomic.LoadInt64(&globalPriorityStatus.active),
+		"active":          atomic.LoadInt64(&globalPriorityStatus.active),
-		"max_workers": priorityMaxWorkers,
+		"max_workers":     priorityMaxWorkers,
 		"children_queue":  atomic.LoadInt64(&globalPriorityChildren),
 	}
 }
 // globalPriorityChildren holds the length of the priority child-link queue.
 // It is shared across Crawler instances and is updated through sync/atomic.
 var globalPriorityChildren int64
 // IncrementPriorityChildren adds n to the shared priority child-link counter.
 func IncrementPriorityChildren(n int64) {
 	atomic.AddInt64(&globalPriorityChildren, n)
 }
 // DecrementPriorityChildren subtracts n from the shared priority child-link
 // counter. It is the mirror of IncrementPriorityChildren and performs the
 // same single atomic add with the sign of n flipped.
 func DecrementPriorityChildren(n int64) {
 	IncrementPriorityChildren(-n)
 }
 // New 创建一个 Crawler 实例。
 // prosperMap 由 info 模块加载，传入域名繁荣值用于调度优先级计算。
 func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
@@ -225,7 +243,18 @@ func (c *Crawler) runPriorityWorker() {
 			// 直接调用 visitURL，绕过队列调度
 			hrefs := c.visitURL(rawURL)
-			// 收集的子链接正常进入 BFS 队列（由调用方处理，这里只负责爬取本身）
+			// 将子链接加入优先队列（保持优先级）
 			if len(hrefs) > 0 {
 				c.priorityChildrenMu.Lock()
 				for _, child := range hrefs {
 					if !c.isVisited(child) {
 						c.priorityChildren = append(c.priorityChildren, child)
 					}
 				}
 				IncrementPriorityChildren(int64(len(hrefs)))
 				c.priorityChildrenMu.Unlock()
 			}
 			log.Printf("[crawler] priority crawl done: %s (%d child links)", rawURL, len(hrefs))
 			// 清理已访问的 priority URL（防止重复爬取）
@@ -312,6 +341,17 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
 		// 每轮 epoch 从 config 读取最新 workers 值，支持运行时动态调整
 		workers := config.CrawlerWorkers()
 		// ---- 优先处理 priorityChildren 队列（来自 priority worker 的子链接）----
 		var priorityQueue []string
 		c.priorityChildrenMu.Lock()
 		if len(c.priorityChildren) > 0 {
 			priorityQueue = c.priorityChildren
 			// 更新全局计数器：这些 URL 即将被处理
 			DecrementPriorityChildren(int64(len(priorityQueue)))
 			log.Printf("[crawler] epoch %d/%d  processing %d priority children first", ep+1, maxEpoch, len(priorityQueue))
 		}
 		c.priorityChildrenMu.Unlock()
 		// 每轮开始前：拉取 priority URLs，插入队列前端
 		priorityAdded := c.fetchAndApplyPriorityURLs(&queue)
 		if priorityAdded > 0 {
@@ -324,7 +364,7 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
 			c.markVisited(u)
 		}
-		// 并发抓取本轮所有 URL
+		// ---- 并发抓取本轮所有 URL ----
 		var (
 			newLinks []URLWeight // 收集下一轮候选 URL
 			mu       sync.Mutex  // 保护 newLinks 的并发写入
@@ -333,6 +373,7 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
 		// 信号量：限制同时并发数（使用上方读取的 workers 值）
 		sem := make(chan struct{}, workers)
 		for _, u := range queue {
 			wg.Add(1)
 			sem <- struct{}{} // 获取一个令牌（阻塞直到有空闲槽位）
@@ -347,21 +388,43 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
 				// 抓取单个 URL，返回发现的子链接
 				hrefs := c.visitURL(rawURL)
 				n := len(hrefs)
-				if n > 0 {
+				if n == 0 {
-					// 每个子链接分得 1/n 的父页面权重
+					return
 					w := 1.0 / float64(n)
 					mu.Lock()
 					for _, h := range hrefs {
 						if !c.isVisited(h) {
 							newLinks = append(newLinks, URLWeight{URL: h, Weight: w})
 						}
 					}
 					mu.Unlock()
 				}
 				// 收集未访问的子链接
 				var children []string
 				for _, h := range hrefs {
 					if !c.isVisited(h) {
 						children = append(children, h)
 					}
 				}
 				if len(children) == 0 {
 					return
 				}
 				// 分配权重
 				w := 1.0 / float64(n)
 				// 孙链接（来自 priorityChildren）爬取后，子链接进入正常 BFS 队列（不再优先传递）
 				// 所有子链接统一进入 newLinks，经过 schedule() 调度
 				mu.Lock()
 				for _, h := range children {
 					newLinks = append(newLinks, URLWeight{URL: h, Weight: w})
 				}
 				mu.Unlock()
 			}(u)
 		}
 		wg.Wait()
 		// ---- 清空本轮已处理的 priorityQueue ----
 		c.priorityChildrenMu.Lock()
 		if len(priorityQueue) > 0 {
 			c.priorityChildren = c.priorityChildren[len(priorityQueue):]
 			// 计数器已在提取时扣除，这里不需要额外操作
 		}
 		c.priorityChildrenMu.Unlock()
 		// 本轮没有发现新链接，爬取结束
 		if len(newLinks) == 0 {
 			log.Println("[crawler] empty queue — stopping")
@@ -5,7 +5,7 @@
    <link rel="icon" type="image/svg+xml" href="/vite.svg" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>SESE 爬取管理</title>
-    <script type="module" crossorigin src="/assets/index-ClaCiNQl.js"></script>
+    <script type="module" crossorigin src="/assets/index-CiQvUT0P.js"></script>
    <link rel="stylesheet" crossorigin href="/assets/index-Dr22_wUg.css">
  </head>
  <body>