up

2026-04-09 00:14:55 +08:00
parent 223177b1dd
commit 439d0c1cb6
9 changed files with 88 additions and 46 deletions
@@ -3,24 +3,24 @@
 package crawler

 import (
-	"bytes"          // 字节缓冲（构造 HTTP POST 请求体）
-	"context"        // context 超时控制
-	"encoding/json"  // JSON 序列化（发送关键词数据到收获服务）
-	"fmt"            // 格式化（构造目标地址）
-	"log"            // 日志输出
-	"math"           // 数学运算（指数衰减、质量评分）
-	"math/rand"      // 随机数（加权采样、队列打乱）
-	"net/http"       // HTTP 客户端（POST 数据到收获服务）
-	"net/url"        // URL 解析
-	"strings"        // 字符串操作
-	"sync"           // 互斥锁（保护并发收集结果）
-	"sync/atomic"    // 原子操作（计数器，无锁并发更新）
-	"time"           // 时间戳
+	"bytes"         // 字节缓冲（构造 HTTP POST 请求体）
+	"context"       // context 超时控制
+	"encoding/json" // JSON 序列化（发送关键词数据到收获服务）
+	"fmt"           // 格式化（构造目标地址）
+	"log"           // 日志输出
+	"math"          // 数学运算（指数衰减、质量评分）
+	"math/rand"     // 随机数（加权采样、队列打乱）
+	"net/http"      // HTTP 客户端（POST 数据到收获服务）
+	"net/url"       // URL 解析
+	"strings"       // 字符串操作
+	"sync"          // 互斥锁（保护并发收集结果）
+	"sync/atomic"   // 原子操作（计数器，无锁并发更新）
+	"time"          // 时间戳

 	"sese-engine/analyzer" // 文本分析和关键词提取
-	"sese-engine/config"    // 全局配置常量
-	"sese-engine/parser"    // HTML 解析（提取标题、正文、链接）
-	"sese-engine/storage"   // 持久化存储
+	"sese-engine/config"   // 全局配置常量
+	"sese-engine/parser"   // HTML 解析（提取标题、正文、链接）
+	"sese-engine/storage"  // 持久化存储
 )

 // Stats 存放爬虫实时统计计数器（使用 atomic 原子读取）。
@@ -39,7 +39,7 @@ const (

 const (
 	circuitFailureThreshold = 5  // 连续失败多少次后触发熔断
-	circuitCooldownSeconds = 30 // 熔断持续时间（秒）
+	circuitCooldownSeconds  = 30 // 熔断持续时间（秒）
 )

 // Crawler 编排整个 BFS 爬取流程。
@@ -51,9 +51,9 @@ type Crawler struct {
 	stats      Stats              // 原子计数器

 	// 熔断器（全用 atomic，无 mutex，无慢 I/O 时持有锁的风险）
-	circuitState    int32  // circuitClosed | circuitOpen | circuitHalfOpen
-	circuitFailures int32  // 连续失败计数（atomic）
-	circuitExpiry  int64  // 熔断/半开截止 Unix 时间戳（秒）
+	circuitState    int32 // circuitClosed | circuitOpen | circuitHalfOpen
+	circuitFailures int32 // 连续失败计数（atomic）
+	circuitExpiry   int64 // 熔断/半开截止 Unix 时间戳（秒）
 }

 // New 创建一个 Crawler 实例。
@@ -92,8 +92,8 @@ func (c *Crawler) fetchAndApplyPriorityURLs(visited map[string]bool, queue *[]st

 // URLWeight 将 URL 和发现权重打包在一起，用于调度决策。
 type URLWeight struct {
-	URL    string   // 待访问的 URL
-	Weight float64  // 发现权重（从父页面分得的"关注度"，页面链接越多则每个分得越少）
+	URL    string  // 待访问的 URL
+	Weight float64 // 发现权重（从父页面分得的"关注度"，页面链接越多则每个分得越少）
 }

 // Run 启动 BFS 爬取，从 entryURL 开始，执行最多 maxEpoch 轮。
@@ -399,8 +399,7 @@ func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
 		if failures >= circuitFailureThreshold {
 			atomic.StoreInt32(&c.circuitState, circuitOpen)
 			atomic.StoreInt64(&c.circuitExpiry, now+int64(circuitCooldownSeconds))
-			log.Printf("[crawler] circuit OPEN: harvest endpoint unreachable (%d failures), cooling for %ds",
-				failures, circuitCooldownSeconds)
+			//log.Printf("[crawler] circuit OPEN: harvest endpoint unreachable (%d failures), cooling for %ds",failures, circuitCooldownSeconds)
 		}
 		return
 	}
@@ -482,7 +481,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
 	}
 	// 根据目标繁荣占比计算普通 URL 应保留数量
 	expectedProsperRatio := config.ExpectedProsperRatio()
-	n := int(float64(len(prosperURLs)) * (1-expectedProsperRatio) / expectedProsperRatio)
+	n := int(float64(len(prosperURLs)) * (1 - expectedProsperRatio) / expectedProsperRatio)
 	if len(otherURLs) > n {
 		keep := max(len(otherURLs)-len(selected)/10, n)
 		if keep < len(otherURLs) {