优化优先连接逻辑
This commit is contained in:
+78
-15
@@ -78,6 +78,10 @@ type Crawler struct {
|
|||||||
pending int64 // 待处理的 Priority URL 数量(入队但未开始)
|
pending int64 // 待处理的 Priority URL 数量(入队但未开始)
|
||||||
active int64 // 正在处理的 Priority URL 数量
|
active int64 // 正在处理的 Priority URL 数量
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ---- Priority 子链接优先队列(来自 priority worker 的子链接会优先爬取)----
|
||||||
|
priorityChildrenMu sync.Mutex
|
||||||
|
priorityChildren []string // Priority URL 产生的子链接(优先处理)
|
||||||
}
|
}
|
||||||
|
|
||||||
// 全局活跃线程计数器(跨包可读,无需持有 Crawler 引用)
|
// 全局活跃线程计数器(跨包可读,无需持有 Crawler 引用)
|
||||||
@@ -103,12 +107,26 @@ var globalPriorityStatus struct {
|
|||||||
// GlobalPriorityStatus returns the current global Priority Worker status as a map of counters.
|
// GlobalPriorityStatus returns the current global Priority Worker status as a map of counters.
|
||||||
func GlobalPriorityStatus() map[string]interface{} {
|
func GlobalPriorityStatus() map[string]interface{} {
|
||||||
return map[string]interface{}{
|
return map[string]interface{}{
|
||||||
"pending": atomic.LoadInt64(&globalPriorityStatus.pending),
|
"pending": atomic.LoadInt64(&globalPriorityStatus.pending),
|
||||||
"active": atomic.LoadInt64(&globalPriorityStatus.active),
|
"active": atomic.LoadInt64(&globalPriorityStatus.active),
|
||||||
"max_workers": priorityMaxWorkers,
|
"max_workers": priorityMaxWorkers,
|
||||||
|
"children_queue": atomic.LoadInt64(&globalPriorityChildren),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// globalPriorityChildren is the global priority child-link queue length counter (package-level, so shared across Crawler instances).
|
||||||
|
var globalPriorityChildren int64
|
||||||
|
|
||||||
|
// IncrementPriorityChildren atomically adds n to the global priority-children counter (globalPriorityChildren).
|
||||||
|
func IncrementPriorityChildren(n int64) {
|
||||||
|
atomic.AddInt64(&globalPriorityChildren, n)
|
||||||
|
}
|
||||||
|
|
||||||
|
// DecrementPriorityChildren atomically subtracts n from the global priority-children counter (globalPriorityChildren).
|
||||||
|
func DecrementPriorityChildren(n int64) {
|
||||||
|
atomic.AddInt64(&globalPriorityChildren, -n)
|
||||||
|
}
|
||||||
|
|
||||||
// New 创建一个 Crawler 实例。
|
// New 创建一个 Crawler 实例。
|
||||||
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
||||||
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||||
@@ -225,7 +243,18 @@ func (c *Crawler) runPriorityWorker() {
|
|||||||
// 直接调用 visitURL,绕过队列调度
|
// 直接调用 visitURL,绕过队列调度
|
||||||
hrefs := c.visitURL(rawURL)
|
hrefs := c.visitURL(rawURL)
|
||||||
|
|
||||||
// 收集的子链接正常进入 BFS 队列(由调用方处理,这里只负责爬取本身)
|
// 将子链接加入优先队列(保持优先级)
|
||||||
|
if len(hrefs) > 0 {
|
||||||
|
c.priorityChildrenMu.Lock()
|
||||||
|
for _, child := range hrefs {
|
||||||
|
if !c.isVisited(child) {
|
||||||
|
c.priorityChildren = append(c.priorityChildren, child)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
IncrementPriorityChildren(int64(len(hrefs)))
|
||||||
|
c.priorityChildrenMu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
log.Printf("[crawler] priority crawl done: %s (%d child links)", rawURL, len(hrefs))
|
log.Printf("[crawler] priority crawl done: %s (%d child links)", rawURL, len(hrefs))
|
||||||
|
|
||||||
// 清理已访问的 priority URL(防止重复爬取)
|
// 清理已访问的 priority URL(防止重复爬取)
|
||||||
@@ -312,6 +341,17 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
|||||||
// 每轮 epoch 从 config 读取最新 workers 值,支持运行时动态调整
|
// 每轮 epoch 从 config 读取最新 workers 值,支持运行时动态调整
|
||||||
workers := config.CrawlerWorkers()
|
workers := config.CrawlerWorkers()
|
||||||
|
|
||||||
|
// ---- 优先处理 priorityChildren 队列(来自 priority worker 的子链接)----
|
||||||
|
var priorityQueue []string
|
||||||
|
c.priorityChildrenMu.Lock()
|
||||||
|
if len(c.priorityChildren) > 0 {
|
||||||
|
priorityQueue = c.priorityChildren
|
||||||
|
// 更新全局计数器:这些 URL 即将被处理
|
||||||
|
DecrementPriorityChildren(int64(len(priorityQueue)))
|
||||||
|
log.Printf("[crawler] epoch %d/%d processing %d priority children first", ep+1, maxEpoch, len(priorityQueue))
|
||||||
|
}
|
||||||
|
c.priorityChildrenMu.Unlock()
|
||||||
|
|
||||||
// 每轮开始前:拉取 priority URLs,插入队列前端
|
// 每轮开始前:拉取 priority URLs,插入队列前端
|
||||||
priorityAdded := c.fetchAndApplyPriorityURLs(&queue)
|
priorityAdded := c.fetchAndApplyPriorityURLs(&queue)
|
||||||
if priorityAdded > 0 {
|
if priorityAdded > 0 {
|
||||||
@@ -324,7 +364,7 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
|||||||
c.markVisited(u)
|
c.markVisited(u)
|
||||||
}
|
}
|
||||||
|
|
||||||
// 并发抓取本轮所有 URL
|
// ---- 并发抓取本轮所有 URL ----
|
||||||
var (
|
var (
|
||||||
newLinks []URLWeight // 收集下一轮候选 URL
|
newLinks []URLWeight // 收集下一轮候选 URL
|
||||||
mu sync.Mutex // 保护 newLinks 的并发写入
|
mu sync.Mutex // 保护 newLinks 的并发写入
|
||||||
@@ -333,6 +373,7 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
|||||||
|
|
||||||
// 信号量:限制同时并发数(使用上方读取的 workers 值)
|
// 信号量:限制同时并发数(使用上方读取的 workers 值)
|
||||||
sem := make(chan struct{}, workers)
|
sem := make(chan struct{}, workers)
|
||||||
|
|
||||||
for _, u := range queue {
|
for _, u := range queue {
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
sem <- struct{}{} // 获取一个令牌(阻塞直到有空闲槽位)
|
sem <- struct{}{} // 获取一个令牌(阻塞直到有空闲槽位)
|
||||||
@@ -347,21 +388,43 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
|||||||
// 抓取单个 URL,返回发现的子链接
|
// 抓取单个 URL,返回发现的子链接
|
||||||
hrefs := c.visitURL(rawURL)
|
hrefs := c.visitURL(rawURL)
|
||||||
n := len(hrefs)
|
n := len(hrefs)
|
||||||
if n > 0 {
|
if n == 0 {
|
||||||
// 每个子链接分得 1/n 的父页面权重
|
return
|
||||||
w := 1.0 / float64(n)
|
|
||||||
mu.Lock()
|
|
||||||
for _, h := range hrefs {
|
|
||||||
if !c.isVisited(h) {
|
|
||||||
newLinks = append(newLinks, URLWeight{URL: h, Weight: w})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
mu.Unlock()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 收集未访问的子链接
|
||||||
|
var children []string
|
||||||
|
for _, h := range hrefs {
|
||||||
|
if !c.isVisited(h) {
|
||||||
|
children = append(children, h)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(children) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// 分配权重
|
||||||
|
w := 1.0 / float64(n)
|
||||||
|
|
||||||
|
// 孙链接(来自 priorityChildren)爬取后,子链接进入正常 BFS 队列(不再优先传递)
|
||||||
|
// 所有子链接统一进入 newLinks,经过 schedule() 调度
|
||||||
|
mu.Lock()
|
||||||
|
for _, h := range children {
|
||||||
|
newLinks = append(newLinks, URLWeight{URL: h, Weight: w})
|
||||||
|
}
|
||||||
|
mu.Unlock()
|
||||||
}(u)
|
}(u)
|
||||||
}
|
}
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
|
|
||||||
|
// ---- 清空本轮已处理的 priorityQueue ----
|
||||||
|
c.priorityChildrenMu.Lock()
|
||||||
|
if len(priorityQueue) > 0 {
|
||||||
|
c.priorityChildren = c.priorityChildren[len(priorityQueue):]
|
||||||
|
// 计数器已在提取时扣除,这里不需要额外操作
|
||||||
|
}
|
||||||
|
c.priorityChildrenMu.Unlock()
|
||||||
|
|
||||||
// 本轮没有发现新链接,爬取结束
|
// 本轮没有发现新链接,爬取结束
|
||||||
if len(newLinks) == 0 {
|
if len(newLinks) == 0 {
|
||||||
log.Println("[crawler] empty queue — stopping")
|
log.Println("[crawler] empty queue — stopping")
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
Vendored
+1
-1
@@ -5,7 +5,7 @@
|
|||||||
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
<title>SESE 爬取管理</title>
|
<title>SESE 爬取管理</title>
|
||||||
<script type="module" crossorigin src="/assets/index-ClaCiNQl.js"></script>
|
<script type="module" crossorigin src="/assets/index-CiQvUT0P.js"></script>
|
||||||
<link rel="stylesheet" crossorigin href="/assets/index-Dr22_wUg.css">
|
<link rel="stylesheet" crossorigin href="/assets/index-Dr22_wUg.css">
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
|
|||||||
Reference in New Issue
Block a user