优化手动增加的逻辑

This commit is contained in:
2026-04-11 21:38:04 +08:00
parent 0ce381b8a7
commit 5abe9271fe
7 changed files with 57 additions and 29 deletions
+49 -22
View File
@@ -73,15 +73,14 @@ type Crawler struct {
priorityCh chan string // Priority URL 任务队列(用户手动添加) priorityCh chan string // Priority URL 任务队列(用户手动添加)
priorityChildCh chan string // Priority 子链接队列(子 URL 继续由 priority worker 爬取) priorityChildCh chan string // Priority 子链接队列(子 URL 继续由 priority worker 爬取)
prioritySem chan struct{} // Priority 信号量(上限 priorityMaxWorkers prioritySem chan struct{} // Priority 信号量(上限 priorityMaxWorkers
priorityWg sync.WaitGroup // 等待所有 Priority goroutine 结束 priorityWg sync.WaitGroup // 等待所有 Priority goroutine 结束
priorityMu sync.RWMutex // 保护 priorityStats priorityMu sync.RWMutex // 保护 priorityStats 和 priorityChildLinks
priorityStats struct { priorityStats struct {
pending int64 // 待处理的 Priority URL 数量(入队但未开始) pending int64 // 待处理的 Priority URL 数量(入队但未开始)
active int64 // 正在处理的 Priority URL 数量 active int64 // 正在处理的 Priority URL 数量
} }
// 孙链接(子 URL 的子链接)进入普通 BFS 队列 // 孙链接(子 URL 的子链接)进入普通 BFS 队列
normalChildCh chan URLWeight // 孙链接 channel,由 Run 循环消费 normalChildCh chan URLWeight // 孙链接 channel,由 Run 循环消费
// ---- 爬取状态暴露(供前端监控) ---- // ---- 爬取状态暴露(供前端监控) ----
crawlStatusMu sync.RWMutex crawlStatusMu sync.RWMutex
crawlStatus CrawlStatus // 当前轮次状态 crawlStatus CrawlStatus // 当前轮次状态
@@ -94,6 +93,7 @@ type CrawlStatus struct {
QueueLength int `json:"queue_length"` // 本轮队列长度 QueueLength int `json:"queue_length"` // 本轮队列长度
CompletedCount int `json:"completed_count"` // 本轮已完成的 URL 数 CompletedCount int `json:"completed_count"` // 本轮已完成的 URL 数
VisitedTotal int `json:"visited_total"` // 已收录 URL 总数 VisitedTotal int `json:"visited_total"` // 已收录 URL 总数
NextPoolSize int `json:"next_pool_size"` // 下一轮链接池大小(newLinks 调度后的队列长度)
IsRunning bool `json:"is_running"` // 是否正在运行 IsRunning bool `json:"is_running"` // 是否正在运行
} }
@@ -157,15 +157,15 @@ func DecrementPriorityLevel2Inflight(n int64) {
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。 // prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler { func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
c := &Crawler{ c := &Crawler{
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second), fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
db: db, db: db,
analyzer: a, analyzer: a,
prosperMap: prosperMap, prosperMap: prosperMap,
visited: make(map[string]bool), visited: make(map[string]bool),
priorityCh: make(chan string, priorityQueueSize), priorityCh: make(chan string, priorityQueueSize),
priorityChildCh: make(chan string, priorityQueueSize), priorityChildCh: make(chan string, priorityQueueSize),
prioritySem: make(chan struct{}, priorityMaxWorkers), prioritySem: make(chan struct{}, priorityMaxWorkers),
normalChildCh: make(chan URLWeight, priorityQueueSize), normalChildCh: make(chan URLWeight, 50000), // 孙链接 channel,大 buffer 避免丢
} }
// 启动 Priority Worker(独立 goroutine,不受主 workers 限制) // 启动 Priority Worker(独立 goroutine,不受主 workers 限制)
go c.runPriorityWorker() go c.runPriorityWorker()
@@ -320,8 +320,8 @@ func (c *Crawler) runPriorityWorker() {
// priorityCrawlLoop 爬取单个 URL: // priorityCrawlLoop 爬取单个 URL:
// //
// level=1(一级,手动 URL):visitURLUnlimited 无限爬子链接 → 二级队列 // level=1(一级,手动 URL):visitURLUnlimited 无限爬子链接 → 二级队列priorityChildCh
// level=2(二级,子 URL):visitURLUnlimited 无限爬子链接 → 普通 BFS // level=2(二级,子 URL):visitURLUnlimited 无限爬子链接 → 孙链接(normalChildCh
func (c *Crawler) priorityCrawlLoop(rawURL string, level int) { func (c *Crawler) priorityCrawlLoop(rawURL string, level int) {
defer c.priorityWg.Done() defer c.priorityWg.Done()
defer func() { <-c.prioritySem }() defer func() { <-c.prioritySem }()
@@ -341,13 +341,13 @@ func (c *Crawler) priorityCrawlLoop(rawURL string, level int) {
// 两级都不限制子链接数量 // 两级都不限制子链接数量
children := c.visitURLUnlimited(rawURL) children := c.visitURLUnlimited(rawURL)
log.Printf("[crawler] priority[%d] crawl done: %s (%d child links)", level, rawURL, len(children)) //log.Printf("[crawler] priority[%d] crawl done: %s (%d child links)", level, rawURL, len(children))
if len(children) == 0 { if len(children) == 0 {
return return
} }
// 一级:子链接进二级队列;二级:子链接进普通 BFS // 一级:子链接进二级队列;二级:子链接直接加入 newLinks(同步)
for _, child := range children { for _, child := range children {
if level == 1 { if level == 1 {
select { select {
@@ -357,9 +357,11 @@ func (c *Crawler) priorityCrawlLoop(rawURL string, level int) {
// 二级队列满,丢弃 // 二级队列满,丢弃
} }
} else { } else {
// 二级:子链接进孙链接 channel,由 Run 永久 drain 直到全部到达
select { select {
case c.normalChildCh <- URLWeight{URL: child, Weight: 1.0}: case c.normalChildCh <- URLWeight{URL: child, Weight: 1.0}:
default: default:
// channel 满(很少发生),孙链接丢弃
} }
} }
} }
@@ -368,6 +370,12 @@ func (c *Crawler) priorityCrawlLoop(rawURL string, level int) {
// TriggerPriorityCrawl 立即触发高优先级爬取(突破 workers 上限)。 // TriggerPriorityCrawl 立即触发高优先级爬取(突破 workers 上限)。
// 适合用户手动插入 URL 时立即响应。 // 适合用户手动插入 URL 时立即响应。
func (c *Crawler) TriggerPriorityCrawl(url string) { func (c *Crawler) TriggerPriorityCrawl(url string) {
defer func() {
if r := recover(); r != nil {
// priorityCh 已关闭(Run 已退出),忽略
log.Printf("[crawler] priority crawl ignored (crawler stopped): %s", url)
}
}()
select { select {
case c.priorityCh <- url: case c.priorityCh <- url:
c.priorityMu.Lock() c.priorityMu.Lock()
@@ -529,9 +537,9 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
} }
wg.Wait() wg.Wait()
// 消费孙链接 channel(孙链接来自 priority 爬取的子链接,进入普通 BFS 队列) // drain 孙链接 channel(孙链接来自 priority 爬取的子链接,进入普通 BFS 队列)
// 孙链接爬取是异步的,使用 timeout 确保全部到达后再调度 // runPriorityWorker 发送完毕后等待足够时间,确保 channel 中最后几条到达
timeout := time.After(5 * time.Second) timeout := time.After(10 * time.Second)
drained := false drained := false
for !drained { for !drained {
select { select {
@@ -550,17 +558,36 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
cs.VisitedTotal = visitedTotal cs.VisitedTotal = visitedTotal
}) })
// 本轮没有发现新链接,爬取结束 // 本轮没有发现新链接,报告停止,等待新 Priority URL 触发重启
if len(newLinks) == 0 { if len(newLinks) == 0 {
log.Println("[crawler] empty queue — stopping") log.Println("[crawler] empty — stopped, waiting for new URLs")
c.updateCrawlStatus(func(cs *CrawlStatus) { c.updateCrawlStatus(func(cs *CrawlStatus) {
cs.IsRunning = false cs.IsRunning = false
}) })
return // 空循环等 normalChildCh,新数据到达后立即从 epoch 0 重新开始
for {
gc, ok := <-c.normalChildCh
if !ok {
return
}
newLinks = append(newLinks, gc)
log.Printf("[crawler] new URLs detected, restarting from epoch 0 (%d in pool)", len(newLinks))
c.updateCrawlStatus(func(cs *CrawlStatus) {
cs.IsRunning = true
})
ep = -1 // continue 后 ep++ 变成 0
break // 退出空循环,进入正常队列处理
}
} }
// 调度算法:从候选 URL 中选出下一轮要抓取的队列 // 调度算法:从候选 URL 中选出下一轮要抓取的队列
nextPoolSize := len(newLinks)
queue = c.schedule(newLinks) queue = c.schedule(newLinks)
// 更新下一轮链接池大小
c.updateCrawlStatus(func(cs *CrawlStatus) {
cs.NextPoolSize = nextPoolSize
})
} }
// 所有轮次完成,更新状态 // 所有轮次完成,更新状态
c.updateCrawlStatus(func(cs *CrawlStatus) { c.updateCrawlStatus(func(cs *CrawlStatus) {
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+2 -2
View File
@@ -5,8 +5,8 @@
<link rel="icon" type="image/svg+xml" href="/vite.svg" /> <link rel="icon" type="image/svg+xml" href="/vite.svg" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> <meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>SESE 爬取管理</title> <title>SESE 爬取管理</title>
<script type="module" crossorigin src="/assets/index-5m9a_huA.js"></script> <script type="module" crossorigin src="/assets/index-yf_Ps55i.js"></script>
<link rel="stylesheet" crossorigin href="/assets/index-7XdVcZh0.css"> <link rel="stylesheet" crossorigin href="/assets/index-DNzRL3Ws.css">
</head> </head>
<body> <body>
<div id="app"></div> <div id="app"></div>
+1
View File
@@ -812,6 +812,7 @@ func (s *Server) handleAdminCrawlStatus(w http.ResponseWriter, r *http.Request)
QueueLength: 0, QueueLength: 0,
CompletedCount: 0, CompletedCount: 0,
VisitedTotal: 0, VisitedTotal: 0,
NextPoolSize: 0,
IsRunning: false, IsRunning: false,
}) })
return return