优化手动增加的逻辑
This commit is contained in:
+49
-22
@@ -73,15 +73,14 @@ type Crawler struct {
|
|||||||
priorityCh chan string // Priority URL 任务队列(用户手动添加)
|
priorityCh chan string // Priority URL 任务队列(用户手动添加)
|
||||||
priorityChildCh chan string // Priority 子链接队列(子 URL 继续由 priority worker 爬取)
|
priorityChildCh chan string // Priority 子链接队列(子 URL 继续由 priority worker 爬取)
|
||||||
prioritySem chan struct{} // Priority 信号量(上限 priorityMaxWorkers)
|
prioritySem chan struct{} // Priority 信号量(上限 priorityMaxWorkers)
|
||||||
priorityWg sync.WaitGroup // 等待所有 Priority goroutine 结束
|
priorityWg sync.WaitGroup // 等待所有 Priority goroutine 结束
|
||||||
priorityMu sync.RWMutex // 保护 priorityStats
|
priorityMu sync.RWMutex // 保护 priorityStats 和 priorityChildLinks
|
||||||
priorityStats struct {
|
priorityStats struct {
|
||||||
pending int64 // 待处理的 Priority URL 数量(入队但未开始)
|
pending int64 // 待处理的 Priority URL 数量(入队但未开始)
|
||||||
active int64 // 正在处理的 Priority URL 数量
|
active int64 // 正在处理的 Priority URL 数量
|
||||||
}
|
}
|
||||||
// 孙链接(子 URL 的子链接)进入普通 BFS 队列
|
// 孙链接(子 URL 的子链接)进入普通 BFS 队列
|
||||||
normalChildCh chan URLWeight // 孙链接 channel,由 Run 循环消费
|
normalChildCh chan URLWeight // 孙链接 channel,由 Run 循环消费
|
||||||
|
|
||||||
// ---- 爬取状态暴露(供前端监控) ----
|
// ---- 爬取状态暴露(供前端监控) ----
|
||||||
crawlStatusMu sync.RWMutex
|
crawlStatusMu sync.RWMutex
|
||||||
crawlStatus CrawlStatus // 当前轮次状态
|
crawlStatus CrawlStatus // 当前轮次状态
|
||||||
@@ -94,6 +93,7 @@ type CrawlStatus struct {
|
|||||||
QueueLength int `json:"queue_length"` // 本轮队列长度
|
QueueLength int `json:"queue_length"` // 本轮队列长度
|
||||||
CompletedCount int `json:"completed_count"` // 本轮已完成的 URL 数
|
CompletedCount int `json:"completed_count"` // 本轮已完成的 URL 数
|
||||||
VisitedTotal int `json:"visited_total"` // 已收录 URL 总数
|
VisitedTotal int `json:"visited_total"` // 已收录 URL 总数
|
||||||
|
NextPoolSize int `json:"next_pool_size"` // 下一轮链接池大小(newLinks 调度后的队列长度)
|
||||||
IsRunning bool `json:"is_running"` // 是否正在运行
|
IsRunning bool `json:"is_running"` // 是否正在运行
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -157,15 +157,15 @@ func DecrementPriorityLevel2Inflight(n int64) {
|
|||||||
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
||||||
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||||
c := &Crawler{
|
c := &Crawler{
|
||||||
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
|
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
|
||||||
db: db,
|
db: db,
|
||||||
analyzer: a,
|
analyzer: a,
|
||||||
prosperMap: prosperMap,
|
prosperMap: prosperMap,
|
||||||
visited: make(map[string]bool),
|
visited: make(map[string]bool),
|
||||||
priorityCh: make(chan string, priorityQueueSize),
|
priorityCh: make(chan string, priorityQueueSize),
|
||||||
priorityChildCh: make(chan string, priorityQueueSize),
|
priorityChildCh: make(chan string, priorityQueueSize),
|
||||||
prioritySem: make(chan struct{}, priorityMaxWorkers),
|
prioritySem: make(chan struct{}, priorityMaxWorkers),
|
||||||
normalChildCh: make(chan URLWeight, priorityQueueSize),
|
normalChildCh: make(chan URLWeight, 50000), // 孙链接 channel,大 buffer 避免丢
|
||||||
}
|
}
|
||||||
// 启动 Priority Worker(独立 goroutine,不受主 workers 限制)
|
// 启动 Priority Worker(独立 goroutine,不受主 workers 限制)
|
||||||
go c.runPriorityWorker()
|
go c.runPriorityWorker()
|
||||||
@@ -320,8 +320,8 @@ func (c *Crawler) runPriorityWorker() {
|
|||||||
|
|
||||||
// priorityCrawlLoop 爬取单个 URL:
|
// priorityCrawlLoop 爬取单个 URL:
|
||||||
//
|
//
|
||||||
// level=1(一级,手动 URL):visitURLUnlimited 无限爬子链接 → 二级队列
|
// level=1(一级,手动 URL):visitURLUnlimited 无限爬子链接 → 二级队列(priorityChildCh)
|
||||||
// level=2(二级,子 URL):visitURLUnlimited 无限爬子链接 → 普通 BFS
|
// level=2(二级,子 URL):visitURLUnlimited 无限爬子链接 → 孙链接(normalChildCh)
|
||||||
func (c *Crawler) priorityCrawlLoop(rawURL string, level int) {
|
func (c *Crawler) priorityCrawlLoop(rawURL string, level int) {
|
||||||
defer c.priorityWg.Done()
|
defer c.priorityWg.Done()
|
||||||
defer func() { <-c.prioritySem }()
|
defer func() { <-c.prioritySem }()
|
||||||
@@ -341,13 +341,13 @@ func (c *Crawler) priorityCrawlLoop(rawURL string, level int) {
|
|||||||
// 两级都不限制子链接数量
|
// 两级都不限制子链接数量
|
||||||
children := c.visitURLUnlimited(rawURL)
|
children := c.visitURLUnlimited(rawURL)
|
||||||
|
|
||||||
log.Printf("[crawler] priority[%d] crawl done: %s (%d child links)", level, rawURL, len(children))
|
//log.Printf("[crawler] priority[%d] crawl done: %s (%d child links)", level, rawURL, len(children))
|
||||||
|
|
||||||
if len(children) == 0 {
|
if len(children) == 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// 一级:子链接进二级队列;二级:子链接进普通 BFS
|
// 一级:子链接进二级队列;二级:子链接直接加入 newLinks(同步)
|
||||||
for _, child := range children {
|
for _, child := range children {
|
||||||
if level == 1 {
|
if level == 1 {
|
||||||
select {
|
select {
|
||||||
@@ -357,9 +357,11 @@ func (c *Crawler) priorityCrawlLoop(rawURL string, level int) {
|
|||||||
// 二级队列满,丢弃
|
// 二级队列满,丢弃
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
// 二级:子链接进孙链接 channel,由 Run 永久 drain 直到全部到达
|
||||||
select {
|
select {
|
||||||
case c.normalChildCh <- URLWeight{URL: child, Weight: 1.0}:
|
case c.normalChildCh <- URLWeight{URL: child, Weight: 1.0}:
|
||||||
default:
|
default:
|
||||||
|
// channel 满(很少发生),孙链接丢弃
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -368,6 +370,12 @@ func (c *Crawler) priorityCrawlLoop(rawURL string, level int) {
|
|||||||
// TriggerPriorityCrawl 立即触发高优先级爬取(突破 workers 上限)。
|
// TriggerPriorityCrawl 立即触发高优先级爬取(突破 workers 上限)。
|
||||||
// 适合用户手动插入 URL 时立即响应。
|
// 适合用户手动插入 URL 时立即响应。
|
||||||
func (c *Crawler) TriggerPriorityCrawl(url string) {
|
func (c *Crawler) TriggerPriorityCrawl(url string) {
|
||||||
|
defer func() {
|
||||||
|
if r := recover(); r != nil {
|
||||||
|
// priorityCh 已关闭(Run 已退出),忽略
|
||||||
|
log.Printf("[crawler] priority crawl ignored (crawler stopped): %s", url)
|
||||||
|
}
|
||||||
|
}()
|
||||||
select {
|
select {
|
||||||
case c.priorityCh <- url:
|
case c.priorityCh <- url:
|
||||||
c.priorityMu.Lock()
|
c.priorityMu.Lock()
|
||||||
@@ -529,9 +537,9 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
|||||||
}
|
}
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
|
|
||||||
// 消费孙链接 channel(孙链接来自 priority 爬取的子链接,进入普通 BFS 队列)
|
// drain 孙链接 channel(孙链接来自 priority 爬取的子链接,进入普通 BFS 队列)
|
||||||
// 孙链接爬取是异步的,使用 timeout 确保全部到达后再调度
|
// runPriorityWorker 发送完毕后等待足够时间,确保 channel 中最后几条到达
|
||||||
timeout := time.After(5 * time.Second)
|
timeout := time.After(10 * time.Second)
|
||||||
drained := false
|
drained := false
|
||||||
for !drained {
|
for !drained {
|
||||||
select {
|
select {
|
||||||
@@ -550,17 +558,36 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
|||||||
cs.VisitedTotal = visitedTotal
|
cs.VisitedTotal = visitedTotal
|
||||||
})
|
})
|
||||||
|
|
||||||
// 本轮没有发现新链接,爬取结束
|
// 本轮没有发现新链接,报告停止,等待新 Priority URL 触发重启
|
||||||
if len(newLinks) == 0 {
|
if len(newLinks) == 0 {
|
||||||
log.Println("[crawler] empty queue — stopping")
|
log.Println("[crawler] empty — stopped, waiting for new URLs")
|
||||||
c.updateCrawlStatus(func(cs *CrawlStatus) {
|
c.updateCrawlStatus(func(cs *CrawlStatus) {
|
||||||
cs.IsRunning = false
|
cs.IsRunning = false
|
||||||
})
|
})
|
||||||
return
|
// 空循环等 normalChildCh,新数据到达后立即从 epoch 0 重新开始
|
||||||
|
for {
|
||||||
|
gc, ok := <-c.normalChildCh
|
||||||
|
if !ok {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
newLinks = append(newLinks, gc)
|
||||||
|
log.Printf("[crawler] new URLs detected, restarting from epoch 0 (%d in pool)", len(newLinks))
|
||||||
|
c.updateCrawlStatus(func(cs *CrawlStatus) {
|
||||||
|
cs.IsRunning = true
|
||||||
|
})
|
||||||
|
ep = -1 // continue 后 ep++ 变成 0
|
||||||
|
break // 退出空循环,进入正常队列处理
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// 调度算法:从候选 URL 中选出下一轮要抓取的队列
|
// 调度算法:从候选 URL 中选出下一轮要抓取的队列
|
||||||
|
nextPoolSize := len(newLinks)
|
||||||
queue = c.schedule(newLinks)
|
queue = c.schedule(newLinks)
|
||||||
|
|
||||||
|
// 更新下一轮链接池大小
|
||||||
|
c.updateCrawlStatus(func(cs *CrawlStatus) {
|
||||||
|
cs.NextPoolSize = nextPoolSize
|
||||||
|
})
|
||||||
}
|
}
|
||||||
// 所有轮次完成,更新状态
|
// 所有轮次完成,更新状态
|
||||||
c.updateCrawlStatus(func(cs *CrawlStatus) {
|
c.updateCrawlStatus(func(cs *CrawlStatus) {
|
||||||
|
|||||||
Vendored
-2
File diff suppressed because one or more lines are too long
Vendored
+2
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Vendored
+2
-2
@@ -5,8 +5,8 @@
|
|||||||
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
<title>SESE 爬取管理</title>
|
<title>SESE 爬取管理</title>
|
||||||
<script type="module" crossorigin src="/assets/index-5m9a_huA.js"></script>
|
<script type="module" crossorigin src="/assets/index-yf_Ps55i.js"></script>
|
||||||
<link rel="stylesheet" crossorigin href="/assets/index-7XdVcZh0.css">
|
<link rel="stylesheet" crossorigin href="/assets/index-DNzRL3Ws.css">
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div id="app"></div>
|
<div id="app"></div>
|
||||||
|
|||||||
@@ -812,6 +812,7 @@ func (s *Server) handleAdminCrawlStatus(w http.ResponseWriter, r *http.Request)
|
|||||||
QueueLength: 0,
|
QueueLength: 0,
|
||||||
CompletedCount: 0,
|
CompletedCount: 0,
|
||||||
VisitedTotal: 0,
|
VisitedTotal: 0,
|
||||||
|
NextPoolSize: 0,
|
||||||
IsRunning: false,
|
IsRunning: false,
|
||||||
})
|
})
|
||||||
return
|
return
|
||||||
|
|||||||
+1
-1
Submodule sese-engine-ui updated: 7ba2011ead...63c40d69d4
Reference in New Issue
Block a user