增加爬取状态api

This commit is contained in:
2026-04-10 18:40:40 +08:00
parent 65e6547d54
commit fd827cbde3
7 changed files with 104 additions and 7 deletions
+64
View File
@@ -82,6 +82,20 @@ type Crawler struct {
// ---- Priority 子链接优先队列(来自 priority worker 的子链接会优先爬取)----
priorityChildrenMu sync.Mutex
priorityChildren []string // Priority URL 产生的子链接(优先处理)
// ---- 爬取状态暴露(供前端监控) ----
crawlStatusMu sync.RWMutex
crawlStatus CrawlStatus // 当前轮次状态
}
// CrawlStatus is the crawl progress snapshot exposed to the frontend for monitoring.
type CrawlStatus struct {
CurrentEpoch int `json:"current_epoch"` // current epoch (1-based)
MaxEpoch int `json:"max_epoch"` // upper bound on the number of epochs
QueueLength int `json:"queue_length"` // queue length for the current epoch
CompletedCount int `json:"completed_count"` // URLs finished in the current epoch
VisitedTotal int `json:"visited_total"` // total number of URLs recorded as visited
IsRunning bool `json:"is_running"` // whether the crawl is currently running
}
// 全局活跃线程计数器(跨包可读,无需持有 Crawler 引用)
@@ -336,6 +350,12 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
c.markVisited(entryURL)
queue := []string{entryURL}
// 初始化爬取状态
c.updateCrawlStatus(func(cs *CrawlStatus) {
cs.MaxEpoch = maxEpoch
cs.IsRunning = true
})
// 启动后台重爬定时器:定期释放过期 URL 到候选池
c.startRecrawlTicker()
@@ -343,6 +363,13 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
// 每轮 epoch 从 config 读取最新 workers 值,支持运行时动态调整
workers := config.CrawlerWorkers()
// 更新爬取状态:新一轮开始
c.updateCrawlStatus(func(cs *CrawlStatus) {
cs.CurrentEpoch = ep + 1
cs.QueueLength = len(queue)
cs.CompletedCount = 0
})
// ---- 优先处理 priorityChildren 队列(来自 priority worker 的子链接)----
var priorityQueue []string
c.priorityChildrenMu.Lock()
@@ -371,6 +398,7 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
newLinks []URLWeight // 收集下一轮候选 URL
mu sync.Mutex // 保护 newLinks 的并发写入
wg sync.WaitGroup
completed int64 // 本轮已完成的计数(atomic
)
// 信号量:限制同时并发数(使用上方读取的 workers 值)
@@ -389,6 +417,13 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
// 抓取单个 URL,返回发现的子链接
hrefs := c.visitURL(rawURL)
// 更新完成计数
currentCompleted := atomic.AddInt64(&completed, 1)
c.updateCrawlStatus(func(cs *CrawlStatus) {
cs.CompletedCount = int(currentCompleted)
})
n := len(hrefs)
if n == 0 {
return
@@ -427,15 +462,30 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
}
c.priorityChildrenMu.Unlock()
// 更新已收录总数
c.visitedMu.RLock()
visitedTotal := len(c.visited)
c.visitedMu.RUnlock()
c.updateCrawlStatus(func(cs *CrawlStatus) {
cs.VisitedTotal = visitedTotal
})
// 本轮没有发现新链接,爬取结束
if len(newLinks) == 0 {
log.Println("[crawler] empty queue — stopping")
c.updateCrawlStatus(func(cs *CrawlStatus) {
cs.IsRunning = false
})
return
}
// 调度算法:从候选 URL 中选出下一轮要抓取的队列
queue = c.schedule(newLinks)
}
// 所有轮次完成,更新状态
c.updateCrawlStatus(func(cs *CrawlStatus) {
cs.IsRunning = false
})
}
// visitURLRaw 抓取 URL 的核心逻辑,提取标题、描述、正文、子链接。
@@ -1089,3 +1139,17 @@ func (c *Crawler) GetStats() Stats {
KeywordsFetched: atomic.LoadInt64(&c.stats.KeywordsFetched),
}
}
// GetCrawlStatus returns a copy of the current crawl status for frontend
// monitoring. The copy is taken while holding the read lock, so callers
// never observe a partially applied update.
func (c *Crawler) GetCrawlStatus() CrawlStatus {
	c.crawlStatusMu.RLock()
	snapshot := c.crawlStatus
	c.crawlStatusMu.RUnlock()

	return snapshot
}
// updateCrawlStatus applies fn to the crawler's status while holding the
// write lock (internal use only). The unlock is deferred so the lock is
// released even if fn panics.
func (c *Crawler) updateCrawlStatus(fn func(*CrawlStatus)) {
	mu := &c.crawlStatusMu
	mu.Lock()
	defer mu.Unlock()

	status := &c.crawlStatus
	fn(status)
}
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+2 -2
View File
@@ -5,8 +5,8 @@
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>SESE 爬取管理</title>
<script type="module" crossorigin src="/assets/index-UuqaCCT8.js"></script>
<link rel="stylesheet" crossorigin href="/assets/index-Dr22_wUg.css">
<script type="module" crossorigin src="/assets/index-Bk_Z2Kbw.js"></script>
<link rel="stylesheet" crossorigin href="/assets/index-vFVFq14O.css">
</head>
<body>
<div id="app"></div>
+34 -1
View File
@@ -66,6 +66,7 @@ type Server struct {
// crawler 爬虫实例(用于 Priority URL 添加时立即触发爬取)
crawler interface {
TriggerPriorityCrawl(url string)
GetCrawlStatus() crawler.CrawlStatus
}
}
@@ -95,9 +96,10 @@ func (s *Server) SetBacklinkRunner(r interface {
s.backlinkRunner = r
}
// SetCrawler injects the crawler instance, which is used to trigger an
// immediate crawl when a Priority URL is added and to query crawl status.
func (s *Server) SetCrawler(c interface {
TriggerPriorityCrawl(url string)
GetCrawlStatus() crawler.CrawlStatus
}) {
s.crawler = c
}
@@ -132,6 +134,7 @@ func (s *Server) Handler() http.Handler {
mux.HandleFunc("/admin/pending", s.handleAdminPending)
mux.HandleFunc("/admin/workers", s.handleAdminWorkers)
mux.HandleFunc("/admin/backlink", s.handleAdminBacklink)
mux.HandleFunc("/admin/crawl/status", s.handleAdminCrawlStatus)
// 静态文件(SPA fallback)
mux.Handle("/", spaHandler{dist: "dist"})
return mux
@@ -705,6 +708,36 @@ func (s *Server) handleAdminBacklink(w http.ResponseWriter, r *http.Request) {
}
}
// handleAdminCrawlStatus serves the crawler's progress snapshot as JSON.
//
// GET returns the current epoch, epoch limit, queue length, completed count,
// total visited count and running flag. OPTIONS is answered with 204 and no
// body. Any other method receives a 405 with a JSON error body.
func (s *Server) handleAdminCrawlStatus(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Access-Control-Allow-Origin", "*")
	w.Header().Set("Content-Type", "application/json; charset=utf-8")

	switch r.Method {
	case http.MethodOptions:
		w.WriteHeader(http.StatusNoContent)
		return
	case http.MethodGet:
		// Handled below.
	default:
		// http.Error would replace the Content-Type set above with text/plain,
		// mislabelling the JSON body, so the response is written by hand.
		w.Header().Set("Allow", "GET, OPTIONS")
		w.WriteHeader(http.StatusMethodNotAllowed)
		_, _ = w.Write([]byte(`{"error":"method not allowed"}` + "\n"))
		return
	}

	// With no crawler injected, report the zero-valued (idle) status.
	var status crawler.CrawlStatus
	if s.crawler != nil {
		status = s.crawler.GetCrawlStatus()
	}

	// The status line and headers are committed once encoding starts, so a
	// failed write (typically a disconnected client) cannot be reported back.
	_ = json.NewEncoder(w).Encode(status)
}
// ---- 搜索处理器 ----
// searchResponse 是搜索 API 的 JSON 响应结构。