增加爬取状态api
This commit is contained in:
@@ -82,6 +82,20 @@ type Crawler struct {
|
|||||||
// ---- Priority 子链接优先队列(来自 priority worker 的子链接会优先爬取)----
|
// ---- Priority 子链接优先队列(来自 priority worker 的子链接会优先爬取)----
|
||||||
priorityChildrenMu sync.Mutex
|
priorityChildrenMu sync.Mutex
|
||||||
priorityChildren []string // Priority URL 产生的子链接(优先处理)
|
priorityChildren []string // Priority URL 产生的子链接(优先处理)
|
||||||
|
|
||||||
|
// ---- 爬取状态暴露(供前端监控) ----
|
||||||
|
crawlStatusMu sync.RWMutex
|
||||||
|
crawlStatus CrawlStatus // 当前轮次状态
|
||||||
|
}
|
||||||
|
|
||||||
|
// CrawlStatus is the crawl progress snapshot exposed for frontend monitoring.
// It is serialized to JSON using the field tags below.
|
||||||
|
type CrawlStatus struct {
|
||||||
|
CurrentEpoch int `json:"current_epoch"` // current epoch (starts at 1)
|
||||||
|
MaxEpoch int `json:"max_epoch"` // upper bound on the number of epochs
|
||||||
|
QueueLength int `json:"queue_length"` // length of this epoch's queue
|
||||||
|
CompletedCount int `json:"completed_count"` // number of URLs completed in this epoch
|
||||||
|
VisitedTotal int `json:"visited_total"` // total number of URLs indexed so far
|
||||||
|
IsRunning bool `json:"is_running"` // whether the crawl is currently running
|
||||||
}
|
}
|
||||||
|
|
||||||
// 全局活跃线程计数器(跨包可读,无需持有 Crawler 引用)
|
// 全局活跃线程计数器(跨包可读,无需持有 Crawler 引用)
|
||||||
@@ -336,6 +350,12 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
|||||||
c.markVisited(entryURL)
|
c.markVisited(entryURL)
|
||||||
queue := []string{entryURL}
|
queue := []string{entryURL}
|
||||||
|
|
||||||
|
// 初始化爬取状态
|
||||||
|
c.updateCrawlStatus(func(cs *CrawlStatus) {
|
||||||
|
cs.MaxEpoch = maxEpoch
|
||||||
|
cs.IsRunning = true
|
||||||
|
})
|
||||||
|
|
||||||
// 启动后台重爬定时器:定期释放过期 URL 到候选池
|
// 启动后台重爬定时器:定期释放过期 URL 到候选池
|
||||||
c.startRecrawlTicker()
|
c.startRecrawlTicker()
|
||||||
|
|
||||||
@@ -343,6 +363,13 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
|||||||
// 每轮 epoch 从 config 读取最新 workers 值,支持运行时动态调整
|
// 每轮 epoch 从 config 读取最新 workers 值,支持运行时动态调整
|
||||||
workers := config.CrawlerWorkers()
|
workers := config.CrawlerWorkers()
|
||||||
|
|
||||||
|
// 更新爬取状态:新一轮开始
|
||||||
|
c.updateCrawlStatus(func(cs *CrawlStatus) {
|
||||||
|
cs.CurrentEpoch = ep + 1
|
||||||
|
cs.QueueLength = len(queue)
|
||||||
|
cs.CompletedCount = 0
|
||||||
|
})
|
||||||
|
|
||||||
// ---- 优先处理 priorityChildren 队列(来自 priority worker 的子链接)----
|
// ---- 优先处理 priorityChildren 队列(来自 priority worker 的子链接)----
|
||||||
var priorityQueue []string
|
var priorityQueue []string
|
||||||
c.priorityChildrenMu.Lock()
|
c.priorityChildrenMu.Lock()
|
||||||
@@ -371,6 +398,7 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
|||||||
newLinks []URLWeight // 收集下一轮候选 URL
|
newLinks []URLWeight // 收集下一轮候选 URL
|
||||||
mu sync.Mutex // 保护 newLinks 的并发写入
|
mu sync.Mutex // 保护 newLinks 的并发写入
|
||||||
wg sync.WaitGroup
|
wg sync.WaitGroup
|
||||||
|
completed int64 // 本轮已完成的计数(atomic)
|
||||||
)
|
)
|
||||||
|
|
||||||
// 信号量:限制同时并发数(使用上方读取的 workers 值)
|
// 信号量:限制同时并发数(使用上方读取的 workers 值)
|
||||||
@@ -389,6 +417,13 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
|||||||
|
|
||||||
// 抓取单个 URL,返回发现的子链接
|
// 抓取单个 URL,返回发现的子链接
|
||||||
hrefs := c.visitURL(rawURL)
|
hrefs := c.visitURL(rawURL)
|
||||||
|
|
||||||
|
// 更新完成计数
|
||||||
|
currentCompleted := atomic.AddInt64(&completed, 1)
|
||||||
|
c.updateCrawlStatus(func(cs *CrawlStatus) {
|
||||||
|
cs.CompletedCount = int(currentCompleted)
|
||||||
|
})
|
||||||
|
|
||||||
n := len(hrefs)
|
n := len(hrefs)
|
||||||
if n == 0 {
|
if n == 0 {
|
||||||
return
|
return
|
||||||
@@ -427,15 +462,30 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
|||||||
}
|
}
|
||||||
c.priorityChildrenMu.Unlock()
|
c.priorityChildrenMu.Unlock()
|
||||||
|
|
||||||
|
// 更新已收录总数
|
||||||
|
c.visitedMu.RLock()
|
||||||
|
visitedTotal := len(c.visited)
|
||||||
|
c.visitedMu.RUnlock()
|
||||||
|
c.updateCrawlStatus(func(cs *CrawlStatus) {
|
||||||
|
cs.VisitedTotal = visitedTotal
|
||||||
|
})
|
||||||
|
|
||||||
// 本轮没有发现新链接,爬取结束
|
// 本轮没有发现新链接,爬取结束
|
||||||
if len(newLinks) == 0 {
|
if len(newLinks) == 0 {
|
||||||
log.Println("[crawler] empty queue — stopping")
|
log.Println("[crawler] empty queue — stopping")
|
||||||
|
c.updateCrawlStatus(func(cs *CrawlStatus) {
|
||||||
|
cs.IsRunning = false
|
||||||
|
})
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// 调度算法:从候选 URL 中选出下一轮要抓取的队列
|
// 调度算法:从候选 URL 中选出下一轮要抓取的队列
|
||||||
queue = c.schedule(newLinks)
|
queue = c.schedule(newLinks)
|
||||||
}
|
}
|
||||||
|
// 所有轮次完成,更新状态
|
||||||
|
c.updateCrawlStatus(func(cs *CrawlStatus) {
|
||||||
|
cs.IsRunning = false
|
||||||
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// visitURLRaw 抓取 URL 的核心逻辑,提取标题、描述、正文、子链接。
|
// visitURLRaw 抓取 URL 的核心逻辑,提取标题、描述、正文、子链接。
|
||||||
@@ -1089,3 +1139,17 @@ func (c *Crawler) GetStats() Stats {
|
|||||||
KeywordsFetched: atomic.LoadInt64(&c.stats.KeywordsFetched),
|
KeywordsFetched: atomic.LoadInt64(&c.stats.KeywordsFetched),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// GetCrawlStatus 返回当前爬取状态(供前端监控)。
|
||||||
|
func (c *Crawler) GetCrawlStatus() CrawlStatus {
|
||||||
|
c.crawlStatusMu.RLock()
|
||||||
|
defer c.crawlStatusMu.RUnlock()
|
||||||
|
return c.crawlStatus
|
||||||
|
}
|
||||||
|
|
||||||
|
// updateCrawlStatus 更新爬取状态(内部使用)。
|
||||||
|
func (c *Crawler) updateCrawlStatus(fn func(*CrawlStatus)) {
|
||||||
|
c.crawlStatusMu.Lock()
|
||||||
|
defer c.crawlStatusMu.Unlock()
|
||||||
|
fn(&c.crawlStatus)
|
||||||
|
}
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
Vendored
-2
File diff suppressed because one or more lines are too long
Vendored
+2
File diff suppressed because one or more lines are too long
Vendored
+2
-2
@@ -5,8 +5,8 @@
|
|||||||
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
<title>SESE 爬取管理</title>
|
<title>SESE 爬取管理</title>
|
||||||
<script type="module" crossorigin src="/assets/index-UuqaCCT8.js"></script>
|
<script type="module" crossorigin src="/assets/index-Bk_Z2Kbw.js"></script>
|
||||||
<link rel="stylesheet" crossorigin href="/assets/index-Dr22_wUg.css">
|
<link rel="stylesheet" crossorigin href="/assets/index-vFVFq14O.css">
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div id="app"></div>
|
<div id="app"></div>
|
||||||
|
|||||||
+34
-1
@@ -66,6 +66,7 @@ type Server struct {
|
|||||||
// crawler 爬虫实例(用于 Priority URL 添加时立即触发爬取)
|
// crawler 爬虫实例(用于 Priority URL 添加时立即触发爬取)
|
||||||
crawler interface {
|
crawler interface {
|
||||||
TriggerPriorityCrawl(url string)
|
TriggerPriorityCrawl(url string)
|
||||||
|
GetCrawlStatus() crawler.CrawlStatus
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -95,9 +96,10 @@ func (s *Server) SetBacklinkRunner(r interface {
|
|||||||
s.backlinkRunner = r
|
s.backlinkRunner = r
|
||||||
}
|
}
|
||||||
|
|
||||||
// SetCrawler injects the crawler instance (used to trigger a crawl immediately when a Priority URL is added).
|
// SetCrawler injects the crawler instance (used to trigger a crawl immediately when a Priority URL is added, and to query crawl status).
|
||||||
func (s *Server) SetCrawler(c interface {
|
func (s *Server) SetCrawler(c interface {
|
||||||
TriggerPriorityCrawl(url string)
|
TriggerPriorityCrawl(url string)
|
||||||
|
GetCrawlStatus() crawler.CrawlStatus
|
||||||
}) {
|
}) {
|
||||||
s.crawler = c
|
s.crawler = c
|
||||||
}
|
}
|
||||||
@@ -132,6 +134,7 @@ func (s *Server) Handler() http.Handler {
|
|||||||
mux.HandleFunc("/admin/pending", s.handleAdminPending)
|
mux.HandleFunc("/admin/pending", s.handleAdminPending)
|
||||||
mux.HandleFunc("/admin/workers", s.handleAdminWorkers)
|
mux.HandleFunc("/admin/workers", s.handleAdminWorkers)
|
||||||
mux.HandleFunc("/admin/backlink", s.handleAdminBacklink)
|
mux.HandleFunc("/admin/backlink", s.handleAdminBacklink)
|
||||||
|
mux.HandleFunc("/admin/crawl/status", s.handleAdminCrawlStatus)
|
||||||
// 静态文件(SPA fallback)
|
// 静态文件(SPA fallback)
|
||||||
mux.Handle("/", spaHandler{dist: "dist"})
|
mux.Handle("/", spaHandler{dist: "dist"})
|
||||||
return mux
|
return mux
|
||||||
@@ -705,6 +708,36 @@ func (s *Server) handleAdminBacklink(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// handleAdminCrawlStatus 返回爬虫爬取状态。
|
||||||
|
// GET: 返回当前轮次、总轮数、队列长度、已完成数、已收录总数、是否运行中
|
||||||
|
func (s *Server) handleAdminCrawlStatus(w http.ResponseWriter, r *http.Request) {
|
||||||
|
w.Header().Set("Access-Control-Allow-Origin", "*")
|
||||||
|
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||||
|
|
||||||
|
if r.Method != http.MethodGet && r.Method != http.MethodOptions {
|
||||||
|
http.Error(w, `{"error":"method not allowed"}`, 405)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if r.Method == http.MethodOptions {
|
||||||
|
w.WriteHeader(204)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
if s.crawler == nil {
|
||||||
|
json.NewEncoder(w).Encode(crawler.CrawlStatus{
|
||||||
|
CurrentEpoch: 0,
|
||||||
|
MaxEpoch: 0,
|
||||||
|
QueueLength: 0,
|
||||||
|
CompletedCount: 0,
|
||||||
|
VisitedTotal: 0,
|
||||||
|
IsRunning: false,
|
||||||
|
})
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
json.NewEncoder(w).Encode(s.crawler.GetCrawlStatus())
|
||||||
|
}
|
||||||
|
|
||||||
// ---- 搜索处理器 ----
|
// ---- 搜索处理器 ----
|
||||||
|
|
||||||
// searchResponse 是搜索 API 的 JSON 响应结构。
|
// searchResponse 是搜索 API 的 JSON 响应结构。
|
||||||
|
|||||||
+1
-1
Submodule sese-engine-ui updated: 01dcc396f5...c64cdf9f4d
Reference in New Issue
Block a user