可修改线程
This commit is contained in:
@@ -54,6 +54,23 @@ type Crawler struct {
|
|||||||
circuitState int32 // circuitClosed | circuitOpen | circuitHalfOpen
|
circuitState int32 // circuitClosed | circuitOpen | circuitHalfOpen
|
||||||
circuitFailures int32 // 连续失败计数(atomic)
|
circuitFailures int32 // 连续失败计数(atomic)
|
||||||
circuitExpiry int64 // 熔断/半开截止 Unix 时间戳(秒)
|
circuitExpiry int64 // 熔断/半开截止 Unix 时间戳(秒)
|
||||||
|
|
||||||
|
// 运行时活跃线程计数(atomic,每轮 epoch 自动归零前重新开始计数)
|
||||||
|
activeWorkers int64
|
||||||
|
}
|
||||||
|
|
||||||
|
// 全局活跃线程计数器(跨包可读,无需持有 Crawler 引用)
|
||||||
|
var globalActiveWorkers int64
|
||||||
|
|
||||||
|
// ActiveWorkers 返回当前正在运行的爬虫 goroutine 数量。
|
||||||
|
// 也可通过包级函数 GlobalActiveWorkers() 读取(供 search 等外部包使用)。
|
||||||
|
func (c *Crawler) ActiveWorkers() int64 {
|
||||||
|
return atomic.LoadInt64(&c.activeWorkers)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GlobalActiveWorkers 返回当前全局活跃爬虫 goroutine 数量(包级,外部包可直接调用)。
|
||||||
|
func GlobalActiveWorkers() int64 {
|
||||||
|
return atomic.LoadInt64(&globalActiveWorkers)
|
||||||
}
|
}
|
||||||
|
|
||||||
// New 创建一个 Crawler 实例。
|
// New 创建一个 Crawler 实例。
|
||||||
@@ -131,9 +148,13 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
|||||||
for _, u := range queue {
|
for _, u := range queue {
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
sem <- struct{}{} // 获取一个令牌(阻塞直到有空闲槽位)
|
sem <- struct{}{} // 获取一个令牌(阻塞直到有空闲槽位)
|
||||||
|
atomic.AddInt64(&c.activeWorkers, 1)
|
||||||
|
atomic.AddInt64(&globalActiveWorkers, 1)
|
||||||
go func(rawURL string) {
|
go func(rawURL string) {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
defer func() { <-sem }() // 释放令牌
|
defer func() { <-sem }() // 释放令牌
|
||||||
|
defer atomic.AddInt64(&c.activeWorkers, -1)
|
||||||
|
defer atomic.AddInt64(&globalActiveWorkers, -1)
|
||||||
|
|
||||||
// 抓取单个 URL,返回发现的子链接
|
// 抓取单个 URL,返回发现的子链接
|
||||||
hrefs := c.visitURL(rawURL)
|
hrefs := c.visitURL(rawURL)
|
||||||
|
|||||||
Vendored
+2
File diff suppressed because one or more lines are too long
Vendored
-2
File diff suppressed because one or more lines are too long
Vendored
-6
File diff suppressed because one or more lines are too long
Vendored
-6
File diff suppressed because one or more lines are too long
Vendored
-2
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Vendored
-6
File diff suppressed because one or more lines are too long
Vendored
-6
File diff suppressed because one or more lines are too long
Vendored
-6
File diff suppressed because one or more lines are too long
Vendored
-2
File diff suppressed because one or more lines are too long
Vendored
-6
File diff suppressed because one or more lines are too long
Vendored
+2
-2
@@ -5,8 +5,8 @@
|
|||||||
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
<title>SESE 爬取管理</title>
|
<title>SESE 爬取管理</title>
|
||||||
<script type="module" crossorigin src="/assets/index-BomiJv32.js"></script>
|
<script type="module" crossorigin src="/assets/index-DXbMW8tX.js"></script>
|
||||||
<link rel="stylesheet" crossorigin href="/assets/index-Bj4UMEhQ.css">
|
<link rel="stylesheet" crossorigin href="/assets/index-Bbt6GbEM.css">
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div id="app"></div>
|
<div id="app"></div>
|
||||||
|
|||||||
+6
-2
@@ -23,6 +23,7 @@ import (
|
|||||||
|
|
||||||
"sese-engine/analyzer" // 分词和语言检测
|
"sese-engine/analyzer" // 分词和语言检测
|
||||||
"sese-engine/config" // 排序权重配置
|
"sese-engine/config" // 排序权重配置
|
||||||
|
"sese-engine/crawler" // 爬虫(读取活跃线程数)
|
||||||
"sese-engine/info" // info 服务
|
"sese-engine/info" // info 服务
|
||||||
"sese-engine/parser" // HTML 解析(在线摘要)
|
"sese-engine/parser" // HTML 解析(在线摘要)
|
||||||
"sese-engine/storage" // 持久化存储
|
"sese-engine/storage" // 持久化存储
|
||||||
@@ -410,7 +411,7 @@ func (s *Server) handleAdminPending(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// handleAdminWorkers 查看和动态调整爬虫并发线程数。
|
// handleAdminWorkers 查看和动态调整爬虫并发线程数。
|
||||||
// GET 返回当前 workers 值
|
// GET 返回 configured(设定值)和 active(实际运行中的 goroutine 数)
|
||||||
// POST {"workers": N} 动态修改(范围 1~500),下一轮 epoch 立即生效
|
// POST {"workers": N} 动态修改(范围 1~500),下一轮 epoch 立即生效
|
||||||
func (s *Server) handleAdminWorkers(w http.ResponseWriter, r *http.Request) {
|
func (s *Server) handleAdminWorkers(w http.ResponseWriter, r *http.Request) {
|
||||||
w.Header().Set("Access-Control-Allow-Origin", "*")
|
w.Header().Set("Access-Control-Allow-Origin", "*")
|
||||||
@@ -425,7 +426,10 @@ func (s *Server) handleAdminWorkers(w http.ResponseWriter, r *http.Request) {
|
|||||||
return
|
return
|
||||||
|
|
||||||
case http.MethodGet:
|
case http.MethodGet:
|
||||||
json.NewEncoder(w).Encode(map[string]int{"workers": config.CrawlerWorkers()})
|
json.NewEncoder(w).Encode(map[string]int64{
|
||||||
|
"configured": int64(config.CrawlerWorkers()),
|
||||||
|
"active": crawler.GlobalActiveWorkers(),
|
||||||
|
})
|
||||||
|
|
||||||
case http.MethodPost:
|
case http.MethodPost:
|
||||||
var body struct {
|
var body struct {
|
||||||
|
|||||||
Reference in New Issue
Block a user