修复排序问题和初始入口问题
This commit is contained in:
Vendored
-7
File diff suppressed because one or more lines are too long
Vendored
+7
File diff suppressed because one or more lines are too long
Vendored
+1
-1
@@ -5,7 +5,7 @@
|
|||||||
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
<title>SESE 爬取管理</title>
|
<title>SESE 爬取管理</title>
|
||||||
<script type="module" crossorigin src="/assets/index-CbQDv6fc.js"></script>
|
<script type="module" crossorigin src="/assets/index-Df7HB0Xa.js"></script>
|
||||||
<link rel="stylesheet" crossorigin href="/assets/index-c8sW61xI.css">
|
<link rel="stylesheet" crossorigin href="/assets/index-c8sW61xI.css">
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
|
|||||||
@@ -58,14 +58,14 @@ index:
|
|||||||
|
|
||||||
# 爬虫行为相关配置
|
# 爬虫行为相关配置
|
||||||
crawler:
|
crawler:
|
||||||
spider_name: "loli_spider" # HTTP 请求的 User-Agent 标识
|
spider_name: "Haibara_AI_spider" # HTTP 请求的 User-Agent 标识
|
||||||
cooldown: 3 # 同一主机相邻两次请求的最小间隔(秒),用于遵守 robots.txt 和避免被封
|
cooldown: 3 # 同一主机相邻两次请求的最小间隔(秒),用于遵守 robots.txt 和避免被封
|
||||||
workers: 22 # 爬虫并发 goroutine 数量
|
workers: 22 # 爬虫并发 goroutine 数量
|
||||||
crawl_focus: 0.7 # 域名集中度因子,越大越倾向在少量域名内深挖,越小越分散
|
crawl_focus: 0.7 # 域名集中度因子,越大越倾向在少量域名内深挖,越小越分散
|
||||||
max_keywords_per_page: 250 # 单个页面最多提取的关键词数量
|
max_keywords_per_page: 250 # 单个页面最多提取的关键词数量
|
||||||
max_epoch: 100 # BFS 爬取的最大轮次上限
|
max_epoch: 100 # BFS 爬取的最大轮次上限
|
||||||
expected_prosper_ratio: 0.6 # 队列中预期"繁荣"域名(高反向链接)的占比,用于调度决策
|
expected_prosper_ratio: 0.6 # 队列中预期"繁荣"域名(高反向链接)的占比,用于调度决策
|
||||||
entry_url: "https://zh.wikipedia.org/" # BFS 爬取的起始入口 URL
|
entry_url: "https://haibara.ai/" # BFS 爬取的起始入口 URL
|
||||||
max_page_size: 5242880 # 单个页面最大抓取字节数(0=不限,默认 5MB)
|
max_page_size: 5242880 # 单个页面最大抓取字节数(0=不限,默认 5MB)
|
||||||
recrawl_max_age: 2592000 # URL 过期时间(秒),超过此时间的 URL 允许被重爬,默认 30 天
|
recrawl_max_age: 2592000 # URL 过期时间(秒),超过此时间的 URL 允许被重爬,默认 30 天
|
||||||
recrawl_check_interval: 3600 # 运行期间检查过期 URL 的间隔(秒),默认 1 小时
|
recrawl_check_interval: 3600 # 运行期间检查过期 URL 的间隔(秒),默认 1 小时
|
||||||
|
|||||||
+5
-5
@@ -122,7 +122,7 @@ func (c *urlKeywordsCache) ListAll() []*urlKeywordsEntry {
|
|||||||
return entries
|
return entries
|
||||||
}
|
}
|
||||||
|
|
||||||
// ListPage 返回分页缓存条目(按访问时间从旧到新,支持跳过头部条目)
|
// ListPage 返回分页缓存条目(按访问时间从新到旧,最新访问的在前)
|
||||||
func (c *urlKeywordsCache) ListPage(page, pageSize int) []*urlKeywordsEntry {
|
func (c *urlKeywordsCache) ListPage(page, pageSize int) []*urlKeywordsEntry {
|
||||||
c.mu.RLock()
|
c.mu.RLock()
|
||||||
defer c.mu.RUnlock()
|
defer c.mu.RUnlock()
|
||||||
@@ -131,16 +131,16 @@ func (c *urlKeywordsCache) ListPage(page, pageSize int) []*urlKeywordsEntry {
|
|||||||
if offset >= total {
|
if offset >= total {
|
||||||
return []*urlKeywordsEntry{}
|
return []*urlKeywordsEntry{}
|
||||||
}
|
}
|
||||||
// 遍历到起始位置
|
// 从 Back(最新)向前遍历,跳过 offset 个
|
||||||
elem := c.order.Front()
|
elem := c.order.Back()
|
||||||
for i := 0; i < offset && elem != nil; i++ {
|
for i := 0; i < offset && elem != nil; i++ {
|
||||||
elem = elem.Next()
|
elem = elem.Prev()
|
||||||
}
|
}
|
||||||
// 收集 pageSize 条
|
// 收集 pageSize 条
|
||||||
entries := make([]*urlKeywordsEntry, 0, pageSize)
|
entries := make([]*urlKeywordsEntry, 0, pageSize)
|
||||||
for i := 0; i < pageSize && elem != nil; i++ {
|
for i := 0; i < pageSize && elem != nil; i++ {
|
||||||
entries = append(entries, elem.Value.(*urlKeywordsEntry))
|
entries = append(entries, elem.Value.(*urlKeywordsEntry))
|
||||||
elem = elem.Next()
|
elem = elem.Prev()
|
||||||
}
|
}
|
||||||
return entries
|
return entries
|
||||||
}
|
}
|
||||||
|
|||||||
+1
-1
Submodule sese-engine-ui updated: ded160083f...5777561d9f
Reference in New Issue
Block a user