fix 分词bug,添加重爬机制
This commit is contained in:
+83
-12
@@ -7,6 +7,7 @@ import (
|
||||
"context" // context 超时控制
|
||||
"encoding/json" // JSON 序列化(发送关键词数据到收获服务)
|
||||
"fmt" // 格式化(构造目标地址)
|
||||
"hash/fnv" // FNV 哈希(内容变化检测)
|
||||
"log" // 日志输出
|
||||
"math" // 数学运算(指数衰减、质量评分)
|
||||
"math/rand" // 随机数(加权采样、队列打乱)
|
||||
@@ -93,14 +94,58 @@ func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *C
|
||||
}
|
||||
|
||||
// warmVisited 从 DB 的 gate bucket 加载所有已缓存的 URL 到 visited set。
|
||||
// 超过 RecrawlMaxAge 的 URL 不加入 visited,使其可以被重新爬取。
|
||||
func (c *Crawler) warmVisited() {
|
||||
count := 0
|
||||
expired := 0
|
||||
maxAge := int64(config.RecrawlMaxAge())
|
||||
now := time.Now().Unix()
|
||||
_ = c.db.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
|
||||
c.visited[u] = true
|
||||
count++
|
||||
if now-entry.Timestamp < maxAge {
|
||||
c.visited[u] = true // 未过期,仍然跳过
|
||||
count++
|
||||
} else {
|
||||
expired++
|
||||
}
|
||||
return nil
|
||||
})
|
||||
log.Printf("[crawler] visited set warmed: %d URLs loaded", count)
|
||||
log.Printf("[crawler] visited set warmed: %d active, %d expired (eligible for recrawl)", count, expired)
|
||||
}
|
||||
|
||||
// startRecrawlTicker 启动后台定时任务,定期扫描并释放过期 URL 回到候选池。
|
||||
// 已过期的 URL 从 visited map 中移除,使其可以在后续 BFS 轮次中被重新发现和爬取。
|
||||
func (c *Crawler) startRecrawlTicker() {
|
||||
interval := config.RecrawlCheckInterval()
|
||||
if interval <= 0 {
|
||||
return // 未配置或禁用
|
||||
}
|
||||
go func() {
|
||||
ticker := time.NewTicker(time.Duration(interval) * time.Second)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
maxAge := int64(config.RecrawlMaxAge())
|
||||
batchSize := config.RecrawlBatchSize()
|
||||
now := time.Now().Unix()
|
||||
removed := 0
|
||||
|
||||
c.visitedMu.Lock()
|
||||
_ = c.db.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
|
||||
if removed >= batchSize {
|
||||
return fmt.Errorf("batch full") // 提前终止遍历
|
||||
}
|
||||
if now-entry.Timestamp >= maxAge && c.visited[u] {
|
||||
delete(c.visited, u)
|
||||
removed++
|
||||
}
|
||||
return nil
|
||||
})
|
||||
c.visitedMu.Unlock()
|
||||
|
||||
if removed > 0 {
|
||||
log.Printf("[crawler] recrawl ticker: released %d expired URLs back to pool", removed)
|
||||
}
|
||||
}
|
||||
}()
|
||||
}
|
||||
|
||||
// markVisited 将 URL 标记为已访问(线程安全)。
|
||||
@@ -154,6 +199,9 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
||||
c.markVisited(entryURL)
|
||||
queue := []string{entryURL}
|
||||
|
||||
// 启动后台重爬定时器:定期释放过期 URL 到候选池
|
||||
c.startRecrawlTicker()
|
||||
|
||||
for ep := 0; ep < maxEpoch; ep++ {
|
||||
// 每轮 epoch 从 config 读取最新 workers 值,支持运行时动态调整
|
||||
workers := config.CrawlerWorkers()
|
||||
@@ -263,6 +311,17 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
|
||||
// 解析 HTML:提取标题、描述、正文和所有超链接
|
||||
title, desc, text, hrefs := parser.ParseHTML(res.Body, res.FinalURL)
|
||||
|
||||
// 计算正文内容哈希(FNV-1a),用于增量重爬检测
|
||||
contentHash := fnvHash(text)
|
||||
|
||||
// 增量重爬检测:查询上次爬取的哈希,内容未变则跳过关键词提取
|
||||
isRecrawl := false
|
||||
oldEntry, _ := c.db.GetSnippet(res.FinalURL)
|
||||
if oldEntry != nil && oldEntry.ContentHash != "" && oldEntry.ContentHash == contentHash {
|
||||
isRecrawl = true
|
||||
log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL)
|
||||
}
|
||||
|
||||
// 缓存 URL 摘要(仅对短 URL 缓存,防止超长 URL 浪费空间)
|
||||
if len(res.FinalURL) < 250 {
|
||||
_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
|
||||
@@ -270,20 +329,24 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
|
||||
Description: truncate(desc, 256),
|
||||
Text: truncate(text, 256),
|
||||
Timestamp: time.Now().Unix(),
|
||||
ContentHash: contentHash,
|
||||
})
|
||||
}
|
||||
|
||||
// 关键词提取:将标题/描述/正文交给 analyzer 计算关键词权重
|
||||
kws := c.analyzer.Analyze(title, desc, text)
|
||||
if len(kws) > 0 {
|
||||
// 限制每个页面最多发送的关键词数量
|
||||
maxKws := config.MaxKeywordsPerPage()
|
||||
if len(kws) > maxKws {
|
||||
kws = kws[:maxKws]
|
||||
// 增量优化:如果内容未变化(重爬),跳过关键词提取和索引更新
|
||||
if !isRecrawl {
|
||||
kws := c.analyzer.Analyze(title, desc, text)
|
||||
if len(kws) > 0 {
|
||||
// 限制每个页面最多发送的关键词数量
|
||||
maxKws := config.MaxKeywordsPerPage()
|
||||
if len(kws) > maxKws {
|
||||
kws = kws[:maxKws]
|
||||
}
|
||||
atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
|
||||
// 异步发送到收获服务器写入倒排索引(不阻塞爬取流程)
|
||||
go c.sendToHarvester(res.FinalURL, kws)
|
||||
}
|
||||
atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
|
||||
// 异步发送到收获服务器写入倒排索引(不阻塞爬取流程)
|
||||
go c.sendToHarvester(res.FinalURL, kws)
|
||||
}
|
||||
|
||||
// 更新网站元信息(成功访问)
|
||||
@@ -671,6 +734,14 @@ func truncate(s string, n int) string {
|
||||
return s[:n]
|
||||
}
|
||||
|
||||
// fnvHash returns the 128-bit FNV-1a hash of s as a lowercase hexadecimal
// string (32 characters).
//
// It is used during incremental recrawls to detect whether a page's body text
// has changed since the previous crawl.
func fnvHash(s string) string {
	h := fnv.New128a()
	// hash.Hash documents that Write never returns an error.
	_, _ = h.Write([]byte(s))
	return fmt.Sprintf("%x", h.Sum(nil))
}
|
||||
|
||||
// sampleStrings 从字符串切片中随机不重复抽取 n 条。
|
||||
func sampleStrings(s []string, n int) []string {
|
||||
if len(s) <= n {
|
||||
|
||||
Reference in New Issue
Block a user