将存储后端从 storage.DB 迁移到 Redis(storage.RedisStoreV2),并在字符集检测失败时增加 GBK 解码兜底

This commit is contained in:
2026-04-20 18:26:54 +08:00
parent e944a25e56
commit a9cb0b2481
17 changed files with 2408 additions and 933 deletions
+19 -19
View File
@@ -51,11 +51,11 @@ const (
// Crawler 编排整个 BFS 爬取流程。
type Crawler struct {
fetcher *Fetcher // HTTP 抓取器(含 robots.txt 和限流)
db *storage.DB // 持久化数据库
analyzer *analyzer.Analyzer // 分词和关键词分析
prosperMap map[string]float64 // 域名 → 反向链接繁荣值(来自 info 模块,越大越"有价值")
stats Stats // 原子计数器
fetcher *Fetcher // HTTP 抓取器(含 robots.txt 和限流)
store *storage.RedisStoreV2 // 持久化存储
analyzer *analyzer.Analyzer // 分词和关键词分析
prosperMap map[string]float64 // 域名 → 反向链接繁荣值(来自 info 模块,越大越"有价值")
stats Stats // 原子计数器
// visited 记录已访问的 URL 集合(跨 epoch 持久,启动时从 DB 预热)
visited map[string]bool
@@ -164,10 +164,10 @@ func DecrementPriorityLevel2Inflight(n int64) {
// New 创建一个 Crawler 实例。
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
func New(store *storage.RedisStoreV2, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
c := &Crawler{
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
db: db,
store: store,
analyzer: a,
prosperMap: prosperMap,
visited: make(map[string]bool),
@@ -192,7 +192,7 @@ func (c *Crawler) warmVisited() {
expired := 0
maxAge := int64(config.RecrawlMaxAge())
now := time.Now().Unix()
_ = c.db.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
_ = c.store.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
if now-entry.Timestamp < maxAge {
c.visited[u] = true // 未过期,仍然跳过
count++
@@ -221,7 +221,7 @@ func (c *Crawler) startRecrawlTicker() {
removed := 0
c.visitedMu.Lock()
_ = c.db.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
_ = c.store.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
if removed >= batchSize {
return fmt.Errorf("batch full") // 提前终止遍历
}
@@ -365,7 +365,7 @@ func (c *Crawler) priorityCrawlLoop(rawURL string, level int) {
}()
// 标记 DB 中该 URL 为已访问,防止重启后再次被调度
_ = c.db.MarkPriorityURLVisited(rawURL)
_ = c.store.MarkPriorityURLVisited(rawURL)
// 两级都不限制子链接数量
children := c.visitURLUnlimited(rawURL)
@@ -441,7 +441,7 @@ func (c *Crawler) isVisited(url string) bool {
// 将未访问的插入队列前端(prepend),已爬取的条目从存储中清除。
// 返回本次插入队列的 URL 数量。
func (c *Crawler) fetchAndApplyPriorityURLs(queue *[]string) int {
entries, err := c.db.GetPriorityURLs()
entries, err := c.store.GetPriorityURLs()
if err != nil || len(entries) == 0 {
return 0
}
@@ -449,14 +449,14 @@ func (c *Crawler) fetchAndApplyPriorityURLs(queue *[]string) int {
added := 0
for _, e := range entries {
if c.isVisited(e.URL) {
_ = c.db.RemovePriorityURL(e.URL)
_ = c.store.RemovePriorityURL(e.URL)
continue
}
*queue = append([]string{e.URL}, *queue...)
added++
}
_ = c.db.ClearVisitedPriorityURLs()
_ = c.store.ClearVisitedPriorityURLs()
return added
}
@@ -713,7 +713,7 @@ func (c *Crawler) visitURLRaw(rawURL string, forceIndex bool) (title, desc, text
// 增量重爬检测:查询上次爬取的哈希,内容未变则跳过关键词提取
isRecrawl := false
oldEntry, _ := c.db.GetSnippet(res.FinalURL)
oldEntry, _ := c.store.GetSnippet(res.FinalURL)
if !forceIndex && oldEntry != nil && oldEntry.ContentHash != "" && oldEntry.ContentHash == contentHash {
isRecrawl = true
//log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL)
@@ -721,7 +721,7 @@ func (c *Crawler) visitURLRaw(rawURL string, forceIndex bool) (title, desc, text
// 缓存 URL 摘要(仅对短 URL 缓存,防止超长 URL 浪费空间)
if len(res.FinalURL) < 250 {
_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
_ = c.store.SetSnippet(res.FinalURL, &storage.SnippetEntry{
Title: title,
Description: truncate(desc, 256),
Text: truncate(text, 256),
@@ -756,7 +756,7 @@ func (c *Crawler) visitURLRaw(rawURL string, forceIndex bool) (title, desc, text
if fromHost == "" {
continue
}
_ = c.db.UpdateSiteInfo(fromHost, func(info *storage.SiteInfo) {
_ = c.store.UpdateSiteInfo(fromHost, func(info *storage.SiteInfo) {
if info.Redirects == nil {
info.Redirects = make(map[string]string)
}
@@ -799,7 +799,7 @@ func (c *Crawler) updateSiteFailure(rawURL string) {
if host == "" {
return
}
_ = c.db.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
_ = c.store.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
if info.SuccessRate == nil {
zero := 0.0
info.SuccessRate = &zero
@@ -831,7 +831,7 @@ func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc,
}
sampled := sampleStrings(external, 10)
_ = c.db.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
_ = c.store.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
// 访问计数 +1,更新最后访问时间
info.VisitCount++
info.LastVisitTime = now
@@ -978,7 +978,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
wg.Add(1)
go func(host string) {
defer wg.Done()
info, _ := c.db.GetSiteInfo(host)
info, _ := c.store.GetSiteInfo(host)
mu.Lock()
siteCache[host] = info
mu.Unlock()
+9 -2
View File
@@ -13,7 +13,8 @@ import (
"sync" // 互斥锁(保护限流表和 robots.txt 缓存)
"time" // 时间(限流间隔计算、robots.txt 缓存过期)
"golang.org/x/net/html/charset" // HTML 字符集自动检测(将各种编码转为 UTF-8)
"golang.org/x/net/html/charset" // HTML 字符集自动检测(将各种编码转为 UTF-8)
"golang.org/x/text/encoding/simplifiedchinese" // GBK → UTF-8 转换兜底
)
// ErrCrawl 表示爬取过程中的预期错误(404、被 robots.txt 禁止、非 HTML 类型等)。
@@ -341,11 +342,17 @@ func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error)
// 使用 golang.org/x/net/html/charset 自动检测 HTML 编码并转为 UTF-8
utf8Reader, err := charset.NewReader(reader, contentType)
if err != nil {
// 备选方案:直接以 UTF-8 读取(可能乱码但不崩溃)
// charset 检测失败时,先读取原始字节,再尝试 GBK 兜底
data, readErr := io.ReadAll(reader)
if readErr != nil {
return "", readErr
}
// 将 GBK 字节流转为 UTF-8 字符串
utf8Bytes, convErr := simplifiedchinese.GBK.NewDecoder().Bytes(data)
if convErr == nil {
return string(utf8Bytes), nil
}
// 转换失败则返回原始字节(可能乱码但不崩溃)
return string(data), nil
}
data, err := io.ReadAll(utf8Reader)