修改成redis

2026-04-20 18:26:54 +08:00
parent e944a25e56
commit a9cb0b2481
17 changed files with 2408 additions and 933 deletions
@@ -51,11 +51,11 @@ const (

 // Crawler 编排整个 BFS 爬取流程。
 type Crawler struct {
-	fetcher    *Fetcher           // HTTP 抓取器（含 robots.txt 和限流）
-	db         *storage.DB        // 持久化数据库
-	analyzer   *analyzer.Analyzer // 分词和关键词分析
-	prosperMap map[string]float64 // 域名 → 反向链接繁荣值（来自 info 模块，越大越"有价值"）
-	stats      Stats              // 原子计数器
+	fetcher    *Fetcher              // HTTP 抓取器（含 robots.txt 和限流）
+	store      *storage.RedisStoreV2 // 持久化存储
+	analyzer   *analyzer.Analyzer    // 分词和关键词分析
+	prosperMap map[string]float64   // 域名 → 反向链接繁荣值（来自 info 模块，越大越"有价值"）
+	stats      Stats                 // 原子计数器

 	// visited 记录已访问的 URL 集合（跨 epoch 持久，启动时从 DB 预热）
 	visited   map[string]bool
@@ -164,10 +164,10 @@ func DecrementPriorityLevel2Inflight(n int64) {

 // New 创建一个 Crawler 实例。
 // prosperMap 由 info 模块加载，传入域名繁荣值用于调度优先级计算。
-func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
+func New(store *storage.RedisStoreV2, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
 	c := &Crawler{
 		fetcher:        NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
-		db:             db,
+		store:          store,
 		analyzer:       a,
 		prosperMap:     prosperMap,
 		visited:        make(map[string]bool),
@@ -192,7 +192,7 @@ func (c *Crawler) warmVisited() {
 	expired := 0
 	maxAge := int64(config.RecrawlMaxAge())
 	now := time.Now().Unix()
-	_ = c.db.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
+	_ = c.store.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
 		if now-entry.Timestamp < maxAge {
 			c.visited[u] = true // 未过期，仍然跳过
 			count++
@@ -221,7 +221,7 @@ func (c *Crawler) startRecrawlTicker() {
 			removed := 0

 			c.visitedMu.Lock()
-			_ = c.db.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
+			_ = c.store.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
 				if removed >= batchSize {
 					return fmt.Errorf("batch full") // 提前终止遍历
 				}
@@ -365,7 +365,7 @@ func (c *Crawler) priorityCrawlLoop(rawURL string, level int) {
 	}()

 	// 标记 DB 中该 URL 为已访问，防止重启后再次被调度
-	_ = c.db.MarkPriorityURLVisited(rawURL)
+	_ = c.store.MarkPriorityURLVisited(rawURL)

 	// 两级都不限制子链接数量
 	children := c.visitURLUnlimited(rawURL)
@@ -441,7 +441,7 @@ func (c *Crawler) isVisited(url string) bool {
 // 将未访问的插入队列前端（prepend），已爬取的条目从存储中清除。
 // 返回本次插入队列的 URL 数量。
 func (c *Crawler) fetchAndApplyPriorityURLs(queue *[]string) int {
-	entries, err := c.db.GetPriorityURLs()
+	entries, err := c.store.GetPriorityURLs()
 	if err != nil || len(entries) == 0 {
 		return 0
 	}
@@ -449,14 +449,14 @@ func (c *Crawler) fetchAndApplyPriorityURLs(queue *[]string) int {
 	added := 0
 	for _, e := range entries {
 		if c.isVisited(e.URL) {
-			_ = c.db.RemovePriorityURL(e.URL)
+			_ = c.store.RemovePriorityURL(e.URL)
 			continue
 		}
 		*queue = append([]string{e.URL}, *queue...)
 		added++
 	}

-	_ = c.db.ClearVisitedPriorityURLs()
+	_ = c.store.ClearVisitedPriorityURLs()
 	return added
 }

@@ -713,7 +713,7 @@ func (c *Crawler) visitURLRaw(rawURL string, forceIndex bool) (title, desc, text

 	// 增量重爬检测：查询上次爬取的哈希，内容未变则跳过关键词提取
 	isRecrawl := false
-	oldEntry, _ := c.db.GetSnippet(res.FinalURL)
+	oldEntry, _ := c.store.GetSnippet(res.FinalURL)
 	if !forceIndex && oldEntry != nil && oldEntry.ContentHash != "" && oldEntry.ContentHash == contentHash {
 		isRecrawl = true
 		//log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL)
@@ -721,7 +721,7 @@ func (c *Crawler) visitURLRaw(rawURL string, forceIndex bool) (title, desc, text

 	// 缓存 URL 摘要（仅对短 URL 缓存，防止超长 URL 浪费空间）
 	if len(res.FinalURL) < 250 {
-		_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
+		_ = c.store.SetSnippet(res.FinalURL, &storage.SnippetEntry{
 			Title:       title,
 			Description: truncate(desc, 256),
 			Text:        truncate(text, 256),
@@ -756,7 +756,7 @@ func (c *Crawler) visitURLRaw(rawURL string, forceIndex bool) (title, desc, text
 		if fromHost == "" {
 			continue
 		}
-		_ = c.db.UpdateSiteInfo(fromHost, func(info *storage.SiteInfo) {
+		_ = c.store.UpdateSiteInfo(fromHost, func(info *storage.SiteInfo) {
 			if info.Redirects == nil {
 				info.Redirects = make(map[string]string)
 			}
@@ -799,7 +799,7 @@ func (c *Crawler) updateSiteFailure(rawURL string) {
 	if host == "" {
 		return
 	}
-	_ = c.db.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
+	_ = c.store.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
 		if info.SuccessRate == nil {
 			zero := 0.0
 			info.SuccessRate = &zero
@@ -831,7 +831,7 @@ func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc,
 	}
 	sampled := sampleStrings(external, 10)

-	_ = c.db.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
+	_ = c.store.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
 		// 访问计数 +1，更新最后访问时间
 		info.VisitCount++
 		info.LastVisitTime = now
@@ -978,7 +978,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
 		wg.Add(1)
 		go func(host string) {
 			defer wg.Done()
-			info, _ := c.db.GetSiteInfo(host)
+			info, _ := c.store.GetSiteInfo(host)
 			mu.Lock()
 			siteCache[host] = info
 			mu.Unlock()
@@ -13,7 +13,8 @@ import (
 	"sync"      // 互斥锁（保护限流表和 robots.txt 缓存）
 	"time"      // 时间（限流间隔计算、robots.txt 缓存过期）

-	"golang.org/x/net/html/charset" // HTML 字符集自动检测（将各种编码转为 UTF-8）
+	"golang.org/x/net/html/charset"                  // HTML 字符集自动检测（将各种编码转为 UTF-8）
+	"golang.org/x/text/encoding/simplifiedchinese"  // GBK → UTF-8 转换兜底
 )

 // ErrCrawl 表示爬取过程中的预期错误（404、被 robots.txt 禁止、非 HTML 类型等）。
@@ -341,11 +342,17 @@ func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error)
 	// 使用 golang.org/x/net/html/charset 自动检测 HTML 编码并转为 UTF-8
 	utf8Reader, err := charset.NewReader(reader, contentType)
 	if err != nil {
-		// 备选方案：直接以 UTF-8 读取（可能乱码但不崩溃）
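+		// NOTE(review): charset.NewReader appears to return an error only when reading its preview bytes fails (e.g. an empty body), not when encoding detection fails — confirm; read whatever is left in the reader, then try GBK as a fallback.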
 		data, readErr := io.ReadAll(reader)
 		if readErr != nil {
 			return "", readErr
 		}
+		// 将 GBK 字节流转为 UTF-8 字符串
+		utf8Bytes, convErr := simplifiedchinese.GBK.NewDecoder().Bytes(data)
+		if convErr == nil {
+			return string(utf8Bytes), nil
+		}
+		// 转换失败则返回原始字节（可能乱码但不崩溃）
 		return string(data), nil
 	}
 	data, err := io.ReadAll(utf8Reader)