将持久化存储修改成 Redis（storage.DB → storage.RedisStoreV2）

2026-04-20 18:26:54 +08:00
parent e944a25e56
commit a9cb0b2481
17 changed files with 2408 additions and 933 deletions
@@ -51,11 +51,11 @@ const (

 // Crawler 编排整个 BFS 爬取流程。
 type Crawler struct {
-	fetcher    *Fetcher           // HTTP 抓取器（含 robots.txt 和限流）
-	db         *storage.DB        // 持久化数据库
-	analyzer   *analyzer.Analyzer // 分词和关键词分析
-	prosperMap map[string]float64 // 域名 → 反向链接繁荣值（来自 info 模块，越大越"有价值"）
-	stats      Stats              // 原子计数器
+	fetcher    *Fetcher              // HTTP 抓取器（含 robots.txt 和限流）
+	store      *storage.RedisStoreV2 // 持久化存储
+	analyzer   *analyzer.Analyzer    // 分词和关键词分析
+	prosperMap map[string]float64   // 域名 → 反向链接繁荣值（来自 info 模块，越大越"有价值"）
+	stats      Stats                 // 原子计数器

 	// visited 记录已访问的 URL 集合（跨 epoch 持久，启动时从 DB 预热）
 	visited   map[string]bool
@@ -164,10 +164,10 @@ func DecrementPriorityLevel2Inflight(n int64) {

 // New 创建一个 Crawler 实例。
 // prosperMap 由 info 模块加载，传入域名繁荣值用于调度优先级计算。
-func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
+func New(store *storage.RedisStoreV2, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
 	c := &Crawler{
 		fetcher:        NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
-		db:             db,
+		store:          store,
 		analyzer:       a,
 		prosperMap:     prosperMap,
 		visited:        make(map[string]bool),
@@ -192,7 +192,7 @@ func (c *Crawler) warmVisited() {
 	expired := 0
 	maxAge := int64(config.RecrawlMaxAge())
 	now := time.Now().Unix()
-	_ = c.db.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
+	_ = c.store.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
 		if now-entry.Timestamp < maxAge {
 			c.visited[u] = true // 未过期，仍然跳过
 			count++
@@ -221,7 +221,7 @@ func (c *Crawler) startRecrawlTicker() {
 			removed := 0

 			c.visitedMu.Lock()
-			_ = c.db.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
+			_ = c.store.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
 				if removed >= batchSize {
 					return fmt.Errorf("batch full") // 提前终止遍历
 				}
@@ -365,7 +365,7 @@ func (c *Crawler) priorityCrawlLoop(rawURL string, level int) {
 	}()

 	// 标记 DB 中该 URL 为已访问，防止重启后再次被调度
-	_ = c.db.MarkPriorityURLVisited(rawURL)
+	_ = c.store.MarkPriorityURLVisited(rawURL)

 	// 两级都不限制子链接数量
 	children := c.visitURLUnlimited(rawURL)
@@ -441,7 +441,7 @@ func (c *Crawler) isVisited(url string) bool {
 // 将未访问的插入队列前端（prepend），已爬取的条目从存储中清除。
 // 返回本次插入队列的 URL 数量。
 func (c *Crawler) fetchAndApplyPriorityURLs(queue *[]string) int {
-	entries, err := c.db.GetPriorityURLs()
+	entries, err := c.store.GetPriorityURLs()
 	if err != nil || len(entries) == 0 {
 		return 0
 	}
@@ -449,14 +449,14 @@ func (c *Crawler) fetchAndApplyPriorityURLs(queue *[]string) int {
 	added := 0
 	for _, e := range entries {
 		if c.isVisited(e.URL) {
-			_ = c.db.RemovePriorityURL(e.URL)
+			_ = c.store.RemovePriorityURL(e.URL)
 			continue
 		}
 		*queue = append([]string{e.URL}, *queue...)
 		added++
 	}

-	_ = c.db.ClearVisitedPriorityURLs()
+	_ = c.store.ClearVisitedPriorityURLs()
 	return added
 }

@@ -713,7 +713,7 @@ func (c *Crawler) visitURLRaw(rawURL string, forceIndex bool) (title, desc, text

 	// 增量重爬检测：查询上次爬取的哈希，内容未变则跳过关键词提取
 	isRecrawl := false
-	oldEntry, _ := c.db.GetSnippet(res.FinalURL)
+	oldEntry, _ := c.store.GetSnippet(res.FinalURL)
 	if !forceIndex && oldEntry != nil && oldEntry.ContentHash != "" && oldEntry.ContentHash == contentHash {
 		isRecrawl = true
 		//log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL)
@@ -721,7 +721,7 @@ func (c *Crawler) visitURLRaw(rawURL string, forceIndex bool) (title, desc, text

 	// 缓存 URL 摘要（仅对短 URL 缓存，防止超长 URL 浪费空间）
 	if len(res.FinalURL) < 250 {
-		_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
+		_ = c.store.SetSnippet(res.FinalURL, &storage.SnippetEntry{
 			Title:       title,
 			Description: truncate(desc, 256),
 			Text:        truncate(text, 256),
@@ -756,7 +756,7 @@ func (c *Crawler) visitURLRaw(rawURL string, forceIndex bool) (title, desc, text
 		if fromHost == "" {
 			continue
 		}
-		_ = c.db.UpdateSiteInfo(fromHost, func(info *storage.SiteInfo) {
+		_ = c.store.UpdateSiteInfo(fromHost, func(info *storage.SiteInfo) {
 			if info.Redirects == nil {
 				info.Redirects = make(map[string]string)
 			}
@@ -799,7 +799,7 @@ func (c *Crawler) updateSiteFailure(rawURL string) {
 	if host == "" {
 		return
 	}
-	_ = c.db.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
+	_ = c.store.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
 		if info.SuccessRate == nil {
 			zero := 0.0
 			info.SuccessRate = &zero
@@ -831,7 +831,7 @@ func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc,
 	}
 	sampled := sampleStrings(external, 10)

-	_ = c.db.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
+	_ = c.store.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
 		// 访问计数 +1，更新最后访问时间
 		info.VisitCount++
 		info.LastVisitTime = now
@@ -978,7 +978,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
 		wg.Add(1)
 		go func(host string) {
 			defer wg.Done()
-			info, _ := c.db.GetSiteInfo(host)
+			info, _ := c.store.GetSiteInfo(host)
 			mu.Lock()
 			siteCache[host] = info
 			mu.Unlock()