将存储后端修改成 Redis
This commit is contained in:
+19
-19
@@ -51,11 +51,11 @@ const (
|
||||
|
||||
// Crawler 编排整个 BFS 爬取流程。
|
||||
type Crawler struct {
|
||||
fetcher *Fetcher // HTTP 抓取器(含 robots.txt 和限流)
|
||||
db *storage.DB // 持久化数据库
|
||||
analyzer *analyzer.Analyzer // 分词和关键词分析
|
||||
prosperMap map[string]float64 // 域名 → 反向链接繁荣值(来自 info 模块,越大越"有价值")
|
||||
stats Stats // 原子计数器
|
||||
fetcher *Fetcher // HTTP 抓取器(含 robots.txt 和限流)
|
||||
store *storage.RedisStoreV2 // 持久化存储
|
||||
analyzer *analyzer.Analyzer // 分词和关键词分析
|
||||
prosperMap map[string]float64 // 域名 → 反向链接繁荣值(来自 info 模块,越大越"有价值")
|
||||
stats Stats // 原子计数器
|
||||
|
||||
// visited 记录已访问的 URL 集合(跨 epoch 持久,启动时从持久化存储预热)
|
||||
visited map[string]bool
|
||||
@@ -164,10 +164,10 @@ func DecrementPriorityLevel2Inflight(n int64) {
|
||||
|
||||
// New 创建一个 Crawler 实例。
|
||||
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
||||
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||
func New(store *storage.RedisStoreV2, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||
c := &Crawler{
|
||||
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
|
||||
db: db,
|
||||
store: store,
|
||||
analyzer: a,
|
||||
prosperMap: prosperMap,
|
||||
visited: make(map[string]bool),
|
||||
@@ -192,7 +192,7 @@ func (c *Crawler) warmVisited() {
|
||||
expired := 0
|
||||
maxAge := int64(config.RecrawlMaxAge())
|
||||
now := time.Now().Unix()
|
||||
_ = c.db.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
|
||||
_ = c.store.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
|
||||
if now-entry.Timestamp < maxAge {
|
||||
c.visited[u] = true // 未过期,仍然跳过
|
||||
count++
|
||||
@@ -221,7 +221,7 @@ func (c *Crawler) startRecrawlTicker() {
|
||||
removed := 0
|
||||
|
||||
c.visitedMu.Lock()
|
||||
_ = c.db.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
|
||||
_ = c.store.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
|
||||
if removed >= batchSize {
|
||||
return fmt.Errorf("batch full") // 提前终止遍历
|
||||
}
|
||||
@@ -365,7 +365,7 @@ func (c *Crawler) priorityCrawlLoop(rawURL string, level int) {
|
||||
}()
|
||||
|
||||
// 在持久化存储中标记该 URL 为已访问,防止重启后再次被调度
|
||||
_ = c.db.MarkPriorityURLVisited(rawURL)
|
||||
_ = c.store.MarkPriorityURLVisited(rawURL)
|
||||
|
||||
// 两级都不限制子链接数量
|
||||
children := c.visitURLUnlimited(rawURL)
|
||||
@@ -441,7 +441,7 @@ func (c *Crawler) isVisited(url string) bool {
|
||||
// 将未访问的插入队列前端(prepend),已爬取的条目从存储中清除。
|
||||
// 返回本次插入队列的 URL 数量。
|
||||
func (c *Crawler) fetchAndApplyPriorityURLs(queue *[]string) int {
|
||||
entries, err := c.db.GetPriorityURLs()
|
||||
entries, err := c.store.GetPriorityURLs()
|
||||
if err != nil || len(entries) == 0 {
|
||||
return 0
|
||||
}
|
||||
@@ -449,14 +449,14 @@ func (c *Crawler) fetchAndApplyPriorityURLs(queue *[]string) int {
|
||||
added := 0
|
||||
for _, e := range entries {
|
||||
if c.isVisited(e.URL) {
|
||||
_ = c.db.RemovePriorityURL(e.URL)
|
||||
_ = c.store.RemovePriorityURL(e.URL)
|
||||
continue
|
||||
}
|
||||
*queue = append([]string{e.URL}, *queue...)
|
||||
added++
|
||||
}
|
||||
|
||||
_ = c.db.ClearVisitedPriorityURLs()
|
||||
_ = c.store.ClearVisitedPriorityURLs()
|
||||
return added
|
||||
}
|
||||
|
||||
@@ -713,7 +713,7 @@ func (c *Crawler) visitURLRaw(rawURL string, forceIndex bool) (title, desc, text
|
||||
|
||||
// 增量重爬检测:查询上次爬取的哈希,内容未变则跳过关键词提取
|
||||
isRecrawl := false
|
||||
oldEntry, _ := c.db.GetSnippet(res.FinalURL)
|
||||
oldEntry, _ := c.store.GetSnippet(res.FinalURL)
|
||||
if !forceIndex && oldEntry != nil && oldEntry.ContentHash != "" && oldEntry.ContentHash == contentHash {
|
||||
isRecrawl = true
|
||||
//log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL)
|
||||
@@ -721,7 +721,7 @@ func (c *Crawler) visitURLRaw(rawURL string, forceIndex bool) (title, desc, text
|
||||
|
||||
// 缓存 URL 摘要(仅对短 URL 缓存,防止超长 URL 浪费空间)
|
||||
if len(res.FinalURL) < 250 {
|
||||
_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
|
||||
_ = c.store.SetSnippet(res.FinalURL, &storage.SnippetEntry{
|
||||
Title: title,
|
||||
Description: truncate(desc, 256),
|
||||
Text: truncate(text, 256),
|
||||
@@ -756,7 +756,7 @@ func (c *Crawler) visitURLRaw(rawURL string, forceIndex bool) (title, desc, text
|
||||
if fromHost == "" {
|
||||
continue
|
||||
}
|
||||
_ = c.db.UpdateSiteInfo(fromHost, func(info *storage.SiteInfo) {
|
||||
_ = c.store.UpdateSiteInfo(fromHost, func(info *storage.SiteInfo) {
|
||||
if info.Redirects == nil {
|
||||
info.Redirects = make(map[string]string)
|
||||
}
|
||||
@@ -799,7 +799,7 @@ func (c *Crawler) updateSiteFailure(rawURL string) {
|
||||
if host == "" {
|
||||
return
|
||||
}
|
||||
_ = c.db.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
|
||||
_ = c.store.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
|
||||
if info.SuccessRate == nil {
|
||||
zero := 0.0
|
||||
info.SuccessRate = &zero
|
||||
@@ -831,7 +831,7 @@ func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc,
|
||||
}
|
||||
sampled := sampleStrings(external, 10)
|
||||
|
||||
_ = c.db.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
|
||||
_ = c.store.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
|
||||
// 访问计数 +1,更新最后访问时间
|
||||
info.VisitCount++
|
||||
info.LastVisitTime = now
|
||||
@@ -978,7 +978,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
||||
wg.Add(1)
|
||||
go func(host string) {
|
||||
defer wg.Done()
|
||||
info, _ := c.db.GetSiteInfo(host)
|
||||
info, _ := c.store.GetSiteInfo(host)
|
||||
mu.Lock()
|
||||
siteCache[host] = info
|
||||
mu.Unlock()
|
||||
|
||||
Reference in New Issue
Block a user