修改成redis
This commit is contained in:
+19
-19
@@ -51,11 +51,11 @@ const (
|
||||
|
||||
// Crawler 编排整个 BFS 爬取流程。
|
||||
type Crawler struct {
|
||||
fetcher *Fetcher // HTTP 抓取器(含 robots.txt 和限流)
|
||||
db *storage.DB // 持久化数据库
|
||||
analyzer *analyzer.Analyzer // 分词和关键词分析
|
||||
prosperMap map[string]float64 // 域名 → 反向链接繁荣值(来自 info 模块,越大越"有价值")
|
||||
stats Stats // 原子计数器
|
||||
fetcher *Fetcher // HTTP 抓取器(含 robots.txt 和限流)
|
||||
store *storage.RedisStoreV2 // 持久化存储
|
||||
analyzer *analyzer.Analyzer // 分词和关键词分析
|
||||
prosperMap map[string]float64 // 域名 → 反向链接繁荣值(来自 info 模块,越大越"有价值")
|
||||
stats Stats // 原子计数器
|
||||
|
||||
// visited 记录已访问的 URL 集合(跨 epoch 持久,启动时从 DB 预热)
|
||||
visited map[string]bool
|
||||
@@ -164,10 +164,10 @@ func DecrementPriorityLevel2Inflight(n int64) {
|
||||
|
||||
// New 创建一个 Crawler 实例。
|
||||
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
||||
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||
func New(store *storage.RedisStoreV2, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||
c := &Crawler{
|
||||
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
|
||||
db: db,
|
||||
store: store,
|
||||
analyzer: a,
|
||||
prosperMap: prosperMap,
|
||||
visited: make(map[string]bool),
|
||||
@@ -192,7 +192,7 @@ func (c *Crawler) warmVisited() {
|
||||
expired := 0
|
||||
maxAge := int64(config.RecrawlMaxAge())
|
||||
now := time.Now().Unix()
|
||||
_ = c.db.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
|
||||
_ = c.store.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
|
||||
if now-entry.Timestamp < maxAge {
|
||||
c.visited[u] = true // 未过期,仍然跳过
|
||||
count++
|
||||
@@ -221,7 +221,7 @@ func (c *Crawler) startRecrawlTicker() {
|
||||
removed := 0
|
||||
|
||||
c.visitedMu.Lock()
|
||||
_ = c.db.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
|
||||
_ = c.store.ForEachSnippet(func(u string, entry *storage.SnippetEntry) error {
|
||||
if removed >= batchSize {
|
||||
return fmt.Errorf("batch full") // 提前终止遍历
|
||||
}
|
||||
@@ -365,7 +365,7 @@ func (c *Crawler) priorityCrawlLoop(rawURL string, level int) {
|
||||
}()
|
||||
|
||||
// 标记 DB 中该 URL 为已访问,防止重启后再次被调度
|
||||
_ = c.db.MarkPriorityURLVisited(rawURL)
|
||||
_ = c.store.MarkPriorityURLVisited(rawURL)
|
||||
|
||||
// 两级都不限制子链接数量
|
||||
children := c.visitURLUnlimited(rawURL)
|
||||
@@ -441,7 +441,7 @@ func (c *Crawler) isVisited(url string) bool {
|
||||
// 将未访问的插入队列前端(prepend),已爬取的条目从存储中清除。
|
||||
// 返回本次插入队列的 URL 数量。
|
||||
func (c *Crawler) fetchAndApplyPriorityURLs(queue *[]string) int {
|
||||
entries, err := c.db.GetPriorityURLs()
|
||||
entries, err := c.store.GetPriorityURLs()
|
||||
if err != nil || len(entries) == 0 {
|
||||
return 0
|
||||
}
|
||||
@@ -449,14 +449,14 @@ func (c *Crawler) fetchAndApplyPriorityURLs(queue *[]string) int {
|
||||
added := 0
|
||||
for _, e := range entries {
|
||||
if c.isVisited(e.URL) {
|
||||
_ = c.db.RemovePriorityURL(e.URL)
|
||||
_ = c.store.RemovePriorityURL(e.URL)
|
||||
continue
|
||||
}
|
||||
*queue = append([]string{e.URL}, *queue...)
|
||||
added++
|
||||
}
|
||||
|
||||
_ = c.db.ClearVisitedPriorityURLs()
|
||||
_ = c.store.ClearVisitedPriorityURLs()
|
||||
return added
|
||||
}
|
||||
|
||||
@@ -713,7 +713,7 @@ func (c *Crawler) visitURLRaw(rawURL string, forceIndex bool) (title, desc, text
|
||||
|
||||
// 增量重爬检测:查询上次爬取的哈希,内容未变则跳过关键词提取
|
||||
isRecrawl := false
|
||||
oldEntry, _ := c.db.GetSnippet(res.FinalURL)
|
||||
oldEntry, _ := c.store.GetSnippet(res.FinalURL)
|
||||
if !forceIndex && oldEntry != nil && oldEntry.ContentHash != "" && oldEntry.ContentHash == contentHash {
|
||||
isRecrawl = true
|
||||
//log.Printf("[crawler] unchanged (recrawl skip): %s", res.FinalURL)
|
||||
@@ -721,7 +721,7 @@ func (c *Crawler) visitURLRaw(rawURL string, forceIndex bool) (title, desc, text
|
||||
|
||||
// 缓存 URL 摘要(仅对短 URL 缓存,防止超长 URL 浪费空间)
|
||||
if len(res.FinalURL) < 250 {
|
||||
_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
|
||||
_ = c.store.SetSnippet(res.FinalURL, &storage.SnippetEntry{
|
||||
Title: title,
|
||||
Description: truncate(desc, 256),
|
||||
Text: truncate(text, 256),
|
||||
@@ -756,7 +756,7 @@ func (c *Crawler) visitURLRaw(rawURL string, forceIndex bool) (title, desc, text
|
||||
if fromHost == "" {
|
||||
continue
|
||||
}
|
||||
_ = c.db.UpdateSiteInfo(fromHost, func(info *storage.SiteInfo) {
|
||||
_ = c.store.UpdateSiteInfo(fromHost, func(info *storage.SiteInfo) {
|
||||
if info.Redirects == nil {
|
||||
info.Redirects = make(map[string]string)
|
||||
}
|
||||
@@ -799,7 +799,7 @@ func (c *Crawler) updateSiteFailure(rawURL string) {
|
||||
if host == "" {
|
||||
return
|
||||
}
|
||||
_ = c.db.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
|
||||
_ = c.store.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
|
||||
if info.SuccessRate == nil {
|
||||
zero := 0.0
|
||||
info.SuccessRate = &zero
|
||||
@@ -831,7 +831,7 @@ func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc,
|
||||
}
|
||||
sampled := sampleStrings(external, 10)
|
||||
|
||||
_ = c.db.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
|
||||
_ = c.store.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
|
||||
// 访问计数 +1,更新最后访问时间
|
||||
info.VisitCount++
|
||||
info.LastVisitTime = now
|
||||
@@ -978,7 +978,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
||||
wg.Add(1)
|
||||
go func(host string) {
|
||||
defer wg.Done()
|
||||
info, _ := c.db.GetSiteInfo(host)
|
||||
info, _ := c.store.GetSiteInfo(host)
|
||||
mu.Lock()
|
||||
siteCache[host] = info
|
||||
mu.Unlock()
|
||||
|
||||
+9
-2
@@ -13,7 +13,8 @@ import (
|
||||
"sync" // 互斥锁(保护限流表和 robots.txt 缓存)
|
||||
"time" // 时间(限流间隔计算、robots.txt 缓存过期)
|
||||
|
||||
"golang.org/x/net/html/charset" // HTML 字符集自动检测(将各种编码转为 UTF-8)
|
||||
"golang.org/x/net/html/charset" // HTML 字符集自动检测(将各种编码转为 UTF-8)
|
||||
"golang.org/x/text/encoding/simplifiedchinese" // GBK → UTF-8 转换兜底
|
||||
)
|
||||
|
||||
// ErrCrawl 表示爬取过程中的预期错误(404、被 robots.txt 禁止、非 HTML 类型等)。
|
||||
@@ -341,11 +342,17 @@ func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error)
|
||||
// 使用 golang.org/x/net/html/charset 自动检测 HTML 编码并转为 UTF-8
|
||||
utf8Reader, err := charset.NewReader(reader, contentType)
|
||||
if err != nil {
|
||||
// 备选方案:直接以 UTF-8 读取(可能乱码但不崩溃)
|
||||
// charset 检测失败时,先读取原始字节,再尝试 GBK 兜底
|
||||
data, readErr := io.ReadAll(reader)
|
||||
if readErr != nil {
|
||||
return "", readErr
|
||||
}
|
||||
// 将 GBK 字节流转为 UTF-8 字符串
|
||||
utf8Bytes, convErr := simplifiedchinese.GBK.NewDecoder().Bytes(data)
|
||||
if convErr == nil {
|
||||
return string(utf8Bytes), nil
|
||||
}
|
||||
// 转换失败则返回原始字节(可能乱码但不崩溃)
|
||||
return string(data), nil
|
||||
}
|
||||
data, err := io.ReadAll(utf8Reader)
|
||||
|
||||
Reference in New Issue
Block a user