From 2ab89b39db63a848ff761a9fc58cc676916a0491 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=B4=E6=96=87=E5=B3=B0?= Date: Thu, 9 Apr 2026 16:51:46 +0800 Subject: [PATCH] =?UTF-8?q?=E5=8A=A0=E5=9B=BAsizeLimit=20=E5=85=9C?= =?UTF-8?q?=E5=BA=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- config/config.go | 5 +++++ crawler/crawler.go | 4 ++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/config/config.go b/config/config.go index 7344706..ffb1f17 100644 --- a/config/config.go +++ b/config/config.go @@ -43,6 +43,7 @@ type CrawlerConfig struct { MaxEpoch int `yaml:"max_epoch"` ExpectedProsperRatio float64 `yaml:"expected_prosper_ratio"` EntryURL string `yaml:"entry_url"` + MaxPageSize int `yaml:"max_page_size"` // 单个页面最大抓取字节数(0=不限,默认 5MB) } // SearchConfig 搜索结果排序权重配置 @@ -118,6 +119,7 @@ func GetDefaultConfig() Config { MaxEpoch: 100, ExpectedProsperRatio: 0.6, EntryURL: "https://zh.wikipedia.org/", + MaxPageSize: 5 * 1024 * 1024, }, Search: SearchConfig{ UseOnlineSnippet: true, @@ -203,6 +205,9 @@ func ExpectedProsperRatio() float64 { return Global.Crawler.ExpectedProsperRatio // EntryURL 返回配置值 func EntryURL() string { return Global.Crawler.EntryURL } +// MaxPageSize 返回单个页面最大抓取字节数(0=不限)。 +func MaxPageSize() int { return Global.Crawler.MaxPageSize } + // UseOnlineSnippet 返回配置值 func UseOnlineSnippet() bool { return Global.Search.UseOnlineSnippet } diff --git a/crawler/crawler.go b/crawler/crawler.go index 9665765..85e70d6 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -204,8 +204,8 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) { wg.Add(1) go func() { defer wg.Done() - // 礼貌模式抓取(遵守 robots.txt + 限流),不限制大小 - res, fetchErr = c.fetcher.fetchWithHistory(rawURL, true, fetchTimeout, 0) + // 礼貌模式抓取(遵守 robots.txt + 限流),限制页面大小防止内存爆炸 + res, fetchErr = c.fetcher.fetchWithHistory(rawURL, true, fetchTimeout, config.MaxPageSize()) }() waitCh := make(chan struct{}) go func() {