diff --git a/config/config.go b/config/config.go index 7344706..ffb1f17 100644 --- a/config/config.go +++ b/config/config.go @@ -43,6 +43,7 @@ type CrawlerConfig struct { MaxEpoch int `yaml:"max_epoch"` ExpectedProsperRatio float64 `yaml:"expected_prosper_ratio"` EntryURL string `yaml:"entry_url"` + MaxPageSize int `yaml:"max_page_size"` // 单个页面最大抓取字节数(0=不限,默认 5MB) } // SearchConfig 搜索结果排序权重配置 @@ -118,6 +119,7 @@ func GetDefaultConfig() Config { MaxEpoch: 100, ExpectedProsperRatio: 0.6, EntryURL: "https://zh.wikipedia.org/", + MaxPageSize: 5 * 1024 * 1024, }, Search: SearchConfig{ UseOnlineSnippet: true, @@ -203,6 +205,9 @@ func ExpectedProsperRatio() float64 { return Global.Crawler.ExpectedProsperRatio // EntryURL 返回配置值 func EntryURL() string { return Global.Crawler.EntryURL } +// MaxPageSize 返回单个页面最大抓取字节数(0=不限)。 +func MaxPageSize() int { return Global.Crawler.MaxPageSize } + // UseOnlineSnippet 返回配置值 func UseOnlineSnippet() bool { return Global.Search.UseOnlineSnippet } diff --git a/crawler/crawler.go b/crawler/crawler.go index 9665765..85e70d6 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -204,8 +204,8 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) { wg.Add(1) go func() { defer wg.Done() - // 礼貌模式抓取(遵守 robots.txt + 限流),不限制大小 - res, fetchErr = c.fetcher.fetchWithHistory(rawURL, true, fetchTimeout, 0) + // 礼貌模式抓取(遵守 robots.txt + 限流),限制页面大小防止内存爆炸 + res, fetchErr = c.fetcher.fetchWithHistory(rawURL, true, fetchTimeout, config.MaxPageSize()) }() waitCh := make(chan struct{}) go func() {