加固 sizeLimit 兜底

This commit is contained in:
2026-04-09 16:51:46 +08:00
parent 4137343fe7
commit 2ab89b39db
2 changed files with 7 additions and 2 deletions
+5
View File
@@ -43,6 +43,7 @@ type CrawlerConfig struct {
MaxEpoch int `yaml:"max_epoch"` MaxEpoch int `yaml:"max_epoch"`
ExpectedProsperRatio float64 `yaml:"expected_prosper_ratio"` ExpectedProsperRatio float64 `yaml:"expected_prosper_ratio"`
EntryURL string `yaml:"entry_url"` EntryURL string `yaml:"entry_url"`
MaxPageSize int `yaml:"max_page_size"` // 单个页面最大抓取字节数(0=不限,默认 5MB)
} }
// SearchConfig 搜索结果排序权重配置 // SearchConfig 搜索结果排序权重配置
@@ -118,6 +119,7 @@ func GetDefaultConfig() Config {
MaxEpoch: 100, MaxEpoch: 100,
ExpectedProsperRatio: 0.6, ExpectedProsperRatio: 0.6,
EntryURL: "https://zh.wikipedia.org/", EntryURL: "https://zh.wikipedia.org/",
MaxPageSize: 5 * 1024 * 1024,
}, },
Search: SearchConfig{ Search: SearchConfig{
UseOnlineSnippet: true, UseOnlineSnippet: true,
@@ -203,6 +205,9 @@ func ExpectedProsperRatio() float64 { return Global.Crawler.ExpectedProsperRatio
// EntryURL 返回配置值 // EntryURL 返回配置值
func EntryURL() string { return Global.Crawler.EntryURL } func EntryURL() string { return Global.Crawler.EntryURL }
// MaxPageSize reports the configured maximum number of bytes to fetch for a
// single page, as stored in Global.Crawler.MaxPageSize. A value of 0 means
// no limit.
func MaxPageSize() int {
	return Global.Crawler.MaxPageSize
}
// UseOnlineSnippet 返回配置值 // UseOnlineSnippet 返回配置值
func UseOnlineSnippet() bool { return Global.Search.UseOnlineSnippet } func UseOnlineSnippet() bool { return Global.Search.UseOnlineSnippet }
+2 -2
View File
@@ -204,8 +204,8 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
wg.Add(1) wg.Add(1)
go func() { go func() {
defer wg.Done() defer wg.Done()
// 礼貌模式抓取(遵守 robots.txt + 限流),限制大小 // 礼貌模式抓取(遵守 robots.txt + 限流),限制页面大小防止内存爆炸
res, fetchErr = c.fetcher.fetchWithHistory(rawURL, true, fetchTimeout, 0) res, fetchErr = c.fetcher.fetchWithHistory(rawURL, true, fetchTimeout, config.MaxPageSize())
}() }()
waitCh := make(chan struct{}) waitCh := make(chan struct{})
go func() { go func() {