加固 sizeLimit 兜底
This commit is contained in:
@@ -43,6 +43,7 @@ type CrawlerConfig struct {
|
||||
MaxEpoch int `yaml:"max_epoch"`
|
||||
ExpectedProsperRatio float64 `yaml:"expected_prosper_ratio"`
|
||||
EntryURL string `yaml:"entry_url"`
|
||||
MaxPageSize int `yaml:"max_page_size"` // 单个页面最大抓取字节数(0=不限,默认 5MB)
|
||||
}
|
||||
|
||||
// SearchConfig 搜索结果排序权重配置
|
||||
@@ -118,6 +119,7 @@ func GetDefaultConfig() Config {
|
||||
MaxEpoch: 100,
|
||||
ExpectedProsperRatio: 0.6,
|
||||
EntryURL: "https://zh.wikipedia.org/",
|
||||
MaxPageSize: 5 * 1024 * 1024,
|
||||
},
|
||||
Search: SearchConfig{
|
||||
UseOnlineSnippet: true,
|
||||
@@ -203,6 +205,9 @@ func ExpectedProsperRatio() float64 { return Global.Crawler.ExpectedProsperRatio
|
||||
// EntryURL returns the crawler entry URL stored in the global configuration
// (Global.Crawler.EntryURL).
|
||||
func EntryURL() string { return Global.Crawler.EntryURL }
|
||||
|
||||
// MaxPageSize returns the maximum number of bytes to fetch for a single page,
// as stored in Global.Crawler.MaxPageSize (0 means no limit).
|
||||
func MaxPageSize() int { return Global.Crawler.MaxPageSize }
|
||||
|
||||
// UseOnlineSnippet returns the UseOnlineSnippet flag stored in the global
// search configuration (Global.Search.UseOnlineSnippet).
|
||||
func UseOnlineSnippet() bool { return Global.Search.UseOnlineSnippet }
|
||||
|
||||
|
||||
+2
-2
@@ -204,8 +204,8 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
// 礼貌模式抓取(遵守 robots.txt + 限流),不限制大小
|
||||
res, fetchErr = c.fetcher.fetchWithHistory(rawURL, true, fetchTimeout, 0)
|
||||
// 礼貌模式抓取(遵守 robots.txt + 限流),限制页面大小防止内存爆炸
|
||||
res, fetchErr = c.fetcher.fetchWithHistory(rawURL, true, fetchTimeout, config.MaxPageSize())
|
||||
}()
|
||||
waitCh := make(chan struct{})
|
||||
go func() {
|
||||
|
||||
Reference in New Issue
Block a user