加固 sizeLimit 兜底
This commit is contained in:
@@ -43,6 +43,7 @@ type CrawlerConfig struct {
|
|||||||
MaxEpoch int `yaml:"max_epoch"`
|
MaxEpoch int `yaml:"max_epoch"`
|
||||||
ExpectedProsperRatio float64 `yaml:"expected_prosper_ratio"`
|
ExpectedProsperRatio float64 `yaml:"expected_prosper_ratio"`
|
||||||
EntryURL string `yaml:"entry_url"`
|
EntryURL string `yaml:"entry_url"`
|
||||||
|
MaxPageSize int `yaml:"max_page_size"` // 单个页面最大抓取字节数(0=不限,默认 5MB)
|
||||||
}
|
}
|
||||||
|
|
||||||
// SearchConfig 搜索结果排序权重配置
|
// SearchConfig 搜索结果排序权重配置
|
||||||
@@ -118,6 +119,7 @@ func GetDefaultConfig() Config {
|
|||||||
MaxEpoch: 100,
|
MaxEpoch: 100,
|
||||||
ExpectedProsperRatio: 0.6,
|
ExpectedProsperRatio: 0.6,
|
||||||
EntryURL: "https://zh.wikipedia.org/",
|
EntryURL: "https://zh.wikipedia.org/",
|
||||||
|
MaxPageSize: 5 * 1024 * 1024,
|
||||||
},
|
},
|
||||||
Search: SearchConfig{
|
Search: SearchConfig{
|
||||||
UseOnlineSnippet: true,
|
UseOnlineSnippet: true,
|
||||||
@@ -203,6 +205,9 @@ func ExpectedProsperRatio() float64 { return Global.Crawler.ExpectedProsperRatio
|
|||||||
// EntryURL 返回配置值
|
// EntryURL 返回配置值
|
||||||
// EntryURL returns the entry URL stored in Global.Crawler.EntryURL.
func EntryURL() string {
	entry := Global.Crawler.EntryURL
	return entry
}
|
// EntryURL returns the entry URL stored in Global.Crawler.EntryURL.
func EntryURL() string {
	entry := Global.Crawler.EntryURL
	return entry
}
|
||||||
|
|
||||||
|
// MaxPageSize 返回单个页面最大抓取字节数(0=不限)。
|
||||||
|
// MaxPageSize returns the per-page fetch size limit, in bytes, stored in
// Global.Crawler.MaxPageSize. Per the field's own comment, 0 means no limit.
func MaxPageSize() int {
	limit := Global.Crawler.MaxPageSize
	return limit
}
|
||||||
|
|
||||||
// UseOnlineSnippet 返回配置值
|
// UseOnlineSnippet 返回配置值
|
||||||
// UseOnlineSnippet returns the flag stored in Global.Search.UseOnlineSnippet.
func UseOnlineSnippet() bool {
	enabled := Global.Search.UseOnlineSnippet
	return enabled
}
|
// UseOnlineSnippet returns the flag stored in Global.Search.UseOnlineSnippet.
func UseOnlineSnippet() bool {
	enabled := Global.Search.UseOnlineSnippet
	return enabled
}
|
||||||
|
|
||||||
|
|||||||
+2
-2
@@ -204,8 +204,8 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
|
|||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func() {
|
go func() {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
// 礼貌模式抓取(遵守 robots.txt + 限流),不限制大小
|
// 礼貌模式抓取(遵守 robots.txt + 限流),限制页面大小防止内存爆炸
|
||||||
res, fetchErr = c.fetcher.fetchWithHistory(rawURL, true, fetchTimeout, 0)
|
res, fetchErr = c.fetcher.fetchWithHistory(rawURL, true, fetchTimeout, config.MaxPageSize())
|
||||||
}()
|
}()
|
||||||
waitCh := make(chan struct{})
|
waitCh := make(chan struct{})
|
||||||
go func() {
|
go func() {
|
||||||
|
|||||||
Reference in New Issue
Block a user