无法正常退出,但也能用
This commit is contained in:
+33
-6
@@ -162,14 +162,42 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
||||
}
|
||||
|
||||
// visitURL 抓取一个 URL,提取关键词、缓存摘要、更新网站元信息,返回页面中发现的子链接。
|
||||
func (c *Crawler) visitURL(rawURL string) []string {
|
||||
func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
|
||||
// recover 保护:防止任何模块(analyzer/storage/parser)的 panic 杀死 goroutine
|
||||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Printf("[crawler] visitURL panic recovered: url=%s error=%v", rawURL, r)
|
||||
}
|
||||
}()
|
||||
atomic.AddInt64(&c.stats.VisitedURLs, 1) // 计数器 +1
|
||||
|
||||
// 礼貌模式抓取(遵守 robots.txt + 限流),超时 10 秒,不限制大小
|
||||
res, err := c.fetcher.fetchWithHistory(rawURL, true, 10*time.Second, 0)
|
||||
if err != nil || res == nil {
|
||||
// 使用 sync.WaitGroup + select 实现硬超时包装器,
|
||||
// 确保即使 http.Client.Timeout 被某些底层操作忽略,goroutine 也不会永久阻塞。
|
||||
fetchTimeout := 30 * time.Second
|
||||
var res *FetchResult
|
||||
var fetchErr error
|
||||
var wg sync.WaitGroup
|
||||
wg.Add(1)
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
// 礼貌模式抓取(遵守 robots.txt + 限流),不限制大小
|
||||
res, fetchErr = c.fetcher.fetchWithHistory(rawURL, true, fetchTimeout, 0)
|
||||
}()
|
||||
waitCh := make(chan struct{})
|
||||
go func() {
|
||||
wg.Wait()
|
||||
close(waitCh)
|
||||
}()
|
||||
select {
|
||||
case <-waitCh:
|
||||
// fetch 正常返回(成功或错误)
|
||||
case <-time.After(fetchTimeout + 5*time.Second):
|
||||
log.Printf("[crawler] fetch timeout: %s", rawURL)
|
||||
}
|
||||
|
||||
if fetchErr != nil || res == nil {
|
||||
c.updateSiteFailure(rawURL) // 记录失败,更新该网站成功率
|
||||
return nil
|
||||
return
|
||||
}
|
||||
|
||||
atomic.AddInt64(&c.stats.SuccessURLs, 1) // 成功计数器 +1
|
||||
@@ -334,7 +362,6 @@ func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
|
||||
// 冷却结束,切换到半开,放行一个试探请求
|
||||
atomic.StoreInt32(&c.circuitState, circuitHalfOpen)
|
||||
atomic.StoreInt64(&c.circuitExpiry, now+int64(circuitCooldownSeconds))
|
||||
log.Println("[crawler] circuit: half-open, probing harvester")
|
||||
case circuitHalfOpen:
|
||||
if now < expiry {
|
||||
return // 半开冷却中,只放行第一个,其余跳过
|
||||
|
||||
Reference in New Issue
Block a user