From 1b88ca1efba20b75e25e208d79c21b8f292b225f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=90=B4=E6=96=87=E5=B3=B0?= Date: Fri, 10 Apr 2026 20:49:49 +0800 Subject: [PATCH] =?UTF-8?q?=E5=AD=90=E9=93=BE=E6=8E=A5=E4=B8=8D=E5=86=8D?= =?UTF-8?q?=E8=A2=AB=20isVisited=20=E8=BF=87=E6=BB=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- crawler/crawler.go | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/crawler/crawler.go b/crawler/crawler.go index 1d7842d..66b3dfb 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -257,19 +257,16 @@ func (c *Crawler) runPriorityWorker() { // 直接调用 visitURLUnlimited,绕过队列调度和链接数限制 hrefs := c.visitURLUnlimited(rawURL) - // 将子链接加入优先队列(保持优先级) - if len(hrefs) > 0 { - c.priorityChildrenMu.Lock() - added := 0 - for _, child := range hrefs { - if !c.isVisited(child) { - c.priorityChildren = append(c.priorityChildren, child) - added++ - } - } - IncrementPriorityChildren(int64(added)) - c.priorityChildrenMu.Unlock() + // 将子链接加入优先队列(保持优先级) + // 注意:Priority URL 的子链接强制加入队列,即使已访问过也要重新爬取 + if len(hrefs) > 0 { + c.priorityChildrenMu.Lock() + for _, child := range hrefs { + c.priorityChildren = append(c.priorityChildren, child) } + IncrementPriorityChildren(int64(len(hrefs))) + c.priorityChildrenMu.Unlock() + } log.Printf("[crawler] priority crawl done: %s (%d child links)", rawURL, len(hrefs))