diff --git a/crawler/crawler.go b/crawler/crawler.go index 1d7842d..66b3dfb 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -257,19 +257,16 @@ func (c *Crawler) runPriorityWorker() { // 直接调用 visitURLUnlimited,绕过队列调度和链接数限制 hrefs := c.visitURLUnlimited(rawURL) - // 将子链接加入优先队列(保持优先级) - if len(hrefs) > 0 { - c.priorityChildrenMu.Lock() - added := 0 - for _, child := range hrefs { - if !c.isVisited(child) { - c.priorityChildren = append(c.priorityChildren, child) - added++ - } - } - IncrementPriorityChildren(int64(added)) - c.priorityChildrenMu.Unlock() + // 将子链接加入优先队列(保持优先级) + // 注意:Priority URL 的子链接强制加入队列,即使已访问过也要重新爬取 + if len(hrefs) > 0 { + c.priorityChildrenMu.Lock() + for _, child := range hrefs { + c.priorityChildren = append(c.priorityChildren, child) } + IncrementPriorityChildren(int64(len(hrefs))) + c.priorityChildrenMu.Unlock() + } log.Printf("[crawler] priority crawl done: %s (%d child links)", rawURL, len(hrefs))