子链接不再被 isVisited 过滤
This commit is contained in:
+9
-12
@@ -257,19 +257,16 @@ func (c *Crawler) runPriorityWorker() {
|
|||||||
// 直接调用 visitURLUnlimited,绕过队列调度和链接数限制
|
// 直接调用 visitURLUnlimited,绕过队列调度和链接数限制
|
||||||
hrefs := c.visitURLUnlimited(rawURL)
|
hrefs := c.visitURLUnlimited(rawURL)
|
||||||
|
|
||||||
// 将子链接加入优先队列(保持优先级)
|
// 将子链接加入优先队列(保持优先级)
|
||||||
if len(hrefs) > 0 {
|
// 注意:Priority URL 的子链接强制加入队列,即使已访问过也要重新爬取
|
||||||
c.priorityChildrenMu.Lock()
|
if len(hrefs) > 0 {
|
||||||
added := 0
|
c.priorityChildrenMu.Lock()
|
||||||
for _, child := range hrefs {
|
for _, child := range hrefs {
|
||||||
if !c.isVisited(child) {
|
c.priorityChildren = append(c.priorityChildren, child)
|
||||||
c.priorityChildren = append(c.priorityChildren, child)
|
|
||||||
added++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
IncrementPriorityChildren(int64(added))
|
|
||||||
c.priorityChildrenMu.Unlock()
|
|
||||||
}
|
}
|
||||||
|
IncrementPriorityChildren(int64(len(hrefs)))
|
||||||
|
c.priorityChildrenMu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
log.Printf("[crawler] priority crawl done: %s (%d child links)", rawURL, len(hrefs))
|
log.Printf("[crawler] priority crawl done: %s (%d child links)", rawURL, len(hrefs))
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user