增加搜索功能

2026-04-08 19:04:15 +08:00
parent 1d3570a505
commit 6637dff254
3 changed files with 190 additions and 7 deletions
@@ -66,6 +66,29 @@ func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *C
 	}
 }

+// fetchAndApplyPriorityURLs loads the user-submitted priority URLs from the
+// database and places the ones that have not been visited yet at the front of
+// *queue, in the order GetPriorityURLs returned them. Entries whose URL is
+// already in visited are removed from the store instead.
+//
+// A priority URL that is already waiting in *queue is moved to the front
+// rather than queued a second time, and a URL listed more than once in the
+// store is applied once, so this call never introduces duplicates.
+//
+// Store errors are logged and otherwise ignored: priority handling is
+// best-effort and must not abort the crawl.
+//
+// It returns the number of priority URLs placed at the front of the queue.
+func (c *Crawler) fetchAndApplyPriorityURLs(visited map[string]bool, queue *[]string) int {
+	entries, err := c.db.GetPriorityURLs()
+	if err != nil {
+		log.Printf("[crawler] load priority URLs: %v", err)
+		return 0
+	}
+	if len(entries) == 0 {
+		return 0
+	}
+
+	// Collect the URLs to promote first, so the queue is rebuilt once instead
+	// of being re-copied for every entry.
+	front := make([]string, 0, len(entries))
+	promoted := make(map[string]struct{}, len(entries))
+	for _, e := range entries {
+		if visited[e.URL] {
+			// Already crawled: the request is fulfilled, drop it from the store.
+			if err := c.db.RemovePriorityURL(e.URL); err != nil {
+				log.Printf("[crawler] remove priority URL %q: %v", e.URL, err)
+			}
+			continue
+		}
+		if _, dup := promoted[e.URL]; dup {
+			continue
+		}
+		promoted[e.URL] = struct{}{}
+		front = append(front, e.URL)
+	}
+
+	if len(front) > 0 {
+		merged := make([]string, 0, len(front)+len(*queue))
+		merged = append(merged, front...)
+		for _, u := range *queue {
+			// Skip URLs that were just promoted; they are already at the front.
+			if _, moved := promoted[u]; !moved {
+				merged = append(merged, u)
+			}
+		}
+		*queue = merged
+	}
+
+	if err := c.db.ClearVisitedPriorityURLs(); err != nil {
+		log.Printf("[crawler] clear visited priority URLs: %v", err)
+	}
+	return len(front)
+}
+
 // URLWeight 将 URL 和发现权重打包在一起，用于调度决策。
 type URLWeight struct {
 	URL    string   // 待访问的 URL
@@ -74,12 +97,19 @@ type URLWeight struct {

 // Run 启动 BFS 爬取，从 entryURL 开始，执行最多 maxEpoch 轮。
 // 各轮之间是串行的，每轮内并发抓取，按调度算法选择下一轮 URL。
+// 每轮开始前会检查 priority 队列，优先爬取用户插入的 URL。
 func (c *Crawler) Run(entryURL string, maxEpoch int) {
 	visited := make(map[string]bool) // 已访问 URL 集合（防止重复抓取）
 	queue := []string{entryURL}      // 当前轮次的待抓取队列

 	for ep := 0; ep < maxEpoch; ep++ {
-		log.Printf("[crawler] epoch %d/%d  queue=%d", ep+1, maxEpoch, len(queue))
+		// 每轮开始前：拉取 priority URLs，插入队列前端
+		priorityAdded := c.fetchAndApplyPriorityURLs(visited, &queue)
+		if priorityAdded > 0 {
+			log.Printf("[crawler] epoch %d/%d  queue=%d (+%d priority)", ep+1, maxEpoch, len(queue), priorityAdded)
+		} else {
+			log.Printf("[crawler] epoch %d/%d  queue=%d", ep+1, maxEpoch, len(queue))
+		}
 		// 将本轮所有 URL 标记为已访问（防止下一轮重复入队）
 		for _, u := range queue {
 			visited[u] = true