up

2026-04-08 23:35:50 +08:00
parent 422a69397a
commit 7abcca6836
6 changed files with 257 additions and 85 deletions
@@ -60,7 +60,7 @@ type Crawler struct {
 // prosperMap 由 info 模块加载，传入域名繁荣值用于调度优先级计算。
 func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
 	return &Crawler{
-		fetcher:    NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second),
+		fetcher:    NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
 		db:         db,
 		analyzer:   a,
 		prosperMap: prosperMap,
@@ -124,7 +124,7 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
 		)

 		// 信号量：限制同时并发数不超过配置的工作线程数
-		sem := make(chan struct{}, config.CrawlerWorkers)
+		sem := make(chan struct{}, config.CrawlerWorkers())
 		for _, u := range queue {
 			wg.Add(1)
 			sem <- struct{}{} // 获取一个令牌（阻塞直到有空闲槽位）
@@ -219,8 +219,9 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
 	kws := c.analyzer.Analyze(title, desc, text)
 	if len(kws) > 0 {
 		// 限制每个页面最多发送的关键词数量
-		if len(kws) > config.MaxKeywordsPerPage {
-			kws = kws[:config.MaxKeywordsPerPage]
+		maxKws := config.MaxKeywordsPerPage()
+		if len(kws) > maxKws {
+			kws = kws[:maxKws]
 		}
 		atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
 		// 异步发送到收获服务器写入倒排索引（不阻塞爬取流程）
@@ -383,7 +384,7 @@ func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {

 	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
 	defer cancel()
-	req, err := http.NewRequestWithContext(ctx, "POST", fmt.Sprintf("http://localhost:%d/l", config.SearchServerPort), bytes.NewReader(data))
+	req, err := http.NewRequestWithContext(ctx, "POST", fmt.Sprintf("http://localhost:%d/l", config.SearchServerPort()), bytes.NewReader(data))
 	if err != nil {
 		return
 	}
@@ -454,7 +455,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
 	selected := weightedSample(scored_list, k)

 	// 域名集中度过滤：限制每个域名被选中的数量，防止被少数网站垄断
-	selected = concentrationFilter(selected, config.CrawlFocus)
+	selected = concentrationFilter(selected, config.CrawlFocus())

 	// 分离 HTTPS 和 HTTP 链接，HTTP 最多占 HTTPS 的 1/4
 	var httpsURLs, httpURLs []string
@@ -480,7 +481,8 @@ func (c *Crawler) schedule(links []URLWeight) []string {
 		}
 	}
 	// 根据目标繁荣占比计算普通 URL 应保留数量
-	n := int(float64(len(prosperURLs)) * (1-config.ExpectedProsperRatio) / config.ExpectedProsperRatio)
+	expectedProsperRatio := config.ExpectedProsperRatio()
+	n := int(float64(len(prosperURLs)) * (1-expectedProsperRatio) / expectedProsperRatio)
 	if len(otherURLs) > n {
 		keep := max(len(otherURLs)-len(selected)/10, n)
 		if keep < len(otherURLs) {