up: read config values through accessor functions (config.X -> config.X()) in the crawler
This commit is contained in:
+9
-7
@@ -60,7 +60,7 @@ type Crawler struct {
|
||||
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
||||
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||
return &Crawler{
|
||||
fetcher: NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second),
|
||||
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
|
||||
db: db,
|
||||
analyzer: a,
|
||||
prosperMap: prosperMap,
|
||||
@@ -124,7 +124,7 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
||||
)
|
||||
|
||||
// 信号量:限制同时并发数不超过配置的工作线程数
|
||||
sem := make(chan struct{}, config.CrawlerWorkers)
|
||||
sem := make(chan struct{}, config.CrawlerWorkers())
|
||||
for _, u := range queue {
|
||||
wg.Add(1)
|
||||
sem <- struct{}{} // 获取一个令牌(阻塞直到有空闲槽位)
|
||||
@@ -219,8 +219,9 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
|
||||
kws := c.analyzer.Analyze(title, desc, text)
|
||||
if len(kws) > 0 {
|
||||
// 限制每个页面最多发送的关键词数量
|
||||
if len(kws) > config.MaxKeywordsPerPage {
|
||||
kws = kws[:config.MaxKeywordsPerPage]
|
||||
maxKws := config.MaxKeywordsPerPage()
|
||||
if len(kws) > maxKws {
|
||||
kws = kws[:maxKws]
|
||||
}
|
||||
atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
|
||||
// 异步发送到收获服务器写入倒排索引(不阻塞爬取流程)
|
||||
@@ -383,7 +384,7 @@ func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", fmt.Sprintf("http://localhost:%d/l", config.SearchServerPort), bytes.NewReader(data))
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", fmt.Sprintf("http://localhost:%d/l", config.SearchServerPort()), bytes.NewReader(data))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
@@ -454,7 +455,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
||||
selected := weightedSample(scored_list, k)
|
||||
|
||||
// 域名集中度过滤:限制每个域名被选中的数量,防止被少数网站垄断
|
||||
selected = concentrationFilter(selected, config.CrawlFocus)
|
||||
selected = concentrationFilter(selected, config.CrawlFocus())
|
||||
|
||||
// 分离 HTTPS 和 HTTP 链接,HTTP 最多占 HTTPS 的 1/4
|
||||
var httpsURLs, httpURLs []string
|
||||
@@ -480,7 +481,8 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
||||
}
|
||||
}
|
||||
// 根据目标繁荣占比计算普通 URL 应保留数量
|
||||
n := int(float64(len(prosperURLs)) * (1-config.ExpectedProsperRatio) / config.ExpectedProsperRatio)
|
||||
expectedProsperRatio := config.ExpectedProsperRatio()
|
||||
n := int(float64(len(prosperURLs)) * (1-expectedProsperRatio) / expectedProsperRatio)
|
||||
if len(otherURLs) > n {
|
||||
keep := max(len(otherURLs)-len(selected)/10, n)
|
||||
if keep < len(otherURLs) {
|
||||
|
||||
Reference in New Issue
Block a user