diff --git a/config/config.go b/config/config.go
index 4325711..10762ac 100644
--- a/config/config.go
+++ b/config/config.go
@@ -2,56 +2,235 @@
 // config 包存放 sese-engine 的所有全局配置参数。
 package config
 
-// Index / storage limits
-// 索引 / 存储相关限制常量
-const (
-	MaxURLsPerKey       = 11000    // 每个索引词最多保存的 URL 数量上限
-	MaxSameDomainPerKey = 20       // 同一域名在每个索引词下最多出现的次数
-	BigCleanThreshold   = 10000000 // 内存中累计多少条索引后触发一次刷盘清理
-	MaxNewURLsPerKey    = 10000    // 每次刷盘时,每个索引词最多写入的新 URL 数量上限
-	MinURLsForNewKey    = 3        // 新索引词如果 URL 数少于该值则丢弃,不写入磁盘
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+
+	"gopkg.in/yaml.v3"
 )
 
-// Crawler settings
-// 爬虫行为相关配置
-const (
-	SpiderName           = "loli_spider"               // HTTP 请求的 User-Agent 标识
-	CrawlerCooldown      = 3                           // 同一主机相邻两次请求的最小间隔(秒),用于遵守 robots.txt 和避免被封
-	CrawlerWorkers       = 22                          // 爬虫并发 goroutine 数量
-	CrawlFocus           = 0.7                         // 域名集中度因子,越大越倾向在少量域名内深挖,越小越分散
-	MaxKeywordsPerPage   = 250                         // 单个页面最多提取的关键词数量
-	MaxEpoch             = 100                         // BFS 爬取的最大轮次上限
-	ExpectedProsperRatio = 0.6                         // 队列中预期"繁荣"域名(高反向链接)的占比,用于调度决策
-	EntryURL             = "https://zh.wikipedia.org/" // BFS 爬取的起始入口 URL
-)
+// Config is the complete configuration tree; each field is one top-level YAML section.
+type Config struct {
+	Index      IndexConfig      `yaml:"index"`
+	Crawler    CrawlerConfig    `yaml:"crawler"`
+	Search     SearchConfig     `yaml:"search"`
+	Backlink   BacklinkConfig   `yaml:"backlink"`
+	Storage    StorageConfig    `yaml:"storage"`
+	Prometheus PrometheusConfig `yaml:"prometheus"`
+}
 
-// Search / ranking weights
-// 搜索结果排序权重配置
-const (
-	UseOnlineSnippet     = true  // 是否在线抓取摘要(搜索时实时抓取页面补充摘要)
-	OnlineSnippetTimeout = 3     // 在线抓取摘要的超时时间(秒)
-	WeightDailyDecay     = 0.996 // 页面年龄的时间衰减因子(每天乘以此系数)
-	LanguageWeight       = 0.5   // 语种匹配权重:与查询语种一致时加分
-	ConsecutiveKeyWeight = 1.3   // 连续关键词命中权重:多词连续出现时加分
-	BacklinkWeight       = 1.0   // 反向链接权重:指向该 URL 的链接越多得分越高
-	SearchServerPort     = 80    // 搜索服务和收获服务的统一 HTTP 监听端口
-	FlushIntervalSeconds = 60    // 定期刷盘间隔(秒):将内存索引批量写入磁盘
-)
+// IndexConfig holds the index/storage limits (YAML section "index").
+type IndexConfig struct {
+	MaxURLsPerKey       int `yaml:"max_urls_per_key"`
+	MaxSameDomainPerKey int `yaml:"max_same_domain_per_key"`
+	BigCleanThreshold   int `yaml:"big_clean_threshold"`
+	MaxNewURLsPerKey    int `yaml:"max_new_urls_per_key"`
+	MinURLsForNewKey    int `yaml:"min_urls_for_new_key"`
+}
 
-// Backlink computation
-// 反向链接(PageRank 类)计算相关常量
-const (
-	BacklinkBaseline = 200000 // 反向链接得分归一化的除数(用于将原始链接数映射到 [0,1] 区间)
-)
+// CrawlerConfig holds the crawler behaviour settings (YAML section "crawler").
+type CrawlerConfig struct {
+	SpiderName           string  `yaml:"spider_name"`
+	Cooldown             int     `yaml:"cooldown"`
+	Workers              int     `yaml:"workers"`
+	CrawlFocus           float64 `yaml:"crawl_focus"`
+	MaxKeywordsPerPage   int     `yaml:"max_keywords_per_page"`
+	MaxEpoch             int     `yaml:"max_epoch"`
+	ExpectedProsperRatio float64 `yaml:"expected_prosper_ratio"`
+	EntryURL             string  `yaml:"entry_url"`
+}
 
-// Storage path (relative to process working directory)
-// 存储根目录路径,相对于进程启动时的工作目录
+// SearchConfig holds the search ranking weights and server settings (YAML section "search").
+type SearchConfig struct {
+	UseOnlineSnippet     bool    `yaml:"use_online_snippet"`
+	OnlineSnippetTimeout int     `yaml:"online_snippet_timeout"`
+	WeightDailyDecay     float64 `yaml:"weight_daily_decay"`
+	LanguageWeight       float64 `yaml:"language_weight"`
+	ConsecutiveKeyWeight float64 `yaml:"consecutive_key_weight"`
+	BacklinkWeight       float64 `yaml:"backlink_weight"`
+	ServerPort           int     `yaml:"server_port"`
+	FlushIntervalSeconds int     `yaml:"flush_interval_seconds"`
+}
+
+// BacklinkConfig holds the backlink computation settings (YAML section "backlink").
+type BacklinkConfig struct {
+	Baseline int `yaml:"baseline"`
+}
+
+// StorageConfig holds the storage settings (YAML section "storage").
+type StorageConfig struct {
+	Path string `yaml:"path"`
+}
+
+// PrometheusConfig holds the Prometheus metrics ports (YAML section "prometheus").
+type PrometheusConfig struct {
+	CrawlerPort  int `yaml:"crawler_port"`
+	BacklinkPort int `yaml:"backlink_port"`
+	SearchPort   int `yaml:"search_port"`
+}
+
+// Global is the process-wide configuration read by the accessors below. init
+// seeds it with GetDefaultConfig and Load replaces it; no locking is done here.
+var Global Config
+
+// Load reads the YAML file at configPath and installs the result as Global.
+//
+// Decoding starts from GetDefaultConfig, so any key the file omits keeps its
+// default value instead of silently becoming zero. Global is left untouched
+// when reading or parsing fails. The returned error wraps the underlying
+// cause, so callers can inspect it with errors.Is / errors.As.
+func Load(configPath string) error {
+	data, err := os.ReadFile(configPath)
+	if err != nil {
+		return fmt.Errorf("failed to read config file: %w", err)
+	}
+
+	// Seed with defaults: yaml.Unmarshal only assigns fields whose keys are
+	// present in the document.
+	cfg := GetDefaultConfig()
+	if err := yaml.Unmarshal(data, &cfg); err != nil {
+		return fmt.Errorf("failed to parse config file: %w", err)
+	}
+
+	Global = cfg
+	return nil
+}
+
+// LoadFromSavedata loads config.yml from the savedata directory, resolved
+// relative to the process working directory.
+func LoadFromSavedata() error {
+	return Load(filepath.Join("savedata", "config.yml"))
+}
+
+// GetDefaultConfig returns a Config populated with the built-in default values.
+func GetDefaultConfig() Config {
+	return Config{
+		Index: IndexConfig{
+			MaxURLsPerKey:       11000,
+			MaxSameDomainPerKey: 20,
+			BigCleanThreshold:   10000000,
+			MaxNewURLsPerKey:    10000,
+			MinURLsForNewKey:    3,
+		},
+		Crawler: CrawlerConfig{
+			SpiderName:           "loli_spider",
+			Cooldown:             3,
+			Workers:              22,
+			CrawlFocus:           0.7,
+			MaxKeywordsPerPage:   250,
+			MaxEpoch:             100,
+			ExpectedProsperRatio: 0.6,
+			EntryURL:             "https://zh.wikipedia.org/",
+		},
+		Search: SearchConfig{
+			UseOnlineSnippet:     true,
+			OnlineSnippetTimeout: 3,
+			WeightDailyDecay:     0.996,
+			LanguageWeight:       0.5,
+			ConsecutiveKeyWeight: 1.3,
+			BacklinkWeight:       1.0,
+			ServerPort:           8082,
+			FlushIntervalSeconds: 60,
+		},
+		Backlink: BacklinkConfig{
+			Baseline: 200000,
+		},
+		Storage: StorageConfig{
+			Path: "./savedata",
+		},
+		Prometheus: PrometheusConfig{
+			CrawlerPort:  14950,
+			BacklinkPort: 14952,
+			SearchPort:   14953,
+		},
+	}
+}
+
+// The accessors below replace the former package-level constants and read
+// from Global: built-in defaults until Load succeeds, loaded values after.
+
+func init() {
+	// Seed Global so the accessors are usable even if Load is never called.
+	Global = GetDefaultConfig()
+}
+
+// MaxURLsPerKey returns the maximum number of URLs stored per index key.
+func MaxURLsPerKey() int { return Global.Index.MaxURLsPerKey }
+
+// MaxSameDomainPerKey returns how many times one domain may appear under a single index key.
+func MaxSameDomainPerKey() int { return Global.Index.MaxSameDomainPerKey }
+
+// BigCleanThreshold returns the number of in-memory index rows that triggers a flush to disk.
+func BigCleanThreshold() int { return Global.Index.BigCleanThreshold }
+
+// MaxNewURLsPerKey returns the maximum number of new URLs written per index key in one flush.
+func MaxNewURLsPerKey() int { return Global.Index.MaxNewURLsPerKey }
+
+// MinURLsForNewKey returns the minimum URL count a new index key needs to be written to disk.
+func MinURLsForNewKey() int { return Global.Index.MinURLsForNewKey }
+
+// SpiderName returns the User-Agent string sent with HTTP requests.
+func SpiderName() string { return Global.Crawler.SpiderName }
+
+// CrawlerCooldown returns the minimum interval, in seconds, between requests to the same host.
+func CrawlerCooldown() int { return Global.Crawler.Cooldown }
+
+// CrawlerWorkers returns the number of concurrent crawler goroutines.
+func CrawlerWorkers() int { return Global.Crawler.Workers }
+
+// CrawlFocus returns the domain-concentration factor; larger values dig deeper into fewer domains.
+func CrawlFocus() float64 { return Global.Crawler.CrawlFocus }
+
+// MaxKeywordsPerPage returns the maximum number of keywords extracted from one page.
+func MaxKeywordsPerPage() int { return Global.Crawler.MaxKeywordsPerPage }
+
+// MaxEpoch returns the upper bound on BFS crawl rounds.
+func MaxEpoch() int { return Global.Crawler.MaxEpoch }
+
+// ExpectedProsperRatio returns the target share of prosperous (high-backlink) domains in the queue.
+func ExpectedProsperRatio() float64 { return Global.Crawler.ExpectedProsperRatio }
+
+// EntryURL returns the URL where the BFS crawl starts.
+func EntryURL() string { return Global.Crawler.EntryURL }
+
+// UseOnlineSnippet reports whether search may fetch pages live to build snippets.
+func UseOnlineSnippet() bool { return Global.Search.UseOnlineSnippet }
+
+// OnlineSnippetTimeout returns the timeout, in seconds, for live snippet fetches.
+func OnlineSnippetTimeout() int { return Global.Search.OnlineSnippetTimeout }
+
+// WeightDailyDecay returns the per-day multiplicative decay applied for page age.
+func WeightDailyDecay() float64 { return Global.Search.WeightDailyDecay }
+
+// LanguageWeight returns the weight of the language-match adjustment in ranking.
+func LanguageWeight() float64 { return Global.Search.LanguageWeight }
+
+// ConsecutiveKeyWeight returns the multiplier applied per consecutive keyword hit.
+func ConsecutiveKeyWeight() float64 { return Global.Search.ConsecutiveKeyWeight }
+
+// BacklinkWeight returns the weight of the backlink score in ranking.
+func BacklinkWeight() float64 { return Global.Search.BacklinkWeight }
+
+// SearchServerPort returns the HTTP port shared by the search and harvest services.
+func SearchServerPort() int { return Global.Search.ServerPort }
+
+// FlushIntervalSeconds returns the interval, in seconds, between periodic index flushes.
+func FlushIntervalSeconds() int { return Global.Search.FlushIntervalSeconds }
+
+// BacklinkBaseline returns the divisor used to normalize raw backlink counts.
+func BacklinkBaseline() int { return Global.Backlink.Baseline }
+
+// PromPortCrawler returns the Prometheus metrics port of the crawler module.
+func PromPortCrawler() int { return Global.Prometheus.CrawlerPort }
+
+// PromPortBacklink returns the Prometheus metrics port of the backlink module.
+func PromPortBacklink() int { return Global.Prometheus.BacklinkPort }
+
+// PromPortSearch returns the Prometheus metrics port of the search module.
+func PromPortSearch() int { return Global.Prometheus.SearchPort }
+
+// StoragePath is kept as a constant for backward compatibility. It is not
+// derived from Global.Storage.Path, so changing storage.path in the YAML
+// file does not affect it.
 const StoragePath = "./savedata"
-
-// Prometheus ports
-// 各模块 Prometheus 监控指标的 HTTP 端口
-const (
-	PromPortCrawler  = 14950 // 爬虫模块的 metrics 端口
-	PromPortBacklink = 14952 // 反向链接计算模块的 metrics 端口
-	PromPortSearch   = 14953 // 搜索服务(含收获功能)模块的 metrics 端口
-)
diff --git a/crawler/crawler.go b/crawler/crawler.go
index 6e2c73c..7a7bb49 100644 --- a/crawler/crawler.go +++ b/crawler/crawler.go @@ -60,7 +60,7 @@ type Crawler struct { // prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。 func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler { return &Crawler{ - fetcher: NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second), + fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second), db: db, analyzer: a, prosperMap: prosperMap, @@ -124,7 +124,7 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) { ) // 信号量:限制同时并发数不超过配置的工作线程数 - sem := make(chan struct{}, config.CrawlerWorkers) + sem := make(chan struct{}, config.CrawlerWorkers()) for _, u := range queue { wg.Add(1) sem <- struct{}{} // 获取一个令牌(阻塞直到有空闲槽位) @@ -219,8 +219,9 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) { kws := c.analyzer.Analyze(title, desc, text) if len(kws) > 0 { // 限制每个页面最多发送的关键词数量 - if len(kws) > config.MaxKeywordsPerPage { - kws = kws[:config.MaxKeywordsPerPage] + maxKws := config.MaxKeywordsPerPage() + if len(kws) > maxKws { + kws = kws[:maxKws] } atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws))) // 异步发送到收获服务器写入倒排索引(不阻塞爬取流程) @@ -383,7 +384,7 @@ func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() - req, err := http.NewRequestWithContext(ctx, "POST", fmt.Sprintf("http://localhost:%d/l", config.SearchServerPort), bytes.NewReader(data)) + req, err := http.NewRequestWithContext(ctx, "POST", fmt.Sprintf("http://localhost:%d/l", config.SearchServerPort()), bytes.NewReader(data)) if err != nil { return } @@ -454,7 +455,7 @@ func (c *Crawler) schedule(links []URLWeight) []string { selected := weightedSample(scored_list, k) // 域名集中度过滤:限制每个域名被选中的数量,防止被少数网站垄断 - selected = concentrationFilter(selected, config.CrawlFocus) + selected = concentrationFilter(selected, config.CrawlFocus()) // 分离 HTTPS 和 HTTP 链接,HTTP 
最多占 HTTPS 的 1/4 var httpsURLs, httpURLs []string @@ -480,7 +481,8 @@ func (c *Crawler) schedule(links []URLWeight) []string { } } // 根据目标繁荣占比计算普通 URL 应保留数量 - n := int(float64(len(prosperURLs)) * (1-config.ExpectedProsperRatio) / config.ExpectedProsperRatio) + expectedProsperRatio := config.ExpectedProsperRatio() + n := int(float64(len(prosperURLs)) * (1-expectedProsperRatio) / expectedProsperRatio) if len(otherURLs) > n { keep := max(len(otherURLs)-len(selected)/10, n) if keep < len(otherURLs) { diff --git a/go.mod b/go.mod index 8561871..98d531d 100644 --- a/go.mod +++ b/go.mod @@ -8,6 +8,7 @@ require ( github.com/yanyiwu/gojieba v1.4.4 go.etcd.io/bbolt v1.3.9 golang.org/x/net v0.23.0 + gopkg.in/yaml.v3 v3.0.1 ) require ( diff --git a/go.sum b/go.sum index d83cb28..08983f9 100644 --- a/go.sum +++ b/go.sum @@ -32,5 +32,7 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw= google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8= google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= +gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= diff --git a/main.go b/main.go index 9adf1c4..1ebbbff 100644 --- a/main.go +++ b/main.go @@ -103,16 +103,10 @@ prometheus: // loadConfig 从 savedata/config.yml 加载配置 func loadConfig() error { - configPath := filepath.Join("savedata", "config.yml") - - // 检查配置文件是否存在 - if _, err := os.Stat(configPath); os.IsNotExist(err) { - return fmt.Errorf("config file not found: %s", configPath) + if err := config.LoadFromSavedata(); err != nil { + return 
fmt.Errorf("failed to load config: %v", err) } - - // TODO: 解析 YAML 配置文件并应用到 config 包 - // 这里暂时只是检查文件存在,后续可以添加 YAML 解析逻辑 - log.Printf("Loading config from: %s", configPath) + log.Printf("Config loaded successfully from savedata/config.yml") return nil } @@ -130,8 +124,8 @@ func main() { // ---- 命令行参数 ---- // --storage:存储根目录路径,默认使用 config.StoragePath storageDir := flag.String("storage", config.StoragePath, "path to savedata directory") - // --entry:BFS 爬取的起始 URL,默认使用 config.EntryURL(维基百科中文首页) - entryURL := flag.String("entry", config.EntryURL, "BFS crawl entry URL") + // --entry:BFS 爬取的起始 URL,默认使用 config.EntryURL()(维基百科中文首页) + entryURL := flag.String("entry", config.EntryURL(), "BFS crawl entry URL") // --stopwords:屏蔽词 JSON 文件路径 stopWords := flag.String("stopwords", "../data/标点符号.json", "path to stop-words JSON") flag.Parse() @@ -161,7 +155,7 @@ func main() { // ---- 4. 搜索服务器(默认 :80):对外提供搜索 API,同时内嵌收获服务(统一端口) searchSrv := search.New(db, infoSvc, anal) go func() { - addr := fmt.Sprintf(":%d", config.SearchServerPort) + addr := fmt.Sprintf(":%d", config.SearchServerPort()) if err := searchSrv.ListenAndServe(addr); err != nil { log.Fatalf("[search] fatal: %v", err) } @@ -175,7 +169,7 @@ func main() { // 从 info 服务获取繁荣表快照,用于调度优先级决策 prosperMap := infoSvc.ProsperMap() crawl := crawler.New(db, anal, prosperMap) - go crawl.Run(*entryURL, config.MaxEpoch) + go crawl.Run(*entryURL, config.MaxEpoch()) log.Println("all modules started — press Ctrl-C to stop") diff --git a/search/server.go b/search/server.go index ed76a4c..0486577 100644 --- a/search/server.go +++ b/search/server.go @@ -50,7 +50,7 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server { analyzer: a, mem: make(map[string][]storage.IndexEntry), httpCli: &http.Client{ - Timeout: time.Duration(config.OnlineSnippetTimeout) * time.Second, + Timeout: time.Duration(config.OnlineSnippetTimeout()) * time.Second, }, } // 启动定期刷盘 goroutine @@ -60,7 +60,7 @@ func New(db *storage.DB, infoSvc 
*info.Service, a *analyzer.Analyzer) *Server { // runPeriodicFlush 每隔 FlushIntervalSeconds 秒触发一次刷盘。 func (s *Server) runPeriodicFlush() { - ticker := time.NewTicker(time.Duration(config.FlushIntervalSeconds) * time.Second) + ticker := time.NewTicker(time.Duration(config.FlushIntervalSeconds()) * time.Second) defer ticker.Stop() for range ticker.C { s.Flush() @@ -514,17 +514,18 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear defVal float64 // 缺省权重(词在索引中条目已满时使用) } tokenIndexes := make([]tokenIndex, 0, len(tokens)) + maxURLsPerKey := config.MaxURLsPerKey() for _, t := range tokens { entries, _ := s.db.GetIndex(t) // 计算缺省权重:当条目数达到上限时,权重低于第 MaxURLsPerKey 名的条目使用缺省权重 - defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(config.MaxURLsPerKey) - if len(entries) >= config.MaxURLsPerKey { + defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(maxURLsPerKey) + if len(entries) >= maxURLsPerKey { weights := make([]float64, len(entries)) for i, e := range entries { weights[i] = float64(e.Weight) } sort.Sort(sort.Reverse(sort.Float64Slice(weights))) - defVal = math.Max(1.0/10000, weights[config.MaxURLsPerKey-1]/2) + defVal = math.Max(1.0/10000, weights[maxURLsPerKey-1]/2) } tokenIndexes = append(tokenIndexes, tokenIndex{t, entries, defVal}) } @@ -576,7 +577,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear rel *= vp } // 反向链接繁荣加分 - prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight + prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight() bad := badURL(u) adjust := s.infoSvc.Adjust(netloc(u)) // 基础分数 = 相关性 × 繁荣值 × URL质量 × 人工调整 @@ -659,7 +660,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear repMul = 1 - (h - 0.5) } // 连续词出现越多,乘以 config.ConsecutiveKeyWeight(>1)加成 - consMul := math.Pow(config.ConsecutiveKeyWeight, float64(consecutive)) + consMul := math.Pow(config.ConsecutiveKeyWeight(), float64(consecutive)) candidates[i].scoreVec[0] *= 
repMul * consMul candidates[i].scoreVec[5] = repMul candidates[i].scoreVec[8] = consMul @@ -729,7 +730,7 @@ func (s *Server) getSnippet(rawURL string) *snippetInfo { snip := buildSnippet(entry) return snip } - if !config.UseOnlineSnippet { + if !config.UseOnlineSnippet() { return nil } // 在线抓取(不使用 robots.txt,适用于搜索摘要场景) @@ -737,7 +738,7 @@ func (s *Server) getSnippet(rawURL string) *snippetInfo { if err != nil { return nil } - req.Header.Set("User-Agent", config.SpiderName) + req.Header.Set("User-Agent", config.SpiderName()) resp, err := s.httpCli.Do(req) if err != nil || resp.StatusCode != 200 { return nil @@ -785,7 +786,8 @@ func languageMultiplier(si *storage.SiteInfo) float64 { } chinese := si.Languages["zh"] / total weird := (total - si.Languages["zh"] - si.Languages["en"] - si.Languages["ja"]) / total - return 1 + chinese*config.LanguageWeight - weird*config.LanguageWeight + languageWeight := config.LanguageWeight() + return 1 + chinese*languageWeight - weird*languageWeight } // timeMul 根据网站最后访问时间计算时间衰减倍数(越久远衰减越多)。 @@ -807,7 +809,7 @@ func timeMul(si *storage.SiteInfo, now int64) float64 { if days > 0 { days-- // 跳过第一天 } - return math.Pow(config.WeightDailyDecay, float64(days)) + return math.Pow(config.WeightDailyDecay(), float64(days)) } // urlTimeMul 根据该 URL 的摘要抓取时间计算时间衰减倍数(30 天内不衰减)。 @@ -820,7 +822,7 @@ func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 { if days <= 30 { return 1.0 } - return math.Pow((2+config.WeightDailyDecay)/3, float64(days)) + return math.Pow((2+config.WeightDailyDecay())/3, float64(days)) } // badURL 返回 URL 的"劣质"评分(0~0.9)。 @@ -1193,7 +1195,7 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) { atomic.AddInt64(&s.rowCount, 1) } s.memMu.Unlock() - if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold) { + if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold()) { go s.Flush() } w.Write([]byte("ok")) @@ -1208,14 +1210,15 @@ func (s *Server) handleFlush(w http.ResponseWriter, r 
*http.Request) { // lowThreshold 返回某关键词在已有大量条目时,新条目所需的最低权重阈值。 func (s *Server) lowThreshold(key string) float64 { existing, _ := s.db.GetIndex(key) - if len(existing) < config.MaxURLsPerKey { + maxURLsPerKey := config.MaxURLsPerKey() + if len(existing) < maxURLsPerKey { return -1 } weights := make([]float64, len(existing)) for i, e := range existing { weights[i] = float64(e.Weight) } - return nthLargest(weights, config.MaxURLsPerKey-1) * 0.05 + return nthLargest(weights, maxURLsPerKey-1) * 0.05 } // flush 将内存中的索引批量合并写入磁盘,然后清空内存。 @@ -1269,15 +1272,16 @@ func (s *Server) flush() { // mergeKey 将新条目和磁盘已有条目合并后返回最终列表。 func (s *Server) mergeKey(key string, newEntries []storage.IndexEntry) []storage.IndexEntry { existing, _ := s.db.GetIndex(key) - if len(existing) == 0 && len(newEntries) < config.MinURLsForNewKey { + if len(existing) == 0 && len(newEntries) < config.MinURLsForNewKey() { return nil } merged := dedup(append(newEntries, existing...)) if rand.Float64() < 0.02 { merged = dedupNormalised(merged) } - if float64(len(merged)) > float64(config.MaxURLsPerKey)*1.1 || rand.Float64() < 0.02 { - merged = trim(merged, s.infoSvc, config.MaxURLsPerKey, config.MaxSameDomainPerKey) + maxURLsPerKey := config.MaxURLsPerKey() + if float64(len(merged)) > float64(maxURLsPerKey)*1.1 || rand.Float64() < 0.02 { + merged = trim(merged, s.infoSvc, maxURLsPerKey, config.MaxSameDomainPerKey()) } return merged }