// Package config holds all global configuration parameters for sese-engine. // config 包存放 sese-engine 的所有全局配置参数。 package config import ( "fmt" "os" "path/filepath" "gopkg.in/yaml.v3" ) // Config 是完整的配置结构体 type Config struct { Index IndexConfig `yaml:"index"` Crawler CrawlerConfig `yaml:"crawler"` Search SearchConfig `yaml:"search"` Backlink BacklinkConfig `yaml:"backlink"` Storage StorageConfig `yaml:"storage"` Prometheus PrometheusConfig `yaml:"prometheus"` } // IndexConfig 索引/存储相关限制 type IndexConfig struct { MaxURLsPerKey int `yaml:"max_urls_per_key"` MaxSameDomainPerKey int `yaml:"max_same_domain_per_key"` BigCleanThreshold int `yaml:"big_clean_threshold"` MaxNewURLsPerKey int `yaml:"max_new_urls_per_key"` MinURLsForNewKey int `yaml:"min_urls_for_new_key"` } // CrawlerConfig 爬虫行为相关配置 type CrawlerConfig struct { SpiderName string `yaml:"spider_name"` Cooldown int `yaml:"cooldown"` Workers int `yaml:"workers"` CrawlFocus float64 `yaml:"crawl_focus"` MaxKeywordsPerPage int `yaml:"max_keywords_per_page"` MaxEpoch int `yaml:"max_epoch"` ExpectedProsperRatio float64 `yaml:"expected_prosper_ratio"` EntryURL string `yaml:"entry_url"` } // SearchConfig 搜索结果排序权重配置 type SearchConfig struct { UseOnlineSnippet bool `yaml:"use_online_snippet"` OnlineSnippetTimeout int `yaml:"online_snippet_timeout"` WeightDailyDecay float64 `yaml:"weight_daily_decay"` LanguageWeight float64 `yaml:"language_weight"` ConsecutiveKeyWeight float64 `yaml:"consecutive_key_weight"` BacklinkWeight float64 `yaml:"backlink_weight"` ServerPort int `yaml:"server_port"` FlushIntervalSeconds int `yaml:"flush_interval_seconds"` } // BacklinkConfig 反向链接计算相关配置 type BacklinkConfig struct { Baseline int `yaml:"baseline"` } // StorageConfig 存储配置 type StorageConfig struct { Path string `yaml:"path"` } // PrometheusConfig Prometheus监控端口配置 type PrometheusConfig struct { CrawlerPort int `yaml:"crawler_port"` BacklinkPort int `yaml:"backlink_port"` SearchPort int `yaml:"search_port"` } // Global 全局配置实例,加载后可通过此变量访问 var Global Config // Load 从指定路径加载配置文件 func Load(configPath string) error { data, err := os.ReadFile(configPath) if err != nil { return fmt.Errorf("failed to read config file: %v", err) } var cfg Config if err := yaml.Unmarshal(data, &cfg); err != nil { return fmt.Errorf("failed to parse config file: %v", err) } Global = cfg return nil } // LoadFromSavedata 从 savedata 目录加载 config.yml func LoadFromSavedata() error { configPath := filepath.Join("savedata", "config.yml") return Load(configPath) } // GetDefaultConfig 返回默认配置 func GetDefaultConfig() Config { return Config{ Index: IndexConfig{ MaxURLsPerKey: 11000, MaxSameDomainPerKey: 20, BigCleanThreshold: 10000000, MaxNewURLsPerKey: 10000, MinURLsForNewKey: 3, }, Crawler: CrawlerConfig{ SpiderName: "loli_spider", Cooldown: 3, Workers: 22, CrawlFocus: 0.7, MaxKeywordsPerPage: 250, MaxEpoch: 100, ExpectedProsperRatio: 0.6, EntryURL: "https://zh.wikipedia.org/", }, Search: SearchConfig{ UseOnlineSnippet: true, OnlineSnippetTimeout: 3, WeightDailyDecay: 0.996, LanguageWeight: 0.5, ConsecutiveKeyWeight: 1.3, BacklinkWeight: 1.0, ServerPort: 8082, FlushIntervalSeconds: 60, }, Backlink: BacklinkConfig{ Baseline: 200000, }, Storage: StorageConfig{ Path: "./savedata", }, Prometheus: PrometheusConfig{ CrawlerPort: 14950, BacklinkPort: 14952, SearchPort: 14953, }, } } // 以下是向后兼容的常量定义,使用 Global 变量的值 // 在 Init() 被调用后,这些函数会返回加载的配置值 func init() { // 初始化时设置默认值 Global = GetDefaultConfig() } // MaxURLsPerKey 返回配置值 func MaxURLsPerKey() int { return Global.Index.MaxURLsPerKey } // MaxSameDomainPerKey 返回配置值 func MaxSameDomainPerKey() int { return Global.Index.MaxSameDomainPerKey } // BigCleanThreshold 返回配置值 func BigCleanThreshold() int { return Global.Index.BigCleanThreshold } // MaxNewURLsPerKey 返回配置值 func MaxNewURLsPerKey() int { return Global.Index.MaxNewURLsPerKey } // MinURLsForNewKey 返回配置值 func MinURLsForNewKey() int { return Global.Index.MinURLsForNewKey } // SpiderName 返回配置值 func SpiderName() string { return Global.Crawler.SpiderName } // CrawlerCooldown 返回配置值 func CrawlerCooldown() int { return Global.Crawler.Cooldown } // CrawlerWorkers 返回配置值 func CrawlerWorkers() int { return Global.Crawler.Workers } // CrawlFocus 返回配置值 func CrawlFocus() float64 { return Global.Crawler.CrawlFocus } // MaxKeywordsPerPage 返回配置值 func MaxKeywordsPerPage() int { return Global.Crawler.MaxKeywordsPerPage } // MaxEpoch 返回配置值 func MaxEpoch() int { return Global.Crawler.MaxEpoch } // ExpectedProsperRatio 返回配置值 func ExpectedProsperRatio() float64 { return Global.Crawler.ExpectedProsperRatio } // EntryURL 返回配置值 func EntryURL() string { return Global.Crawler.EntryURL } // UseOnlineSnippet 返回配置值 func UseOnlineSnippet() bool { return Global.Search.UseOnlineSnippet } // OnlineSnippetTimeout 返回配置值 func OnlineSnippetTimeout() int { return Global.Search.OnlineSnippetTimeout } // WeightDailyDecay 返回配置值 func WeightDailyDecay() float64 { return Global.Search.WeightDailyDecay } // LanguageWeight 返回配置值 func LanguageWeight() float64 { return Global.Search.LanguageWeight } // ConsecutiveKeyWeight 返回配置值 func ConsecutiveKeyWeight() float64 { return Global.Search.ConsecutiveKeyWeight } // BacklinkWeight 返回配置值 func BacklinkWeight() float64 { return Global.Search.BacklinkWeight } // SearchServerPort 返回配置值 func SearchServerPort() int { return Global.Search.ServerPort } // FlushIntervalSeconds 返回配置值 func FlushIntervalSeconds() int { return Global.Search.FlushIntervalSeconds } // BacklinkBaseline 返回配置值 func BacklinkBaseline() int { return Global.Backlink.Baseline } // PromPortCrawler 返回配置值 func PromPortCrawler() int { return Global.Prometheus.CrawlerPort } // PromPortBacklink 返回配置值 func PromPortBacklink() int { return Global.Prometheus.BacklinkPort } // PromPortSearch 返回配置值 func PromPortSearch() int { return Global.Prometheus.SearchPort } // 为了向后兼容,保留 StoragePath 常量 const StoragePath = "./savedata"