up
This commit is contained in:
+216
-47
@@ -2,56 +2,225 @@
|
||||
// Package config holds all global configuration parameters for sese-engine.
|
||||
package config
|
||||
|
||||
// Index / storage limits.
const (
	// MaxURLsPerKey is the upper bound on the number of URLs kept for one
	// index term.
	MaxURLsPerKey = 11000

	// MaxSameDomainPerKey is the maximum number of times a single domain may
	// appear under one index term.
	MaxSameDomainPerKey = 20

	// BigCleanThreshold is the number of index entries accumulated in memory
	// that triggers a flush-to-disk cleanup.
	BigCleanThreshold = 10000000

	// MaxNewURLsPerKey is the maximum number of new URLs written for one
	// index term on each flush.
	MaxNewURLsPerKey = 10000

	// MinURLsForNewKey is the minimum URL count for a new index term; terms
	// with fewer URLs are dropped and never written to disk.
	MinURLsForNewKey = 3
)
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// Crawler settings.
const (
	// SpiderName is the User-Agent identifier sent with HTTP requests.
	SpiderName = "loli_spider"

	// CrawlerCooldown is the minimum interval, in seconds, between two
	// consecutive requests to the same host. It exists to respect robots.txt
	// and to avoid getting blocked.
	CrawlerCooldown = 3

	// CrawlerWorkers is the number of concurrent crawler goroutines.
	CrawlerWorkers = 22

	// CrawlFocus is the domain concentration factor: larger values favour
	// digging deep into a few domains, smaller values spread the crawl out.
	CrawlFocus = 0.7

	// MaxKeywordsPerPage is the maximum number of keywords extracted from a
	// single page.
	MaxKeywordsPerPage = 250

	// MaxEpoch is the upper bound on the number of BFS crawl rounds.
	MaxEpoch = 100

	// ExpectedProsperRatio is the expected share of "prosperous"
	// (high-backlink) domains in the queue; it feeds scheduling decisions.
	ExpectedProsperRatio = 0.6

	// EntryURL is the starting URL of the BFS crawl.
	EntryURL = "https://zh.wikipedia.org/"
)
|
||||
// Config 是完整的配置结构体
|
||||
type Config struct {
|
||||
Index IndexConfig `yaml:"index"`
|
||||
Crawler CrawlerConfig `yaml:"crawler"`
|
||||
Search SearchConfig `yaml:"search"`
|
||||
Backlink BacklinkConfig `yaml:"backlink"`
|
||||
Storage StorageConfig `yaml:"storage"`
|
||||
Prometheus PrometheusConfig `yaml:"prometheus"`
|
||||
}
|
||||
|
||||
// Search / ranking weights.
const (
	// UseOnlineSnippet controls whether snippets are fetched online, i.e. the
	// page is fetched in real time during a search to supplement the snippet.
	UseOnlineSnippet = true

	// OnlineSnippetTimeout is the timeout, in seconds, for fetching a snippet
	// online.
	OnlineSnippetTimeout = 3

	// WeightDailyDecay is the time-decay factor for page age; the weight is
	// multiplied by it once per day.
	WeightDailyDecay = 0.996

	// LanguageWeight is the bonus applied when the page language matches the
	// query language.
	LanguageWeight = 0.5

	// ConsecutiveKeyWeight is the bonus applied when several keywords occur
	// consecutively.
	ConsecutiveKeyWeight = 1.3

	// BacklinkWeight is the backlink weight: the more links point at a URL,
	// the higher it scores.
	BacklinkWeight = 1.0

	// SearchServerPort is the shared HTTP listen port of the search service
	// and the harvest service.
	SearchServerPort = 80

	// FlushIntervalSeconds is the periodic flush interval, in seconds, at
	// which the in-memory index is batch-written to disk.
	FlushIntervalSeconds = 60
)
|
||||
// IndexConfig groups the index / storage limits (the "index" YAML section).
type IndexConfig struct {
	// MaxURLsPerKey caps the number of URLs kept for one index term.
	MaxURLsPerKey int `yaml:"max_urls_per_key"`

	// MaxSameDomainPerKey caps how often one domain may appear under one
	// index term.
	MaxSameDomainPerKey int `yaml:"max_same_domain_per_key"`

	// BigCleanThreshold is the number of in-memory index entries that
	// triggers a flush-to-disk cleanup.
	BigCleanThreshold int `yaml:"big_clean_threshold"`

	// MaxNewURLsPerKey caps the new URLs written per index term on each flush.
	MaxNewURLsPerKey int `yaml:"max_new_urls_per_key"`

	// MinURLsForNewKey is the minimum URL count a new index term needs in
	// order to be written to disk.
	MinURLsForNewKey int `yaml:"min_urls_for_new_key"`
}
|
||||
|
||||
// Backlink (PageRank-like) computation.
const (
	// BacklinkBaseline is the divisor used to normalise the backlink score,
	// mapping raw link counts into the [0,1] range.
	BacklinkBaseline = 200000
)
|
||||
// CrawlerConfig groups the crawler behaviour settings (the "crawler" YAML
// section).
type CrawlerConfig struct {
	// SpiderName is the User-Agent identifier sent with HTTP requests.
	SpiderName string `yaml:"spider_name"`

	// Cooldown is the minimum interval, in seconds, between two consecutive
	// requests to the same host.
	Cooldown int `yaml:"cooldown"`

	// Workers is the number of concurrent crawler goroutines.
	Workers int `yaml:"workers"`

	// CrawlFocus is the domain concentration factor.
	CrawlFocus float64 `yaml:"crawl_focus"`

	// MaxKeywordsPerPage caps the keywords extracted from a single page.
	MaxKeywordsPerPage int `yaml:"max_keywords_per_page"`

	// MaxEpoch caps the number of BFS crawl rounds.
	MaxEpoch int `yaml:"max_epoch"`

	// ExpectedProsperRatio is the expected share of "prosperous"
	// (high-backlink) domains in the queue.
	ExpectedProsperRatio float64 `yaml:"expected_prosper_ratio"`

	// EntryURL is the starting URL of the BFS crawl.
	EntryURL string `yaml:"entry_url"`
}
|
||||
|
||||
// Storage path (relative to process working directory)
|
||||
// 存储根目录路径,相对于进程启动时的工作目录
|
||||
// SearchConfig groups the search ranking weights and server settings (the
// "search" YAML section).
type SearchConfig struct {
	// UseOnlineSnippet controls whether snippets are fetched online at
	// search time.
	UseOnlineSnippet bool `yaml:"use_online_snippet"`

	// OnlineSnippetTimeout is the online snippet fetch timeout, in seconds.
	OnlineSnippetTimeout int `yaml:"online_snippet_timeout"`

	// WeightDailyDecay is the per-day time-decay factor for page age.
	WeightDailyDecay float64 `yaml:"weight_daily_decay"`

	// LanguageWeight is the language-match weight.
	LanguageWeight float64 `yaml:"language_weight"`

	// ConsecutiveKeyWeight is the weight for consecutive keyword hits.
	ConsecutiveKeyWeight float64 `yaml:"consecutive_key_weight"`

	// BacklinkWeight is the backlink weight.
	BacklinkWeight float64 `yaml:"backlink_weight"`

	// ServerPort is the HTTP listen port of the search server.
	ServerPort int `yaml:"server_port"`

	// FlushIntervalSeconds is the periodic flush interval, in seconds.
	FlushIntervalSeconds int `yaml:"flush_interval_seconds"`
}
|
||||
|
||||
// BacklinkConfig groups the backlink computation settings (the "backlink"
// YAML section).
type BacklinkConfig struct {
	// Baseline is the divisor used to normalise the backlink score.
	Baseline int `yaml:"baseline"`
}
|
||||
|
||||
// StorageConfig holds the storage settings (the "storage" YAML section).
type StorageConfig struct {
	// Path is the storage directory path.
	Path string `yaml:"path"`
}
|
||||
|
||||
// PrometheusConfig holds the Prometheus monitoring ports (the "prometheus"
// YAML section).
type PrometheusConfig struct {
	// CrawlerPort is the metrics port of the crawler module.
	CrawlerPort int `yaml:"crawler_port"`

	// BacklinkPort is the metrics port of the backlink module.
	BacklinkPort int `yaml:"backlink_port"`

	// SearchPort is the metrics port of the search module.
	SearchPort int `yaml:"search_port"`
}
|
||||
|
||||
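// Global is the package-wide configuration instance. init seeds it with the defaults and Load replaces it with the values read from a file.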
|
||||
var Global Config
|
||||
|
||||
// Load 从指定路径加载配置文件
|
||||
func Load(configPath string) error {
|
||||
data, err := os.ReadFile(configPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read config file: %v", err)
|
||||
}
|
||||
|
||||
var cfg Config
|
||||
if err := yaml.Unmarshal(data, &cfg); err != nil {
|
||||
return fmt.Errorf("failed to parse config file: %v", err)
|
||||
}
|
||||
|
||||
Global = cfg
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadFromSavedata 从 savedata 目录加载 config.yml
|
||||
func LoadFromSavedata() error {
|
||||
configPath := filepath.Join("savedata", "config.yml")
|
||||
return Load(configPath)
|
||||
}
|
||||
|
||||
// GetDefaultConfig 返回默认配置
|
||||
func GetDefaultConfig() Config {
|
||||
return Config{
|
||||
Index: IndexConfig{
|
||||
MaxURLsPerKey: 11000,
|
||||
MaxSameDomainPerKey: 20,
|
||||
BigCleanThreshold: 10000000,
|
||||
MaxNewURLsPerKey: 10000,
|
||||
MinURLsForNewKey: 3,
|
||||
},
|
||||
Crawler: CrawlerConfig{
|
||||
SpiderName: "loli_spider",
|
||||
Cooldown: 3,
|
||||
Workers: 22,
|
||||
CrawlFocus: 0.7,
|
||||
MaxKeywordsPerPage: 250,
|
||||
MaxEpoch: 100,
|
||||
ExpectedProsperRatio: 0.6,
|
||||
EntryURL: "https://zh.wikipedia.org/",
|
||||
},
|
||||
Search: SearchConfig{
|
||||
UseOnlineSnippet: true,
|
||||
OnlineSnippetTimeout: 3,
|
||||
WeightDailyDecay: 0.996,
|
||||
LanguageWeight: 0.5,
|
||||
ConsecutiveKeyWeight: 1.3,
|
||||
BacklinkWeight: 1.0,
|
||||
ServerPort: 8082,
|
||||
FlushIntervalSeconds: 60,
|
||||
},
|
||||
Backlink: BacklinkConfig{
|
||||
Baseline: 200000,
|
||||
},
|
||||
Storage: StorageConfig{
|
||||
Path: "./savedata",
|
||||
},
|
||||
Prometheus: PrometheusConfig{
|
||||
CrawlerPort: 14950,
|
||||
BacklinkPort: 14952,
|
||||
SearchPort: 14953,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
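// The accessor functions below exist for backward compatibility: each one
// returns the corresponding value from Global. After a successful call to
// Load they return the values loaded from the configuration file.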
|
||||
|
||||
func init() {
|
||||
// 初始化时设置默认值
|
||||
Global = GetDefaultConfig()
|
||||
}
|
||||
|
||||
// MaxURLsPerKey 返回配置值
|
||||
func MaxURLsPerKey() int { return Global.Index.MaxURLsPerKey }
|
||||
|
||||
// MaxSameDomainPerKey 返回配置值
|
||||
func MaxSameDomainPerKey() int { return Global.Index.MaxSameDomainPerKey }
|
||||
|
||||
// BigCleanThreshold 返回配置值
|
||||
func BigCleanThreshold() int { return Global.Index.BigCleanThreshold }
|
||||
|
||||
// MaxNewURLsPerKey 返回配置值
|
||||
func MaxNewURLsPerKey() int { return Global.Index.MaxNewURLsPerKey }
|
||||
|
||||
// MinURLsForNewKey 返回配置值
|
||||
func MinURLsForNewKey() int { return Global.Index.MinURLsForNewKey }
|
||||
|
||||
// SpiderName 返回配置值
|
||||
func SpiderName() string { return Global.Crawler.SpiderName }
|
||||
|
||||
// CrawlerCooldown 返回配置值
|
||||
func CrawlerCooldown() int { return Global.Crawler.Cooldown }
|
||||
|
||||
// CrawlerWorkers 返回配置值
|
||||
func CrawlerWorkers() int { return Global.Crawler.Workers }
|
||||
|
||||
// CrawlFocus 返回配置值
|
||||
func CrawlFocus() float64 { return Global.Crawler.CrawlFocus }
|
||||
|
||||
// MaxKeywordsPerPage 返回配置值
|
||||
func MaxKeywordsPerPage() int { return Global.Crawler.MaxKeywordsPerPage }
|
||||
|
||||
// MaxEpoch 返回配置值
|
||||
func MaxEpoch() int { return Global.Crawler.MaxEpoch }
|
||||
|
||||
// ExpectedProsperRatio 返回配置值
|
||||
func ExpectedProsperRatio() float64 { return Global.Crawler.ExpectedProsperRatio }
|
||||
|
||||
// EntryURL 返回配置值
|
||||
func EntryURL() string { return Global.Crawler.EntryURL }
|
||||
|
||||
// UseOnlineSnippet 返回配置值
|
||||
func UseOnlineSnippet() bool { return Global.Search.UseOnlineSnippet }
|
||||
|
||||
// OnlineSnippetTimeout 返回配置值
|
||||
func OnlineSnippetTimeout() int { return Global.Search.OnlineSnippetTimeout }
|
||||
|
||||
// WeightDailyDecay 返回配置值
|
||||
func WeightDailyDecay() float64 { return Global.Search.WeightDailyDecay }
|
||||
|
||||
// LanguageWeight 返回配置值
|
||||
func LanguageWeight() float64 { return Global.Search.LanguageWeight }
|
||||
|
||||
// ConsecutiveKeyWeight 返回配置值
|
||||
func ConsecutiveKeyWeight() float64 { return Global.Search.ConsecutiveKeyWeight }
|
||||
|
||||
// BacklinkWeight 返回配置值
|
||||
func BacklinkWeight() float64 { return Global.Search.BacklinkWeight }
|
||||
|
||||
// SearchServerPort 返回配置值
|
||||
func SearchServerPort() int { return Global.Search.ServerPort }
|
||||
|
||||
// FlushIntervalSeconds 返回配置值
|
||||
func FlushIntervalSeconds() int { return Global.Search.FlushIntervalSeconds }
|
||||
|
||||
// BacklinkBaseline 返回配置值
|
||||
func BacklinkBaseline() int { return Global.Backlink.Baseline }
|
||||
|
||||
// PromPortCrawler 返回配置值
|
||||
func PromPortCrawler() int { return Global.Prometheus.CrawlerPort }
|
||||
|
||||
// PromPortBacklink 返回配置值
|
||||
func PromPortBacklink() int { return Global.Prometheus.BacklinkPort }
|
||||
|
||||
// PromPortSearch 返回配置值
|
||||
func PromPortSearch() int { return Global.Prometheus.SearchPort }
|
||||
|
||||
// StoragePath is kept as a constant for backward compatibility.
//
// NOTE(review): this is a fixed literal and does not follow
// Global.Storage.Path; a storage path set in the config file is not reflected
// here. Confirm whether callers should read Global.Storage.Path instead.
const (
	StoragePath = "./savedata"
)
|
||||
|
||||
// Prometheus ports: the HTTP port on which each module exposes its metrics.
const (
	// PromPortCrawler is the metrics port of the crawler module.
	PromPortCrawler = 14950

	// PromPortBacklink is the metrics port of the backlink computation module.
	PromPortBacklink = 14952

	// PromPortSearch is the metrics port of the search service module (which
	// also includes the harvest function).
	PromPortSearch = 14953
)
|
||||
|
||||
Reference in New Issue
Block a user