加上中文注释
This commit is contained in:
+33
-25
@@ -1,53 +1,61 @@
|
||||
// Package config holds all global configuration parameters for sese-engine.
|
||||
// config 包存放 sese-engine 的所有全局配置参数。
|
||||
package config
|
||||
|
||||
// Index / storage limits
|
||||
// 索引 / 存储相关限制常量
|
||||
const (
|
||||
MaxURLsPerKey = 11000 // max URLs stored per index key
|
||||
MaxSameDomainPerKey = 20 // max URLs from the same domain per key
|
||||
BigCleanThreshold = 10000000 // flush in-memory index after this many rows
|
||||
MaxNewURLsPerKey = 10000 // cap on new URLs added per key per flush
|
||||
MinURLsForNewKey = 3 // discard new keys with fewer than this many URLs
|
||||
MaxURLsPerKey = 11000 // 每个索引词最多保存的 URL 数量上限
|
||||
MaxSameDomainPerKey = 20 // 同一域名在每个索引词下最多出现的次数
|
||||
BigCleanThreshold = 10000000 // 内存中累计多少条索引后触发一次刷盘清理
|
||||
MaxNewURLsPerKey = 10000 // 每次刷盘时,每个索引词最多写入的新 URL 数量上限
|
||||
MinURLsForNewKey = 3 // 新索引词如果 URL 数少于该值则丢弃,不写入磁盘
|
||||
)
|
||||
|
||||
// Crawler settings
|
||||
// 爬虫行为相关配置
|
||||
const (
|
||||
SpiderName = "loli_spider"
|
||||
CrawlerCooldown = 3 // seconds between requests to the same host
|
||||
CrawlerWorkers = 22 // goroutine pool size for crawling
|
||||
CrawlFocus = 0.7 // concentration factor — higher = more focused on single domain
|
||||
MaxKeywordsPerPage = 250
|
||||
MaxEpoch = 100
|
||||
ExpectedProsperRatio = 0.6 // fraction of queue that should be "prosperous" (high backlink) domains
|
||||
EntryURL = "https://zh.wikipedia.org/"
|
||||
SpiderName = "loli_spider" // HTTP 请求的 User-Agent 标识
|
||||
CrawlerCooldown = 3 // 同一主机相邻两次请求的最小间隔(秒),用于限制对单个主机的请求频率
|
||||
CrawlerWorkers = 22 // 爬虫并发 goroutine 数量
|
||||
CrawlFocus = 0.7 // 域名集中度因子,越大越倾向在少量域名内深挖,越小越分散
|
||||
MaxKeywordsPerPage = 250 // 单个页面最多提取的关键词数量
|
||||
MaxEpoch = 100 // BFS 爬取的最大轮次上限
|
||||
ExpectedProsperRatio = 0.6 // 队列中预期"繁荣"域名(高反向链接)的占比,用于调度决策
|
||||
EntryURL = "https://zh.wikipedia.org/" // BFS 爬取的起始入口 URL
|
||||
)
|
||||
|
||||
// Search / ranking weights
|
||||
// 搜索结果排序权重配置
|
||||
const (
|
||||
UseOnlineSnippet = true
|
||||
OnlineSnippetTimeout = 3 // seconds
|
||||
WeightDailyDecay = 0.996
|
||||
LanguageWeight = 0.5
|
||||
ConsecutiveKeyWeight = 1.3
|
||||
BacklinkWeight = 1.0
|
||||
SearchServerPort = 80
|
||||
UseOnlineSnippet = true // 是否在线抓取摘要(搜索时实时抓取页面补充摘要)
|
||||
OnlineSnippetTimeout = 3 // 在线抓取摘要的超时时间(秒)
|
||||
WeightDailyDecay = 0.996 // 页面年龄的时间衰减因子(每天乘以此系数)
|
||||
LanguageWeight = 0.5 // 语种匹配权重:与查询语种一致时加分
|
||||
ConsecutiveKeyWeight = 1.3 // 连续关键词命中权重:多词连续出现时加分
|
||||
BacklinkWeight = 1.0 // 反向链接权重:指向该 URL 的链接越多得分越高
|
||||
SearchServerPort = 80 // 搜索服务的 HTTP 监听端口
|
||||
)
|
||||
|
||||
// Backlink computation
|
||||
// 反向链接计算相关常量
|
||||
const (
|
||||
BacklinkBaseline = 200000 // normalization divisor for backlink scores
|
||||
BacklinkBaseline = 200000 // 反向链接得分归一化的除数(作为基准值对原始得分进行缩放)
|
||||
)
|
||||
|
||||
// Storage path (relative to process working directory)
|
||||
// 存储根目录路径,相对于进程启动时的工作目录
|
||||
const StoragePath = "./savedata"
|
||||
|
||||
// Prometheus ports
|
||||
// 各模块 Prometheus 监控指标的 HTTP 端口
|
||||
const (
|
||||
PromPortCrawler = 14950
|
||||
PromPortHarvester = 14951
|
||||
PromPortBacklink = 14952
|
||||
PromPortSearch = 14953
|
||||
PromPortCrawler = 14950 // 爬虫模块的 metrics 端口
|
||||
PromPortHarvester = 14951 // 收获服务器模块的 metrics 端口
|
||||
PromPortBacklink = 14952 // 反向链接计算模块的 metrics 端口
|
||||
PromPortSearch = 14953 // 搜索服务模块的 metrics 端口
|
||||
)
|
||||
|
||||
// Harvester HTTP endpoint
|
||||
// 爬虫向收获服务器发送索引数据的 HTTP 端点地址
|
||||
const HarvesterAddr = "http://127.0.0.1:5000"
|
||||
|
||||
Reference in New Issue
Block a user