up
This commit is contained in:
+216
-47
@@ -2,56 +2,225 @@
|
|||||||
// config 包存放 sese-engine 的所有全局配置参数。
|
// config 包存放 sese-engine 的所有全局配置参数。
|
||||||
package config
|
package config
|
||||||
|
|
||||||
// Index / storage limits
|
import (
|
||||||
// 索引 / 存储相关限制常量
|
"fmt"
|
||||||
const (
|
"os"
|
||||||
MaxURLsPerKey = 11000 // 每个索引词最多保存的 URL 数量上限
|
"path/filepath"
|
||||||
MaxSameDomainPerKey = 20 // 同一域名在每个索引词下最多出现的次数
|
|
||||||
BigCleanThreshold = 10000000 // 内存中累计多少条索引后触发一次刷盘清理
|
"gopkg.in/yaml.v3"
|
||||||
MaxNewURLsPerKey = 10000 // 每次刷盘时,每个索引词最多写入的新 URL 数量上限
|
|
||||||
MinURLsForNewKey = 3 // 新索引词如果 URL 数少于该值则丢弃,不写入磁盘
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// Crawler settings
|
// Config 是完整的配置结构体
|
||||||
// 爬虫行为相关配置
|
type Config struct {
|
||||||
const (
|
Index IndexConfig `yaml:"index"`
|
||||||
SpiderName = "loli_spider" // HTTP 请求的 User-Agent 标识
|
Crawler CrawlerConfig `yaml:"crawler"`
|
||||||
CrawlerCooldown = 3 // 同一主机相邻两次请求的最小间隔(秒),用于遵守 robots.txt 和避免被封
|
Search SearchConfig `yaml:"search"`
|
||||||
CrawlerWorkers = 22 // 爬虫并发 goroutine 数量
|
Backlink BacklinkConfig `yaml:"backlink"`
|
||||||
CrawlFocus = 0.7 // 域名集中度因子,越大越倾向在少量域名内深挖,越小越分散
|
Storage StorageConfig `yaml:"storage"`
|
||||||
MaxKeywordsPerPage = 250 // 单个页面最多提取的关键词数量
|
Prometheus PrometheusConfig `yaml:"prometheus"`
|
||||||
MaxEpoch = 100 // BFS 爬取的最大轮次上限
|
}
|
||||||
ExpectedProsperRatio = 0.6 // 队列中预期"繁荣"域名(高反向链接)的占比,用于调度决策
|
|
||||||
EntryURL = "https://zh.wikipedia.org/" // BFS 爬取的起始入口 URL
|
|
||||||
)
|
|
||||||
|
|
||||||
// Search / ranking weights
|
// IndexConfig 索引/存储相关限制
|
||||||
// 搜索结果排序权重配置
|
type IndexConfig struct {
|
||||||
const (
|
MaxURLsPerKey int `yaml:"max_urls_per_key"`
|
||||||
UseOnlineSnippet = true // 是否在线抓取摘要(搜索时实时抓取页面补充摘要)
|
MaxSameDomainPerKey int `yaml:"max_same_domain_per_key"`
|
||||||
OnlineSnippetTimeout = 3 // 在线抓取摘要的超时时间(秒)
|
BigCleanThreshold int `yaml:"big_clean_threshold"`
|
||||||
WeightDailyDecay = 0.996 // 页面年龄的时间衰减因子(每天乘以此系数)
|
MaxNewURLsPerKey int `yaml:"max_new_urls_per_key"`
|
||||||
LanguageWeight = 0.5 // 语种匹配权重:与查询语种一致时加分
|
MinURLsForNewKey int `yaml:"min_urls_for_new_key"`
|
||||||
ConsecutiveKeyWeight = 1.3 // 连续关键词命中权重:多词连续出现时加分
|
}
|
||||||
BacklinkWeight = 1.0 // 反向链接权重:指向该 URL 的链接越多得分越高
|
|
||||||
SearchServerPort = 80 // 搜索服务和收获服务的统一 HTTP 监听端口
|
|
||||||
FlushIntervalSeconds = 60 // 定期刷盘间隔(秒):将内存索引批量写入磁盘
|
|
||||||
)
|
|
||||||
|
|
||||||
// Backlink computation
|
// CrawlerConfig 爬虫行为相关配置
|
||||||
// 反向链接(PageRank 类)计算相关常量
|
type CrawlerConfig struct {
|
||||||
const (
|
SpiderName string `yaml:"spider_name"`
|
||||||
BacklinkBaseline = 200000 // 反向链接得分归一化的除数(用于将原始链接数映射到 [0,1] 区间)
|
Cooldown int `yaml:"cooldown"`
|
||||||
)
|
Workers int `yaml:"workers"`
|
||||||
|
CrawlFocus float64 `yaml:"crawl_focus"`
|
||||||
|
MaxKeywordsPerPage int `yaml:"max_keywords_per_page"`
|
||||||
|
MaxEpoch int `yaml:"max_epoch"`
|
||||||
|
ExpectedProsperRatio float64 `yaml:"expected_prosper_ratio"`
|
||||||
|
EntryURL string `yaml:"entry_url"`
|
||||||
|
}
|
||||||
|
|
||||||
// Storage path (relative to process working directory)
|
// SearchConfig 搜索结果排序权重配置
|
||||||
// 存储根目录路径,相对于进程启动时的工作目录
|
// SearchConfig holds the search-ranking weights and search-server settings read from the "search" YAML section.
type SearchConfig struct {
|
||||||
|
UseOnlineSnippet bool `yaml:"use_online_snippet"` // whether a missing snippet may be fetched online at query time
|
||||||
|
OnlineSnippetTimeout int `yaml:"online_snippet_timeout"` // timeout for the online snippet fetch, in seconds
|
||||||
|
WeightDailyDecay float64 `yaml:"weight_daily_decay"` // per-day decay factor applied for page age
|
||||||
|
LanguageWeight float64 `yaml:"language_weight"` // weight of the language-match adjustment
|
||||||
|
ConsecutiveKeyWeight float64 `yaml:"consecutive_key_weight"` // multiplier base for consecutive keyword hits
|
||||||
|
BacklinkWeight float64 `yaml:"backlink_weight"` // weight of the backlink ("prosper") score
|
||||||
|
ServerPort int `yaml:"server_port"` // HTTP listen port shared by the search and harvest services
|
||||||
|
FlushIntervalSeconds int `yaml:"flush_interval_seconds"` // interval between periodic index flushes, in seconds
|
}
|
||||||
|
|
||||||
|
// BacklinkConfig holds the settings for the backlink computation ("backlink" YAML section).
|
||||||
|
type BacklinkConfig struct {
|
||||||
|
Baseline int `yaml:"baseline"` // divisor used to normalise the raw backlink score
|
}
|
||||||
|
|
||||||
|
// StorageConfig holds the storage settings ("storage" YAML section).
|
||||||
|
type StorageConfig struct {
|
||||||
|
Path string `yaml:"path"` // storage root directory. NOTE(review): the visible --storage flag still defaults to the StoragePath constant; confirm whether anything reads this field
|
}
|
||||||
|
|
||||||
|
// PrometheusConfig holds the Prometheus metrics ports of the individual modules ("prometheus" YAML section).
|
||||||
|
type PrometheusConfig struct {
|
||||||
|
CrawlerPort int `yaml:"crawler_port"` // metrics port of the crawler module
|
||||||
|
BacklinkPort int `yaml:"backlink_port"` // metrics port of the backlink computation module
|
||||||
|
SearchPort int `yaml:"search_port"` // metrics port of the search service (which also hosts the harvest endpoint)
|
}
|
||||||
|
|
||||||
|
// Global 全局配置实例,加载后可通过此变量访问
|
||||||
|
var Global Config
|
||||||
|
|
||||||
|
// Load 从指定路径加载配置文件
|
||||||
|
func Load(configPath string) error {
|
||||||
|
data, err := os.ReadFile(configPath)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to read config file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var cfg Config
|
||||||
|
if err := yaml.Unmarshal(data, &cfg); err != nil {
|
||||||
|
return fmt.Errorf("failed to parse config file: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
Global = cfg
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// LoadFromSavedata 从 savedata 目录加载 config.yml
|
||||||
|
func LoadFromSavedata() error {
|
||||||
|
configPath := filepath.Join("savedata", "config.yml")
|
||||||
|
return Load(configPath)
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetDefaultConfig 返回默认配置
|
||||||
|
func GetDefaultConfig() Config {
|
||||||
|
return Config{
|
||||||
|
Index: IndexConfig{
|
||||||
|
MaxURLsPerKey: 11000,
|
||||||
|
MaxSameDomainPerKey: 20,
|
||||||
|
BigCleanThreshold: 10000000,
|
||||||
|
MaxNewURLsPerKey: 10000,
|
||||||
|
MinURLsForNewKey: 3,
|
||||||
|
},
|
||||||
|
Crawler: CrawlerConfig{
|
||||||
|
SpiderName: "loli_spider",
|
||||||
|
Cooldown: 3,
|
||||||
|
Workers: 22,
|
||||||
|
CrawlFocus: 0.7,
|
||||||
|
MaxKeywordsPerPage: 250,
|
||||||
|
MaxEpoch: 100,
|
||||||
|
ExpectedProsperRatio: 0.6,
|
||||||
|
EntryURL: "https://zh.wikipedia.org/",
|
||||||
|
},
|
||||||
|
Search: SearchConfig{
|
||||||
|
UseOnlineSnippet: true,
|
||||||
|
OnlineSnippetTimeout: 3,
|
||||||
|
WeightDailyDecay: 0.996,
|
||||||
|
LanguageWeight: 0.5,
|
||||||
|
ConsecutiveKeyWeight: 1.3,
|
||||||
|
BacklinkWeight: 1.0,
|
||||||
|
ServerPort: 8082,
|
||||||
|
FlushIntervalSeconds: 60,
|
||||||
|
},
|
||||||
|
Backlink: BacklinkConfig{
|
||||||
|
Baseline: 200000,
|
||||||
|
},
|
||||||
|
Storage: StorageConfig{
|
||||||
|
Path: "./savedata",
|
||||||
|
},
|
||||||
|
Prometheus: PrometheusConfig{
|
||||||
|
CrawlerPort: 14950,
|
||||||
|
BacklinkPort: 14952,
|
||||||
|
SearchPort: 14953,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
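// Backward-compatible accessors: each former package constant is now a function of the same name that returns the corresponding value from Global.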
|
||||||
|
// They return the built-in defaults (installed by init) until Load() succeeds, and the loaded values afterwards.
|
||||||
|
|
||||||
|
func init() {
|
||||||
|
// 初始化时设置默认值
|
||||||
|
Global = GetDefaultConfig()
|
||||||
|
}
|
||||||
|
|
||||||
|
// MaxURLsPerKey 返回配置值
|
||||||
|
func MaxURLsPerKey() int { return Global.Index.MaxURLsPerKey }
|
||||||
|
|
||||||
|
// MaxSameDomainPerKey 返回配置值
|
||||||
|
func MaxSameDomainPerKey() int { return Global.Index.MaxSameDomainPerKey }
|
||||||
|
|
||||||
|
// BigCleanThreshold 返回配置值
|
||||||
|
func BigCleanThreshold() int { return Global.Index.BigCleanThreshold }
|
||||||
|
|
||||||
|
// MaxNewURLsPerKey 返回配置值
|
||||||
|
func MaxNewURLsPerKey() int { return Global.Index.MaxNewURLsPerKey }
|
||||||
|
|
||||||
|
// MinURLsForNewKey 返回配置值
|
||||||
|
func MinURLsForNewKey() int { return Global.Index.MinURLsForNewKey }
|
||||||
|
|
||||||
|
// SpiderName 返回配置值
|
||||||
|
func SpiderName() string { return Global.Crawler.SpiderName }
|
||||||
|
|
||||||
|
// CrawlerCooldown 返回配置值
|
||||||
|
func CrawlerCooldown() int { return Global.Crawler.Cooldown }
|
||||||
|
|
||||||
|
// CrawlerWorkers 返回配置值
|
||||||
|
func CrawlerWorkers() int { return Global.Crawler.Workers }
|
||||||
|
|
||||||
|
// CrawlFocus 返回配置值
|
||||||
|
func CrawlFocus() float64 { return Global.Crawler.CrawlFocus }
|
||||||
|
|
||||||
|
// MaxKeywordsPerPage 返回配置值
|
||||||
|
func MaxKeywordsPerPage() int { return Global.Crawler.MaxKeywordsPerPage }
|
||||||
|
|
||||||
|
// MaxEpoch 返回配置值
|
||||||
|
func MaxEpoch() int { return Global.Crawler.MaxEpoch }
|
||||||
|
|
||||||
|
// ExpectedProsperRatio 返回配置值
|
||||||
|
func ExpectedProsperRatio() float64 { return Global.Crawler.ExpectedProsperRatio }
|
||||||
|
|
||||||
|
// EntryURL 返回配置值
|
||||||
|
func EntryURL() string { return Global.Crawler.EntryURL }
|
||||||
|
|
||||||
|
// UseOnlineSnippet 返回配置值
|
||||||
|
func UseOnlineSnippet() bool { return Global.Search.UseOnlineSnippet }
|
||||||
|
|
||||||
|
// OnlineSnippetTimeout 返回配置值
|
||||||
|
func OnlineSnippetTimeout() int { return Global.Search.OnlineSnippetTimeout }
|
||||||
|
|
||||||
|
// WeightDailyDecay 返回配置值
|
||||||
|
func WeightDailyDecay() float64 { return Global.Search.WeightDailyDecay }
|
||||||
|
|
||||||
|
// LanguageWeight 返回配置值
|
||||||
|
func LanguageWeight() float64 { return Global.Search.LanguageWeight }
|
||||||
|
|
||||||
|
// ConsecutiveKeyWeight 返回配置值
|
||||||
|
func ConsecutiveKeyWeight() float64 { return Global.Search.ConsecutiveKeyWeight }
|
||||||
|
|
||||||
|
// BacklinkWeight 返回配置值
|
||||||
|
func BacklinkWeight() float64 { return Global.Search.BacklinkWeight }
|
||||||
|
|
||||||
|
// SearchServerPort 返回配置值
|
||||||
|
func SearchServerPort() int { return Global.Search.ServerPort }
|
||||||
|
|
||||||
|
// FlushIntervalSeconds 返回配置值
|
||||||
|
func FlushIntervalSeconds() int { return Global.Search.FlushIntervalSeconds }
|
||||||
|
|
||||||
|
// BacklinkBaseline 返回配置值
|
||||||
|
func BacklinkBaseline() int { return Global.Backlink.Baseline }
|
||||||
|
|
||||||
|
// PromPortCrawler 返回配置值
|
||||||
|
func PromPortCrawler() int { return Global.Prometheus.CrawlerPort }
|
||||||
|
|
||||||
|
// PromPortBacklink 返回配置值
|
||||||
|
func PromPortBacklink() int { return Global.Prometheus.BacklinkPort }
|
||||||
|
|
||||||
|
// PromPortSearch 返回配置值
|
||||||
|
func PromPortSearch() int { return Global.Prometheus.SearchPort }
|
||||||
|
|
||||||
|
// 为了向后兼容,保留 StoragePath 常量
|
||||||
const StoragePath = "./savedata"
|
const StoragePath = "./savedata"
|
||||||
|
|
||||||
// Prometheus ports
|
|
||||||
// 各模块 Prometheus 监控指标的 HTTP 端口
|
|
||||||
const (
|
|
||||||
PromPortCrawler = 14950 // 爬虫模块的 metrics 端口
|
|
||||||
PromPortBacklink = 14952 // 反向链接计算模块的 metrics 端口
|
|
||||||
PromPortSearch = 14953 // 搜索服务(含收获功能)模块的 metrics 端口
|
|
||||||
)
|
|
||||||
|
|||||||
+9
-7
@@ -60,7 +60,7 @@ type Crawler struct {
|
|||||||
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
||||||
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||||
return &Crawler{
|
return &Crawler{
|
||||||
fetcher: NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second),
|
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
|
||||||
db: db,
|
db: db,
|
||||||
analyzer: a,
|
analyzer: a,
|
||||||
prosperMap: prosperMap,
|
prosperMap: prosperMap,
|
||||||
@@ -124,7 +124,7 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
|||||||
)
|
)
|
||||||
|
|
||||||
// 信号量:限制同时并发数不超过配置的工作线程数
|
// 信号量:限制同时并发数不超过配置的工作线程数
|
||||||
sem := make(chan struct{}, config.CrawlerWorkers)
|
sem := make(chan struct{}, config.CrawlerWorkers())
|
||||||
for _, u := range queue {
|
for _, u := range queue {
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
sem <- struct{}{} // 获取一个令牌(阻塞直到有空闲槽位)
|
sem <- struct{}{} // 获取一个令牌(阻塞直到有空闲槽位)
|
||||||
@@ -219,8 +219,9 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
|
|||||||
kws := c.analyzer.Analyze(title, desc, text)
|
kws := c.analyzer.Analyze(title, desc, text)
|
||||||
if len(kws) > 0 {
|
if len(kws) > 0 {
|
||||||
// 限制每个页面最多发送的关键词数量
|
// 限制每个页面最多发送的关键词数量
|
||||||
if len(kws) > config.MaxKeywordsPerPage {
|
maxKws := config.MaxKeywordsPerPage()
|
||||||
kws = kws[:config.MaxKeywordsPerPage]
|
if len(kws) > maxKws {
|
||||||
|
kws = kws[:maxKws]
|
||||||
}
|
}
|
||||||
atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
|
atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
|
||||||
// 异步发送到收获服务器写入倒排索引(不阻塞爬取流程)
|
// 异步发送到收获服务器写入倒排索引(不阻塞爬取流程)
|
||||||
@@ -383,7 +384,7 @@ func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
|
|||||||
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
req, err := http.NewRequestWithContext(ctx, "POST", fmt.Sprintf("http://localhost:%d/l", config.SearchServerPort), bytes.NewReader(data))
|
req, err := http.NewRequestWithContext(ctx, "POST", fmt.Sprintf("http://localhost:%d/l", config.SearchServerPort()), bytes.NewReader(data))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
@@ -454,7 +455,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
|||||||
selected := weightedSample(scored_list, k)
|
selected := weightedSample(scored_list, k)
|
||||||
|
|
||||||
// 域名集中度过滤:限制每个域名被选中的数量,防止被少数网站垄断
|
// 域名集中度过滤:限制每个域名被选中的数量,防止被少数网站垄断
|
||||||
selected = concentrationFilter(selected, config.CrawlFocus)
|
selected = concentrationFilter(selected, config.CrawlFocus())
|
||||||
|
|
||||||
// 分离 HTTPS 和 HTTP 链接,HTTP 最多占 HTTPS 的 1/4
|
// 分离 HTTPS 和 HTTP 链接,HTTP 最多占 HTTPS 的 1/4
|
||||||
var httpsURLs, httpURLs []string
|
var httpsURLs, httpURLs []string
|
||||||
@@ -480,7 +481,8 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// 根据目标繁荣占比计算普通 URL 应保留数量
|
// 根据目标繁荣占比计算普通 URL 应保留数量
|
||||||
n := int(float64(len(prosperURLs)) * (1-config.ExpectedProsperRatio) / config.ExpectedProsperRatio)
|
expectedProsperRatio := config.ExpectedProsperRatio()
|
||||||
|
n := int(float64(len(prosperURLs)) * (1-expectedProsperRatio) / expectedProsperRatio)
|
||||||
if len(otherURLs) > n {
|
if len(otherURLs) > n {
|
||||||
keep := max(len(otherURLs)-len(selected)/10, n)
|
keep := max(len(otherURLs)-len(selected)/10, n)
|
||||||
if keep < len(otherURLs) {
|
if keep < len(otherURLs) {
|
||||||
|
|||||||
@@ -8,6 +8,7 @@ require (
|
|||||||
github.com/yanyiwu/gojieba v1.4.4
|
github.com/yanyiwu/gojieba v1.4.4
|
||||||
go.etcd.io/bbolt v1.3.9
|
go.etcd.io/bbolt v1.3.9
|
||||||
golang.org/x/net v0.23.0
|
golang.org/x/net v0.23.0
|
||||||
|
gopkg.in/yaml.v3 v3.0.1
|
||||||
)
|
)
|
||||||
|
|
||||||
require (
|
require (
|
||||||
|
|||||||
@@ -32,5 +32,7 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T
|
|||||||
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
|
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
|
||||||
google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8=
|
google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8=
|
||||||
google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
|
google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
|
||||||
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||||
|
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||||
|
|||||||
@@ -103,16 +103,10 @@ prometheus:
|
|||||||
|
|
||||||
// loadConfig 从 savedata/config.yml 加载配置
|
// loadConfig 从 savedata/config.yml 加载配置
|
||||||
func loadConfig() error {
|
func loadConfig() error {
|
||||||
configPath := filepath.Join("savedata", "config.yml")
|
if err := config.LoadFromSavedata(); err != nil {
|
||||||
|
return fmt.Errorf("failed to load config: %v", err)
|
||||||
// 检查配置文件是否存在
|
|
||||||
if _, err := os.Stat(configPath); os.IsNotExist(err) {
|
|
||||||
return fmt.Errorf("config file not found: %s", configPath)
|
|
||||||
}
|
}
|
||||||
|
log.Printf("Config loaded successfully from savedata/config.yml")
|
||||||
// TODO: 解析 YAML 配置文件并应用到 config 包
|
|
||||||
// 这里暂时只是检查文件存在,后续可以添加 YAML 解析逻辑
|
|
||||||
log.Printf("Loading config from: %s", configPath)
|
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -130,8 +124,8 @@ func main() {
|
|||||||
// ---- 命令行参数 ----
|
// ---- 命令行参数 ----
|
||||||
// --storage:存储根目录路径,默认使用 config.StoragePath
|
// --storage:存储根目录路径,默认使用 config.StoragePath
|
||||||
storageDir := flag.String("storage", config.StoragePath, "path to savedata directory")
|
storageDir := flag.String("storage", config.StoragePath, "path to savedata directory")
|
||||||
// --entry:BFS 爬取的起始 URL,默认使用 config.EntryURL(维基百科中文首页)
|
// --entry:BFS 爬取的起始 URL,默认使用 config.EntryURL()(维基百科中文首页)
|
||||||
entryURL := flag.String("entry", config.EntryURL, "BFS crawl entry URL")
|
entryURL := flag.String("entry", config.EntryURL(), "BFS crawl entry URL")
|
||||||
// --stopwords:屏蔽词 JSON 文件路径
|
// --stopwords:屏蔽词 JSON 文件路径
|
||||||
stopWords := flag.String("stopwords", "../data/标点符号.json", "path to stop-words JSON")
|
stopWords := flag.String("stopwords", "../data/标点符号.json", "path to stop-words JSON")
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
@@ -161,7 +155,7 @@ func main() {
|
|||||||
// ---- 4. 搜索服务器(默认 :80):对外提供搜索 API,同时内嵌收获服务(统一端口)
|
// ---- 4. 搜索服务器(默认 :80):对外提供搜索 API,同时内嵌收获服务(统一端口)
|
||||||
searchSrv := search.New(db, infoSvc, anal)
|
searchSrv := search.New(db, infoSvc, anal)
|
||||||
go func() {
|
go func() {
|
||||||
addr := fmt.Sprintf(":%d", config.SearchServerPort)
|
addr := fmt.Sprintf(":%d", config.SearchServerPort())
|
||||||
if err := searchSrv.ListenAndServe(addr); err != nil {
|
if err := searchSrv.ListenAndServe(addr); err != nil {
|
||||||
log.Fatalf("[search] fatal: %v", err)
|
log.Fatalf("[search] fatal: %v", err)
|
||||||
}
|
}
|
||||||
@@ -175,7 +169,7 @@ func main() {
|
|||||||
// 从 info 服务获取繁荣表快照,用于调度优先级决策
|
// 从 info 服务获取繁荣表快照,用于调度优先级决策
|
||||||
prosperMap := infoSvc.ProsperMap()
|
prosperMap := infoSvc.ProsperMap()
|
||||||
crawl := crawler.New(db, anal, prosperMap)
|
crawl := crawler.New(db, anal, prosperMap)
|
||||||
go crawl.Run(*entryURL, config.MaxEpoch)
|
go crawl.Run(*entryURL, config.MaxEpoch())
|
||||||
|
|
||||||
log.Println("all modules started — press Ctrl-C to stop")
|
log.Println("all modules started — press Ctrl-C to stop")
|
||||||
|
|
||||||
|
|||||||
+22
-18
@@ -50,7 +50,7 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
|||||||
analyzer: a,
|
analyzer: a,
|
||||||
mem: make(map[string][]storage.IndexEntry),
|
mem: make(map[string][]storage.IndexEntry),
|
||||||
httpCli: &http.Client{
|
httpCli: &http.Client{
|
||||||
Timeout: time.Duration(config.OnlineSnippetTimeout) * time.Second,
|
Timeout: time.Duration(config.OnlineSnippetTimeout()) * time.Second,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
// 启动定期刷盘 goroutine
|
// 启动定期刷盘 goroutine
|
||||||
@@ -60,7 +60,7 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
|||||||
|
|
||||||
// runPeriodicFlush 每隔 FlushIntervalSeconds 秒触发一次刷盘。
|
// runPeriodicFlush 每隔 FlushIntervalSeconds 秒触发一次刷盘。
|
||||||
func (s *Server) runPeriodicFlush() {
|
func (s *Server) runPeriodicFlush() {
|
||||||
ticker := time.NewTicker(time.Duration(config.FlushIntervalSeconds) * time.Second)
|
ticker := time.NewTicker(time.Duration(config.FlushIntervalSeconds()) * time.Second)
|
||||||
defer ticker.Stop()
|
defer ticker.Stop()
|
||||||
for range ticker.C {
|
for range ticker.C {
|
||||||
s.Flush()
|
s.Flush()
|
||||||
@@ -514,17 +514,18 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
|||||||
defVal float64 // 缺省权重(词在索引中条目已满时使用)
|
defVal float64 // 缺省权重(词在索引中条目已满时使用)
|
||||||
}
|
}
|
||||||
tokenIndexes := make([]tokenIndex, 0, len(tokens))
|
tokenIndexes := make([]tokenIndex, 0, len(tokens))
|
||||||
|
maxURLsPerKey := config.MaxURLsPerKey()
|
||||||
for _, t := range tokens {
|
for _, t := range tokens {
|
||||||
entries, _ := s.db.GetIndex(t)
|
entries, _ := s.db.GetIndex(t)
|
||||||
// 计算缺省权重:当条目数达到上限时,权重低于第 MaxURLsPerKey 名的条目使用缺省权重
|
// 计算缺省权重:当条目数达到上限时,权重低于第 MaxURLsPerKey 名的条目使用缺省权重
|
||||||
defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(config.MaxURLsPerKey)
|
defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(maxURLsPerKey)
|
||||||
if len(entries) >= config.MaxURLsPerKey {
|
if len(entries) >= maxURLsPerKey {
|
||||||
weights := make([]float64, len(entries))
|
weights := make([]float64, len(entries))
|
||||||
for i, e := range entries {
|
for i, e := range entries {
|
||||||
weights[i] = float64(e.Weight)
|
weights[i] = float64(e.Weight)
|
||||||
}
|
}
|
||||||
sort.Sort(sort.Reverse(sort.Float64Slice(weights)))
|
sort.Sort(sort.Reverse(sort.Float64Slice(weights)))
|
||||||
defVal = math.Max(1.0/10000, weights[config.MaxURLsPerKey-1]/2)
|
defVal = math.Max(1.0/10000, weights[maxURLsPerKey-1]/2)
|
||||||
}
|
}
|
||||||
tokenIndexes = append(tokenIndexes, tokenIndex{t, entries, defVal})
|
tokenIndexes = append(tokenIndexes, tokenIndex{t, entries, defVal})
|
||||||
}
|
}
|
||||||
@@ -576,7 +577,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
|||||||
rel *= vp
|
rel *= vp
|
||||||
}
|
}
|
||||||
// 反向链接繁荣加分
|
// 反向链接繁荣加分
|
||||||
prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight
|
prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight()
|
||||||
bad := badURL(u)
|
bad := badURL(u)
|
||||||
adjust := s.infoSvc.Adjust(netloc(u))
|
adjust := s.infoSvc.Adjust(netloc(u))
|
||||||
// 基础分数 = 相关性 × 繁荣值 × URL质量 × 人工调整
|
// 基础分数 = 相关性 × 繁荣值 × URL质量 × 人工调整
|
||||||
@@ -659,7 +660,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
|||||||
repMul = 1 - (h - 0.5)
|
repMul = 1 - (h - 0.5)
|
||||||
}
|
}
|
||||||
// 连续词出现越多,乘以 config.ConsecutiveKeyWeight(>1)加成
|
// 连续词出现越多,乘以 config.ConsecutiveKeyWeight(>1)加成
|
||||||
consMul := math.Pow(config.ConsecutiveKeyWeight, float64(consecutive))
|
consMul := math.Pow(config.ConsecutiveKeyWeight(), float64(consecutive))
|
||||||
candidates[i].scoreVec[0] *= repMul * consMul
|
candidates[i].scoreVec[0] *= repMul * consMul
|
||||||
candidates[i].scoreVec[5] = repMul
|
candidates[i].scoreVec[5] = repMul
|
||||||
candidates[i].scoreVec[8] = consMul
|
candidates[i].scoreVec[8] = consMul
|
||||||
@@ -729,7 +730,7 @@ func (s *Server) getSnippet(rawURL string) *snippetInfo {
|
|||||||
snip := buildSnippet(entry)
|
snip := buildSnippet(entry)
|
||||||
return snip
|
return snip
|
||||||
}
|
}
|
||||||
if !config.UseOnlineSnippet {
|
if !config.UseOnlineSnippet() {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
// 在线抓取(不使用 robots.txt,适用于搜索摘要场景)
|
// 在线抓取(不使用 robots.txt,适用于搜索摘要场景)
|
||||||
@@ -737,7 +738,7 @@ func (s *Server) getSnippet(rawURL string) *snippetInfo {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
req.Header.Set("User-Agent", config.SpiderName)
|
req.Header.Set("User-Agent", config.SpiderName())
|
||||||
resp, err := s.httpCli.Do(req)
|
resp, err := s.httpCli.Do(req)
|
||||||
if err != nil || resp.StatusCode != 200 {
|
if err != nil || resp.StatusCode != 200 {
|
||||||
return nil
|
return nil
|
||||||
@@ -785,7 +786,8 @@ func languageMultiplier(si *storage.SiteInfo) float64 {
|
|||||||
}
|
}
|
||||||
chinese := si.Languages["zh"] / total
|
chinese := si.Languages["zh"] / total
|
||||||
weird := (total - si.Languages["zh"] - si.Languages["en"] - si.Languages["ja"]) / total
|
weird := (total - si.Languages["zh"] - si.Languages["en"] - si.Languages["ja"]) / total
|
||||||
return 1 + chinese*config.LanguageWeight - weird*config.LanguageWeight
|
languageWeight := config.LanguageWeight()
|
||||||
|
return 1 + chinese*languageWeight - weird*languageWeight
|
||||||
}
|
}
|
||||||
|
|
||||||
// timeMul 根据网站最后访问时间计算时间衰减倍数(越久远衰减越多)。
|
// timeMul 根据网站最后访问时间计算时间衰减倍数(越久远衰减越多)。
|
||||||
@@ -807,7 +809,7 @@ func timeMul(si *storage.SiteInfo, now int64) float64 {
|
|||||||
if days > 0 {
|
if days > 0 {
|
||||||
days-- // 跳过第一天
|
days-- // 跳过第一天
|
||||||
}
|
}
|
||||||
return math.Pow(config.WeightDailyDecay, float64(days))
|
return math.Pow(config.WeightDailyDecay(), float64(days))
|
||||||
}
|
}
|
||||||
|
|
||||||
// urlTimeMul 根据该 URL 的摘要抓取时间计算时间衰减倍数(30 天内不衰减)。
|
// urlTimeMul 根据该 URL 的摘要抓取时间计算时间衰减倍数(30 天内不衰减)。
|
||||||
@@ -820,7 +822,7 @@ func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 {
|
|||||||
if days <= 30 {
|
if days <= 30 {
|
||||||
return 1.0
|
return 1.0
|
||||||
}
|
}
|
||||||
return math.Pow((2+config.WeightDailyDecay)/3, float64(days))
|
return math.Pow((2+config.WeightDailyDecay())/3, float64(days))
|
||||||
}
|
}
|
||||||
|
|
||||||
// badURL 返回 URL 的"劣质"评分(0~0.9)。
|
// badURL 返回 URL 的"劣质"评分(0~0.9)。
|
||||||
@@ -1193,7 +1195,7 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
|
|||||||
atomic.AddInt64(&s.rowCount, 1)
|
atomic.AddInt64(&s.rowCount, 1)
|
||||||
}
|
}
|
||||||
s.memMu.Unlock()
|
s.memMu.Unlock()
|
||||||
if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold) {
|
if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold()) {
|
||||||
go s.Flush()
|
go s.Flush()
|
||||||
}
|
}
|
||||||
w.Write([]byte("ok"))
|
w.Write([]byte("ok"))
|
||||||
@@ -1208,14 +1210,15 @@ func (s *Server) handleFlush(w http.ResponseWriter, r *http.Request) {
|
|||||||
// lowThreshold 返回某关键词在已有大量条目时,新条目所需的最低权重阈值。
|
// lowThreshold 返回某关键词在已有大量条目时,新条目所需的最低权重阈值。
|
||||||
func (s *Server) lowThreshold(key string) float64 {
|
func (s *Server) lowThreshold(key string) float64 {
|
||||||
existing, _ := s.db.GetIndex(key)
|
existing, _ := s.db.GetIndex(key)
|
||||||
if len(existing) < config.MaxURLsPerKey {
|
maxURLsPerKey := config.MaxURLsPerKey()
|
||||||
|
if len(existing) < maxURLsPerKey {
|
||||||
return -1
|
return -1
|
||||||
}
|
}
|
||||||
weights := make([]float64, len(existing))
|
weights := make([]float64, len(existing))
|
||||||
for i, e := range existing {
|
for i, e := range existing {
|
||||||
weights[i] = float64(e.Weight)
|
weights[i] = float64(e.Weight)
|
||||||
}
|
}
|
||||||
return nthLargest(weights, config.MaxURLsPerKey-1) * 0.05
|
return nthLargest(weights, maxURLsPerKey-1) * 0.05
|
||||||
}
|
}
|
||||||
|
|
||||||
// flush 将内存中的索引批量合并写入磁盘,然后清空内存。
|
// flush 将内存中的索引批量合并写入磁盘,然后清空内存。
|
||||||
@@ -1269,15 +1272,16 @@ func (s *Server) flush() {
|
|||||||
// mergeKey 将新条目和磁盘已有条目合并后返回最终列表。
|
// mergeKey 将新条目和磁盘已有条目合并后返回最终列表。
|
||||||
func (s *Server) mergeKey(key string, newEntries []storage.IndexEntry) []storage.IndexEntry {
|
func (s *Server) mergeKey(key string, newEntries []storage.IndexEntry) []storage.IndexEntry {
|
||||||
existing, _ := s.db.GetIndex(key)
|
existing, _ := s.db.GetIndex(key)
|
||||||
if len(existing) == 0 && len(newEntries) < config.MinURLsForNewKey {
|
if len(existing) == 0 && len(newEntries) < config.MinURLsForNewKey() {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
merged := dedup(append(newEntries, existing...))
|
merged := dedup(append(newEntries, existing...))
|
||||||
if rand.Float64() < 0.02 {
|
if rand.Float64() < 0.02 {
|
||||||
merged = dedupNormalised(merged)
|
merged = dedupNormalised(merged)
|
||||||
}
|
}
|
||||||
if float64(len(merged)) > float64(config.MaxURLsPerKey)*1.1 || rand.Float64() < 0.02 {
|
maxURLsPerKey := config.MaxURLsPerKey()
|
||||||
merged = trim(merged, s.infoSvc, config.MaxURLsPerKey, config.MaxSameDomainPerKey)
|
if float64(len(merged)) > float64(maxURLsPerKey)*1.1 || rand.Float64() < 0.02 {
|
||||||
|
merged = trim(merged, s.infoSvc, maxURLsPerKey, config.MaxSameDomainPerKey())
|
||||||
}
|
}
|
||||||
return merged
|
return merged
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user