up
This commit is contained in:
+216
-47
@@ -2,56 +2,225 @@
|
||||
// config 包存放 sese-engine 的所有全局配置参数。
|
||||
package config
|
||||
|
||||
// Index / storage limits
|
||||
// 索引 / 存储相关限制常量
|
||||
const (
|
||||
MaxURLsPerKey = 11000 // 每个索引词最多保存的 URL 数量上限
|
||||
MaxSameDomainPerKey = 20 // 同一域名在每个索引词下最多出现的次数
|
||||
BigCleanThreshold = 10000000 // 内存中累计多少条索引后触发一次刷盘清理
|
||||
MaxNewURLsPerKey = 10000 // 每次刷盘时,每个索引词最多写入的新 URL 数量上限
|
||||
MinURLsForNewKey = 3 // 新索引词如果 URL 数少于该值则丢弃,不写入磁盘
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
|
||||
// Crawler settings
|
||||
// 爬虫行为相关配置
|
||||
const (
|
||||
SpiderName = "loli_spider" // HTTP 请求的 User-Agent 标识
|
||||
CrawlerCooldown = 3 // 同一主机相邻两次请求的最小间隔(秒),用于遵守 robots.txt 和避免被封
|
||||
CrawlerWorkers = 22 // 爬虫并发 goroutine 数量
|
||||
CrawlFocus = 0.7 // 域名集中度因子,越大越倾向在少量域名内深挖,越小越分散
|
||||
MaxKeywordsPerPage = 250 // 单个页面最多提取的关键词数量
|
||||
MaxEpoch = 100 // BFS 爬取的最大轮次上限
|
||||
ExpectedProsperRatio = 0.6 // 队列中预期"繁荣"域名(高反向链接)的占比,用于调度决策
|
||||
EntryURL = "https://zh.wikipedia.org/" // BFS 爬取的起始入口 URL
|
||||
)
|
||||
// Config 是完整的配置结构体
|
||||
type Config struct {
|
||||
Index IndexConfig `yaml:"index"`
|
||||
Crawler CrawlerConfig `yaml:"crawler"`
|
||||
Search SearchConfig `yaml:"search"`
|
||||
Backlink BacklinkConfig `yaml:"backlink"`
|
||||
Storage StorageConfig `yaml:"storage"`
|
||||
Prometheus PrometheusConfig `yaml:"prometheus"`
|
||||
}
|
||||
|
||||
// Search / ranking weights
|
||||
// 搜索结果排序权重配置
|
||||
const (
|
||||
UseOnlineSnippet = true // 是否在线抓取摘要(搜索时实时抓取页面补充摘要)
|
||||
OnlineSnippetTimeout = 3 // 在线抓取摘要的超时时间(秒)
|
||||
WeightDailyDecay = 0.996 // 页面年龄的时间衰减因子(每天乘以此系数)
|
||||
LanguageWeight = 0.5 // 语种匹配权重:与查询语种一致时加分
|
||||
ConsecutiveKeyWeight = 1.3 // 连续关键词命中权重:多词连续出现时加分
|
||||
BacklinkWeight = 1.0 // 反向链接权重:指向该 URL 的链接越多得分越高
|
||||
SearchServerPort = 80 // 搜索服务和收获服务的统一 HTTP 监听端口
|
||||
FlushIntervalSeconds = 60 // 定期刷盘间隔(秒):将内存索引批量写入磁盘
|
||||
)
|
||||
// IndexConfig groups the index / storage limits.
type IndexConfig struct {
	// MaxURLsPerKey is the upper bound on URLs kept for one index key.
	MaxURLsPerKey int `yaml:"max_urls_per_key"`
	// MaxSameDomainPerKey is how often one domain may appear under a key.
	MaxSameDomainPerKey int `yaml:"max_same_domain_per_key"`
	// BigCleanThreshold is the number of in-memory index rows after which a
	// flush to disk is triggered.
	BigCleanThreshold int `yaml:"big_clean_threshold"`
	// MaxNewURLsPerKey is the upper bound on new URLs written per key in
	// one flush.
	MaxNewURLsPerKey int `yaml:"max_new_urls_per_key"`
	// MinURLsForNewKey is the minimum URL count a new key needs; new keys
	// with fewer URLs are dropped instead of being written.
	MinURLsForNewKey int `yaml:"min_urls_for_new_key"`
}
|
||||
|
||||
// Backlink computation
|
||||
// 反向链接(PageRank 类)计算相关常量
|
||||
const (
|
||||
BacklinkBaseline = 200000 // 反向链接得分归一化的除数(用于将原始链接数映射到 [0,1] 区间)
|
||||
)
|
||||
// CrawlerConfig groups the crawler behaviour settings.
type CrawlerConfig struct {
	// SpiderName is sent as the User-Agent of HTTP requests.
	SpiderName string `yaml:"spider_name"`
	// Cooldown is the minimum gap, in seconds, between two requests to the
	// same host.
	Cooldown int `yaml:"cooldown"`
	// Workers is the number of concurrent crawler goroutines.
	Workers int `yaml:"workers"`
	// CrawlFocus is the domain concentration factor: larger values dig
	// deeper into few domains, smaller values spread out.
	CrawlFocus float64 `yaml:"crawl_focus"`
	// MaxKeywordsPerPage caps the keywords extracted from a single page.
	MaxKeywordsPerPage int `yaml:"max_keywords_per_page"`
	// MaxEpoch is the maximum number of BFS crawl rounds.
	MaxEpoch int `yaml:"max_epoch"`
	// ExpectedProsperRatio is the expected share of "prosperous"
	// (high-backlink) domains in the queue, used for scheduling.
	ExpectedProsperRatio float64 `yaml:"expected_prosper_ratio"`
	// EntryURL is the URL the BFS crawl starts from.
	EntryURL string `yaml:"entry_url"`
}
|
||||
|
||||
// Storage path (relative to process working directory)
|
||||
// 存储根目录路径,相对于进程启动时的工作目录
|
||||
// SearchConfig groups the search ranking weights and search-server settings.
type SearchConfig struct {
	// UseOnlineSnippet enables fetching a page live at search time to
	// supply its snippet.
	UseOnlineSnippet bool `yaml:"use_online_snippet"`
	// OnlineSnippetTimeout is the timeout, in seconds, of that live fetch.
	OnlineSnippetTimeout int `yaml:"online_snippet_timeout"`
	// WeightDailyDecay is the per-day time decay factor.
	WeightDailyDecay float64 `yaml:"weight_daily_decay"`
	// LanguageWeight is the weight of the language-match adjustment.
	LanguageWeight float64 `yaml:"language_weight"`
	// ConsecutiveKeyWeight is the bonus base applied for consecutive
	// keyword hits.
	ConsecutiveKeyWeight float64 `yaml:"consecutive_key_weight"`
	// BacklinkWeight is the weight of the backlink ("prosper") bonus.
	BacklinkWeight float64 `yaml:"backlink_weight"`
	// ServerPort is the HTTP port shared by the search and ingest service.
	ServerPort int `yaml:"server_port"`
	// FlushIntervalSeconds is the period, in seconds, of the regular flush
	// of the in-memory index to disk.
	FlushIntervalSeconds int `yaml:"flush_interval_seconds"`
}
|
||||
|
||||
// BacklinkConfig groups the backlink computation settings.
type BacklinkConfig struct {
	// Baseline is the divisor used to normalise the backlink score.
	Baseline int `yaml:"baseline"`
}
|
||||
|
||||
// StorageConfig holds the storage location.
type StorageConfig struct {
	// Path is the storage root directory.
	// NOTE(review): presumably resolved relative to the process working
	// directory, like the StoragePath constant — confirm against callers.
	Path string `yaml:"path"`
}
|
||||
|
||||
// PrometheusConfig holds the HTTP ports on which the modules expose their
// Prometheus metrics.
type PrometheusConfig struct {
	// CrawlerPort is the metrics port of the crawler module.
	CrawlerPort int `yaml:"crawler_port"`
	// BacklinkPort is the metrics port of the backlink computation module.
	BacklinkPort int `yaml:"backlink_port"`
	// SearchPort is the metrics port of the search (and ingest) service.
	SearchPort int `yaml:"search_port"`
}
|
||||
|
||||
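// Global is the package-wide configuration instance. init fills it with the defaults and Load replaces it with the values read from a file.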
|
||||
var Global Config
|
||||
|
||||
// Load 从指定路径加载配置文件
|
||||
func Load(configPath string) error {
|
||||
data, err := os.ReadFile(configPath)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to read config file: %v", err)
|
||||
}
|
||||
|
||||
var cfg Config
|
||||
if err := yaml.Unmarshal(data, &cfg); err != nil {
|
||||
return fmt.Errorf("failed to parse config file: %v", err)
|
||||
}
|
||||
|
||||
Global = cfg
|
||||
return nil
|
||||
}
|
||||
|
||||
// LoadFromSavedata 从 savedata 目录加载 config.yml
|
||||
func LoadFromSavedata() error {
|
||||
configPath := filepath.Join("savedata", "config.yml")
|
||||
return Load(configPath)
|
||||
}
|
||||
|
||||
// GetDefaultConfig 返回默认配置
|
||||
func GetDefaultConfig() Config {
|
||||
return Config{
|
||||
Index: IndexConfig{
|
||||
MaxURLsPerKey: 11000,
|
||||
MaxSameDomainPerKey: 20,
|
||||
BigCleanThreshold: 10000000,
|
||||
MaxNewURLsPerKey: 10000,
|
||||
MinURLsForNewKey: 3,
|
||||
},
|
||||
Crawler: CrawlerConfig{
|
||||
SpiderName: "loli_spider",
|
||||
Cooldown: 3,
|
||||
Workers: 22,
|
||||
CrawlFocus: 0.7,
|
||||
MaxKeywordsPerPage: 250,
|
||||
MaxEpoch: 100,
|
||||
ExpectedProsperRatio: 0.6,
|
||||
EntryURL: "https://zh.wikipedia.org/",
|
||||
},
|
||||
Search: SearchConfig{
|
||||
UseOnlineSnippet: true,
|
||||
OnlineSnippetTimeout: 3,
|
||||
WeightDailyDecay: 0.996,
|
||||
LanguageWeight: 0.5,
|
||||
ConsecutiveKeyWeight: 1.3,
|
||||
BacklinkWeight: 1.0,
|
||||
ServerPort: 8082,
|
||||
FlushIntervalSeconds: 60,
|
||||
},
|
||||
Backlink: BacklinkConfig{
|
||||
Baseline: 200000,
|
||||
},
|
||||
Storage: StorageConfig{
|
||||
Path: "./savedata",
|
||||
},
|
||||
Prometheus: PrometheusConfig{
|
||||
CrawlerPort: 14950,
|
||||
BacklinkPort: 14952,
|
||||
SearchPort: 14953,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
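// The accessor functions below keep the names of the former constants for
// backward compatibility, but read their values from Global. Until Load is
// called they return the defaults installed by init.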
|
||||
|
||||
func init() {
|
||||
// 初始化时设置默认值
|
||||
Global = GetDefaultConfig()
|
||||
}
|
||||
|
||||
// MaxURLsPerKey 返回配置值
|
||||
func MaxURLsPerKey() int { return Global.Index.MaxURLsPerKey }
|
||||
|
||||
// MaxSameDomainPerKey 返回配置值
|
||||
func MaxSameDomainPerKey() int { return Global.Index.MaxSameDomainPerKey }
|
||||
|
||||
// BigCleanThreshold 返回配置值
|
||||
func BigCleanThreshold() int { return Global.Index.BigCleanThreshold }
|
||||
|
||||
// MaxNewURLsPerKey 返回配置值
|
||||
func MaxNewURLsPerKey() int { return Global.Index.MaxNewURLsPerKey }
|
||||
|
||||
// MinURLsForNewKey 返回配置值
|
||||
func MinURLsForNewKey() int { return Global.Index.MinURLsForNewKey }
|
||||
|
||||
// SpiderName 返回配置值
|
||||
func SpiderName() string { return Global.Crawler.SpiderName }
|
||||
|
||||
// CrawlerCooldown 返回配置值
|
||||
func CrawlerCooldown() int { return Global.Crawler.Cooldown }
|
||||
|
||||
// CrawlerWorkers 返回配置值
|
||||
func CrawlerWorkers() int { return Global.Crawler.Workers }
|
||||
|
||||
// CrawlFocus 返回配置值
|
||||
func CrawlFocus() float64 { return Global.Crawler.CrawlFocus }
|
||||
|
||||
// MaxKeywordsPerPage 返回配置值
|
||||
func MaxKeywordsPerPage() int { return Global.Crawler.MaxKeywordsPerPage }
|
||||
|
||||
// MaxEpoch 返回配置值
|
||||
func MaxEpoch() int { return Global.Crawler.MaxEpoch }
|
||||
|
||||
// ExpectedProsperRatio 返回配置值
|
||||
func ExpectedProsperRatio() float64 { return Global.Crawler.ExpectedProsperRatio }
|
||||
|
||||
// EntryURL 返回配置值
|
||||
func EntryURL() string { return Global.Crawler.EntryURL }
|
||||
|
||||
// UseOnlineSnippet 返回配置值
|
||||
func UseOnlineSnippet() bool { return Global.Search.UseOnlineSnippet }
|
||||
|
||||
// OnlineSnippetTimeout 返回配置值
|
||||
func OnlineSnippetTimeout() int { return Global.Search.OnlineSnippetTimeout }
|
||||
|
||||
// WeightDailyDecay 返回配置值
|
||||
func WeightDailyDecay() float64 { return Global.Search.WeightDailyDecay }
|
||||
|
||||
// LanguageWeight 返回配置值
|
||||
func LanguageWeight() float64 { return Global.Search.LanguageWeight }
|
||||
|
||||
// ConsecutiveKeyWeight 返回配置值
|
||||
func ConsecutiveKeyWeight() float64 { return Global.Search.ConsecutiveKeyWeight }
|
||||
|
||||
// BacklinkWeight 返回配置值
|
||||
func BacklinkWeight() float64 { return Global.Search.BacklinkWeight }
|
||||
|
||||
// SearchServerPort 返回配置值
|
||||
func SearchServerPort() int { return Global.Search.ServerPort }
|
||||
|
||||
// FlushIntervalSeconds 返回配置值
|
||||
func FlushIntervalSeconds() int { return Global.Search.FlushIntervalSeconds }
|
||||
|
||||
// BacklinkBaseline 返回配置值
|
||||
func BacklinkBaseline() int { return Global.Backlink.Baseline }
|
||||
|
||||
// PromPortCrawler 返回配置值
|
||||
func PromPortCrawler() int { return Global.Prometheus.CrawlerPort }
|
||||
|
||||
// PromPortBacklink 返回配置值
|
||||
func PromPortBacklink() int { return Global.Prometheus.BacklinkPort }
|
||||
|
||||
// PromPortSearch 返回配置值
|
||||
func PromPortSearch() int { return Global.Prometheus.SearchPort }
|
||||
|
||||
// 为了向后兼容,保留 StoragePath 常量
|
||||
const StoragePath = "./savedata"
|
||||
|
||||
// Prometheus ports
|
||||
// 各模块 Prometheus 监控指标的 HTTP 端口
|
||||
const (
|
||||
PromPortCrawler = 14950 // 爬虫模块的 metrics 端口
|
||||
PromPortBacklink = 14952 // 反向链接计算模块的 metrics 端口
|
||||
PromPortSearch = 14953 // 搜索服务(含收获功能)模块的 metrics 端口
|
||||
)
|
||||
|
||||
+9
-7
@@ -60,7 +60,7 @@ type Crawler struct {
|
||||
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
||||
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||
return &Crawler{
|
||||
fetcher: NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second),
|
||||
fetcher: NewFetcher(config.SpiderName(), time.Duration(config.CrawlerCooldown())*time.Second),
|
||||
db: db,
|
||||
analyzer: a,
|
||||
prosperMap: prosperMap,
|
||||
@@ -124,7 +124,7 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
||||
)
|
||||
|
||||
// 信号量:限制同时并发数不超过配置的工作线程数
|
||||
sem := make(chan struct{}, config.CrawlerWorkers)
|
||||
sem := make(chan struct{}, config.CrawlerWorkers())
|
||||
for _, u := range queue {
|
||||
wg.Add(1)
|
||||
sem <- struct{}{} // 获取一个令牌(阻塞直到有空闲槽位)
|
||||
@@ -219,8 +219,9 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
|
||||
kws := c.analyzer.Analyze(title, desc, text)
|
||||
if len(kws) > 0 {
|
||||
// 限制每个页面最多发送的关键词数量
|
||||
if len(kws) > config.MaxKeywordsPerPage {
|
||||
kws = kws[:config.MaxKeywordsPerPage]
|
||||
maxKws := config.MaxKeywordsPerPage()
|
||||
if len(kws) > maxKws {
|
||||
kws = kws[:maxKws]
|
||||
}
|
||||
atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
|
||||
// 异步发送到收获服务器写入倒排索引(不阻塞爬取流程)
|
||||
@@ -383,7 +384,7 @@ func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
|
||||
|
||||
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
||||
defer cancel()
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", fmt.Sprintf("http://localhost:%d/l", config.SearchServerPort), bytes.NewReader(data))
|
||||
req, err := http.NewRequestWithContext(ctx, "POST", fmt.Sprintf("http://localhost:%d/l", config.SearchServerPort()), bytes.NewReader(data))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
@@ -454,7 +455,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
||||
selected := weightedSample(scored_list, k)
|
||||
|
||||
// 域名集中度过滤:限制每个域名被选中的数量,防止被少数网站垄断
|
||||
selected = concentrationFilter(selected, config.CrawlFocus)
|
||||
selected = concentrationFilter(selected, config.CrawlFocus())
|
||||
|
||||
// 分离 HTTPS 和 HTTP 链接,HTTP 最多占 HTTPS 的 1/4
|
||||
var httpsURLs, httpURLs []string
|
||||
@@ -480,7 +481,8 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
||||
}
|
||||
}
|
||||
// 根据目标繁荣占比计算普通 URL 应保留数量
|
||||
n := int(float64(len(prosperURLs)) * (1-config.ExpectedProsperRatio) / config.ExpectedProsperRatio)
|
||||
expectedProsperRatio := config.ExpectedProsperRatio()
|
||||
n := int(float64(len(prosperURLs)) * (1-expectedProsperRatio) / expectedProsperRatio)
|
||||
if len(otherURLs) > n {
|
||||
keep := max(len(otherURLs)-len(selected)/10, n)
|
||||
if keep < len(otherURLs) {
|
||||
|
||||
@@ -8,6 +8,7 @@ require (
|
||||
github.com/yanyiwu/gojieba v1.4.4
|
||||
go.etcd.io/bbolt v1.3.9
|
||||
golang.org/x/net v0.23.0
|
||||
gopkg.in/yaml.v3 v3.0.1
|
||||
)
|
||||
|
||||
require (
|
||||
|
||||
@@ -32,5 +32,7 @@ golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8T
|
||||
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
|
||||
google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8=
|
||||
google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
|
||||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
|
||||
@@ -103,16 +103,10 @@ prometheus:
|
||||
|
||||
// loadConfig 从 savedata/config.yml 加载配置
|
||||
func loadConfig() error {
|
||||
configPath := filepath.Join("savedata", "config.yml")
|
||||
|
||||
// 检查配置文件是否存在
|
||||
if _, err := os.Stat(configPath); os.IsNotExist(err) {
|
||||
return fmt.Errorf("config file not found: %s", configPath)
|
||||
if err := config.LoadFromSavedata(); err != nil {
|
||||
return fmt.Errorf("failed to load config: %v", err)
|
||||
}
|
||||
|
||||
// TODO: 解析 YAML 配置文件并应用到 config 包
|
||||
// 这里暂时只是检查文件存在,后续可以添加 YAML 解析逻辑
|
||||
log.Printf("Loading config from: %s", configPath)
|
||||
log.Printf("Config loaded successfully from savedata/config.yml")
|
||||
return nil
|
||||
}
|
||||
|
||||
@@ -130,8 +124,8 @@ func main() {
|
||||
// ---- 命令行参数 ----
|
||||
// --storage:存储根目录路径,默认使用 config.StoragePath
|
||||
storageDir := flag.String("storage", config.StoragePath, "path to savedata directory")
|
||||
// --entry:BFS 爬取的起始 URL,默认使用 config.EntryURL(维基百科中文首页)
|
||||
entryURL := flag.String("entry", config.EntryURL, "BFS crawl entry URL")
|
||||
// --entry:BFS 爬取的起始 URL,默认使用 config.EntryURL()(维基百科中文首页)
|
||||
entryURL := flag.String("entry", config.EntryURL(), "BFS crawl entry URL")
|
||||
// --stopwords:屏蔽词 JSON 文件路径
|
||||
stopWords := flag.String("stopwords", "../data/标点符号.json", "path to stop-words JSON")
|
||||
flag.Parse()
|
||||
@@ -161,7 +155,7 @@ func main() {
|
||||
// ---- 4. 搜索服务器(默认 :80):对外提供搜索 API,同时内嵌收获服务(统一端口)
|
||||
searchSrv := search.New(db, infoSvc, anal)
|
||||
go func() {
|
||||
addr := fmt.Sprintf(":%d", config.SearchServerPort)
|
||||
addr := fmt.Sprintf(":%d", config.SearchServerPort())
|
||||
if err := searchSrv.ListenAndServe(addr); err != nil {
|
||||
log.Fatalf("[search] fatal: %v", err)
|
||||
}
|
||||
@@ -175,7 +169,7 @@ func main() {
|
||||
// 从 info 服务获取繁荣表快照,用于调度优先级决策
|
||||
prosperMap := infoSvc.ProsperMap()
|
||||
crawl := crawler.New(db, anal, prosperMap)
|
||||
go crawl.Run(*entryURL, config.MaxEpoch)
|
||||
go crawl.Run(*entryURL, config.MaxEpoch())
|
||||
|
||||
log.Println("all modules started — press Ctrl-C to stop")
|
||||
|
||||
|
||||
+22
-18
@@ -50,7 +50,7 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
||||
analyzer: a,
|
||||
mem: make(map[string][]storage.IndexEntry),
|
||||
httpCli: &http.Client{
|
||||
Timeout: time.Duration(config.OnlineSnippetTimeout) * time.Second,
|
||||
Timeout: time.Duration(config.OnlineSnippetTimeout()) * time.Second,
|
||||
},
|
||||
}
|
||||
// 启动定期刷盘 goroutine
|
||||
@@ -60,7 +60,7 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
||||
|
||||
// runPeriodicFlush 每隔 FlushIntervalSeconds 秒触发一次刷盘。
|
||||
func (s *Server) runPeriodicFlush() {
|
||||
ticker := time.NewTicker(time.Duration(config.FlushIntervalSeconds) * time.Second)
|
||||
ticker := time.NewTicker(time.Duration(config.FlushIntervalSeconds()) * time.Second)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
s.Flush()
|
||||
@@ -514,17 +514,18 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
defVal float64 // 缺省权重(词在索引中条目已满时使用)
|
||||
}
|
||||
tokenIndexes := make([]tokenIndex, 0, len(tokens))
|
||||
maxURLsPerKey := config.MaxURLsPerKey()
|
||||
for _, t := range tokens {
|
||||
entries, _ := s.db.GetIndex(t)
|
||||
// 计算缺省权重:当条目数达到上限时,权重低于第 MaxURLsPerKey 名的条目使用缺省权重
|
||||
defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(config.MaxURLsPerKey)
|
||||
if len(entries) >= config.MaxURLsPerKey {
|
||||
defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(maxURLsPerKey)
|
||||
if len(entries) >= maxURLsPerKey {
|
||||
weights := make([]float64, len(entries))
|
||||
for i, e := range entries {
|
||||
weights[i] = float64(e.Weight)
|
||||
}
|
||||
sort.Sort(sort.Reverse(sort.Float64Slice(weights)))
|
||||
defVal = math.Max(1.0/10000, weights[config.MaxURLsPerKey-1]/2)
|
||||
defVal = math.Max(1.0/10000, weights[maxURLsPerKey-1]/2)
|
||||
}
|
||||
tokenIndexes = append(tokenIndexes, tokenIndex{t, entries, defVal})
|
||||
}
|
||||
@@ -576,7 +577,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
rel *= vp
|
||||
}
|
||||
// 反向链接繁荣加分
|
||||
prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight
|
||||
prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight()
|
||||
bad := badURL(u)
|
||||
adjust := s.infoSvc.Adjust(netloc(u))
|
||||
// 基础分数 = 相关性 × 繁荣值 × URL质量 × 人工调整
|
||||
@@ -659,7 +660,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
repMul = 1 - (h - 0.5)
|
||||
}
|
||||
// 连续词出现越多,乘以 config.ConsecutiveKeyWeight(>1)加成
|
||||
consMul := math.Pow(config.ConsecutiveKeyWeight, float64(consecutive))
|
||||
consMul := math.Pow(config.ConsecutiveKeyWeight(), float64(consecutive))
|
||||
candidates[i].scoreVec[0] *= repMul * consMul
|
||||
candidates[i].scoreVec[5] = repMul
|
||||
candidates[i].scoreVec[8] = consMul
|
||||
@@ -729,7 +730,7 @@ func (s *Server) getSnippet(rawURL string) *snippetInfo {
|
||||
snip := buildSnippet(entry)
|
||||
return snip
|
||||
}
|
||||
if !config.UseOnlineSnippet {
|
||||
if !config.UseOnlineSnippet() {
|
||||
return nil
|
||||
}
|
||||
// 在线抓取(不使用 robots.txt,适用于搜索摘要场景)
|
||||
@@ -737,7 +738,7 @@ func (s *Server) getSnippet(rawURL string) *snippetInfo {
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
req.Header.Set("User-Agent", config.SpiderName)
|
||||
req.Header.Set("User-Agent", config.SpiderName())
|
||||
resp, err := s.httpCli.Do(req)
|
||||
if err != nil || resp.StatusCode != 200 {
|
||||
return nil
|
||||
@@ -785,7 +786,8 @@ func languageMultiplier(si *storage.SiteInfo) float64 {
|
||||
}
|
||||
chinese := si.Languages["zh"] / total
|
||||
weird := (total - si.Languages["zh"] - si.Languages["en"] - si.Languages["ja"]) / total
|
||||
return 1 + chinese*config.LanguageWeight - weird*config.LanguageWeight
|
||||
languageWeight := config.LanguageWeight()
|
||||
return 1 + chinese*languageWeight - weird*languageWeight
|
||||
}
|
||||
|
||||
// timeMul 根据网站最后访问时间计算时间衰减倍数(越久远衰减越多)。
|
||||
@@ -807,7 +809,7 @@ func timeMul(si *storage.SiteInfo, now int64) float64 {
|
||||
if days > 0 {
|
||||
days-- // 跳过第一天
|
||||
}
|
||||
return math.Pow(config.WeightDailyDecay, float64(days))
|
||||
return math.Pow(config.WeightDailyDecay(), float64(days))
|
||||
}
|
||||
|
||||
// urlTimeMul 根据该 URL 的摘要抓取时间计算时间衰减倍数(30 天内不衰减)。
|
||||
@@ -820,7 +822,7 @@ func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 {
|
||||
if days <= 30 {
|
||||
return 1.0
|
||||
}
|
||||
return math.Pow((2+config.WeightDailyDecay)/3, float64(days))
|
||||
return math.Pow((2+config.WeightDailyDecay())/3, float64(days))
|
||||
}
|
||||
|
||||
// badURL 返回 URL 的"劣质"评分(0~0.9)。
|
||||
@@ -1193,7 +1195,7 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
|
||||
atomic.AddInt64(&s.rowCount, 1)
|
||||
}
|
||||
s.memMu.Unlock()
|
||||
if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold) {
|
||||
if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold()) {
|
||||
go s.Flush()
|
||||
}
|
||||
w.Write([]byte("ok"))
|
||||
@@ -1208,14 +1210,15 @@ func (s *Server) handleFlush(w http.ResponseWriter, r *http.Request) {
|
||||
// lowThreshold 返回某关键词在已有大量条目时,新条目所需的最低权重阈值。
|
||||
func (s *Server) lowThreshold(key string) float64 {
|
||||
existing, _ := s.db.GetIndex(key)
|
||||
if len(existing) < config.MaxURLsPerKey {
|
||||
maxURLsPerKey := config.MaxURLsPerKey()
|
||||
if len(existing) < maxURLsPerKey {
|
||||
return -1
|
||||
}
|
||||
weights := make([]float64, len(existing))
|
||||
for i, e := range existing {
|
||||
weights[i] = float64(e.Weight)
|
||||
}
|
||||
return nthLargest(weights, config.MaxURLsPerKey-1) * 0.05
|
||||
return nthLargest(weights, maxURLsPerKey-1) * 0.05
|
||||
}
|
||||
|
||||
// flush 将内存中的索引批量合并写入磁盘,然后清空内存。
|
||||
@@ -1269,15 +1272,16 @@ func (s *Server) flush() {
|
||||
// mergeKey 将新条目和磁盘已有条目合并后返回最终列表。
|
||||
func (s *Server) mergeKey(key string, newEntries []storage.IndexEntry) []storage.IndexEntry {
|
||||
existing, _ := s.db.GetIndex(key)
|
||||
if len(existing) == 0 && len(newEntries) < config.MinURLsForNewKey {
|
||||
if len(existing) == 0 && len(newEntries) < config.MinURLsForNewKey() {
|
||||
return nil
|
||||
}
|
||||
merged := dedup(append(newEntries, existing...))
|
||||
if rand.Float64() < 0.02 {
|
||||
merged = dedupNormalised(merged)
|
||||
}
|
||||
if float64(len(merged)) > float64(config.MaxURLsPerKey)*1.1 || rand.Float64() < 0.02 {
|
||||
merged = trim(merged, s.infoSvc, config.MaxURLsPerKey, config.MaxSameDomainPerKey)
|
||||
maxURLsPerKey := config.MaxURLsPerKey()
|
||||
if float64(len(merged)) > float64(maxURLsPerKey)*1.1 || rand.Float64() < 0.02 {
|
||||
merged = trim(merged, s.infoSvc, maxURLsPerKey, config.MaxSameDomainPerKey())
|
||||
}
|
||||
return merged
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user