fix 分词bug,添加重爬机制
This commit is contained in:
+28
-8
@@ -44,18 +44,22 @@ type CrawlerConfig struct {
|
||||
ExpectedProsperRatio float64 `yaml:"expected_prosper_ratio"`
|
||||
EntryURL string `yaml:"entry_url"`
|
||||
MaxPageSize int `yaml:"max_page_size"` // 单个页面最大抓取字节数(0=不限,默认 5MB)
|
||||
RecrawlMaxAge int `yaml:"recrawl_max_age"` // URL 过期时间(秒),超过此时间的 URL 允许被重爬,默认 30 天
|
||||
RecrawlCheckInterval int `yaml:"recrawl_check_interval"` // 运行期间检查过期 URL 的间隔(秒),默认 1 小时
|
||||
RecrawlBatchSize int `yaml:"recrawl_batch_size"` // 每次检查最多释放多少个过期 URL,默认 500
|
||||
}
|
||||
|
||||
// SearchConfig holds the search-side settings loaded from YAML: the
// result-ranking weights plus the snippet, server-port and flush-interval
// options.
//
// Each field is declared exactly once; Go rejects a struct that repeats a
// field name.
type SearchConfig struct {
	UseOnlineSnippet     bool    `yaml:"use_online_snippet"`
	OnlineSnippetTimeout int     `yaml:"online_snippet_timeout"`
	WeightDailyDecay     float64 `yaml:"weight_daily_decay"`
	LanguageWeight       float64 `yaml:"language_weight"`
	ConsecutiveKeyWeight float64 `yaml:"consecutive_key_weight"`
	BacklinkWeight       float64 `yaml:"backlink_weight"`
	ServerPort           int     `yaml:"server_port"`
	FlushIntervalSeconds int     `yaml:"flush_interval_seconds"`
	MissPenalty          float64 `yaml:"miss_penalty"` // Missing-term penalty coefficient (0 = no penalty, 1 = URLs missing a query term are ignored entirely); default 0.15.
}
|
||||
|
||||
// BacklinkConfig 反向链接计算相关配置
|
||||
@@ -120,6 +124,9 @@ func GetDefaultConfig() Config {
|
||||
ExpectedProsperRatio: 0.6,
|
||||
EntryURL: "https://zh.wikipedia.org/",
|
||||
MaxPageSize: 5 * 1024 * 1024,
|
||||
RecrawlMaxAge: 30 * 86400, // 30 天
|
||||
RecrawlCheckInterval: 3600, // 1 小时
|
||||
RecrawlBatchSize: 500,
|
||||
},
|
||||
Search: SearchConfig{
|
||||
UseOnlineSnippet: true,
|
||||
@@ -130,6 +137,7 @@ func GetDefaultConfig() Config {
|
||||
BacklinkWeight: 1.0,
|
||||
ServerPort: 50082,
|
||||
FlushIntervalSeconds: 300,
|
||||
MissPenalty: 0.15,
|
||||
},
|
||||
Backlink: BacklinkConfig{
|
||||
Baseline: 200000,
|
||||
@@ -208,6 +216,15 @@ func EntryURL() string { return Global.Crawler.EntryURL }
|
||||
// MaxPageSize 返回单个页面最大抓取字节数(0=不限)。
|
||||
func MaxPageSize() int { return Global.Crawler.MaxPageSize }
|
||||
|
||||
// RecrawlMaxAge 返回 URL 过期时间(秒),超过此时间的 URL 允许被重爬。
|
||||
func RecrawlMaxAge() int { return Global.Crawler.RecrawlMaxAge }
|
||||
|
||||
// RecrawlCheckInterval 返回运行期间检查过期 URL 的间隔(秒)。
|
||||
func RecrawlCheckInterval() int { return Global.Crawler.RecrawlCheckInterval }
|
||||
|
||||
// RecrawlBatchSize 返回每次检查最多释放的过期 URL 数量。
|
||||
func RecrawlBatchSize() int { return Global.Crawler.RecrawlBatchSize }
|
||||
|
||||
// UseOnlineSnippet 返回配置值
|
||||
func UseOnlineSnippet() bool { return Global.Search.UseOnlineSnippet }
|
||||
|
||||
@@ -232,6 +249,9 @@ func SearchServerPort() int { return Global.Search.ServerPort }
|
||||
// FlushIntervalSeconds 返回配置值
|
||||
func FlushIntervalSeconds() int { return Global.Search.FlushIntervalSeconds }
|
||||
|
||||
// MissPenalty 返回缺词惩罚系数(0~1),值越大对缺少查询词的 URL 惩罚越重。
|
||||
func MissPenalty() float64 { return Global.Search.MissPenalty }
|
||||
|
||||
// BacklinkBaseline 返回配置值
|
||||
func BacklinkBaseline() int { return Global.Backlink.Baseline }
|
||||
|
||||
|
||||
Reference in New Issue
Block a user