This commit is contained in:
2026-04-08 23:35:50 +08:00
parent 422a69397a
commit 7abcca6836
6 changed files with 257 additions and 85 deletions
+22 -18
View File
@@ -50,7 +50,7 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
analyzer: a,
mem: make(map[string][]storage.IndexEntry),
httpCli: &http.Client{
Timeout: time.Duration(config.OnlineSnippetTimeout) * time.Second,
Timeout: time.Duration(config.OnlineSnippetTimeout()) * time.Second,
},
}
// 启动定期刷盘 goroutine
@@ -60,7 +60,7 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
// runPeriodicFlush 每隔 FlushIntervalSeconds 秒触发一次刷盘。
func (s *Server) runPeriodicFlush() {
ticker := time.NewTicker(time.Duration(config.FlushIntervalSeconds) * time.Second)
ticker := time.NewTicker(time.Duration(config.FlushIntervalSeconds()) * time.Second)
defer ticker.Stop()
for range ticker.C {
s.Flush()
@@ -514,17 +514,18 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
defVal float64 // 缺省权重(词在索引中条目已满时使用)
}
tokenIndexes := make([]tokenIndex, 0, len(tokens))
maxURLsPerKey := config.MaxURLsPerKey()
for _, t := range tokens {
entries, _ := s.db.GetIndex(t)
// 计算缺省权重:当条目数达到上限时,权重低于第 MaxURLsPerKey 名的条目使用缺省权重
defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(config.MaxURLsPerKey)
if len(entries) >= config.MaxURLsPerKey {
defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(maxURLsPerKey)
if len(entries) >= maxURLsPerKey {
weights := make([]float64, len(entries))
for i, e := range entries {
weights[i] = float64(e.Weight)
}
sort.Sort(sort.Reverse(sort.Float64Slice(weights)))
defVal = math.Max(1.0/10000, weights[config.MaxURLsPerKey-1]/2)
defVal = math.Max(1.0/10000, weights[maxURLsPerKey-1]/2)
}
tokenIndexes = append(tokenIndexes, tokenIndex{t, entries, defVal})
}
@@ -576,7 +577,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
rel *= vp
}
// 反向链接繁荣加分
prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight
prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight()
bad := badURL(u)
adjust := s.infoSvc.Adjust(netloc(u))
// 基础分数 = 相关性 × 繁荣值 × URL质量 × 人工调整
@@ -659,7 +660,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
repMul = 1 - (h - 0.5)
}
// The more consecutive query terms appear, the larger the bonus: multiply by config.ConsecutiveKeyWeight (> 1) raised to the consecutive count.
consMul := math.Pow(config.ConsecutiveKeyWeight, float64(consecutive))
consMul := math.Pow(config.ConsecutiveKeyWeight(), float64(consecutive))
candidates[i].scoreVec[0] *= repMul * consMul
candidates[i].scoreVec[5] = repMul
candidates[i].scoreVec[8] = consMul
@@ -729,7 +730,7 @@ func (s *Server) getSnippet(rawURL string) *snippetInfo {
snip := buildSnippet(entry)
return snip
}
if !config.UseOnlineSnippet {
if !config.UseOnlineSnippet() {
return nil
}
// 在线抓取(不使用 robots.txt,适用于搜索摘要场景)
@@ -737,7 +738,7 @@ func (s *Server) getSnippet(rawURL string) *snippetInfo {
if err != nil {
return nil
}
req.Header.Set("User-Agent", config.SpiderName)
req.Header.Set("User-Agent", config.SpiderName())
resp, err := s.httpCli.Do(req)
if err != nil || resp.StatusCode != 200 {
return nil
@@ -785,7 +786,8 @@ func languageMultiplier(si *storage.SiteInfo) float64 {
}
chinese := si.Languages["zh"] / total
weird := (total - si.Languages["zh"] - si.Languages["en"] - si.Languages["ja"]) / total
return 1 + chinese*config.LanguageWeight - weird*config.LanguageWeight
languageWeight := config.LanguageWeight()
return 1 + chinese*languageWeight - weird*languageWeight
}
// timeMul 根据网站最后访问时间计算时间衰减倍数(越久远衰减越多)。
@@ -807,7 +809,7 @@ func timeMul(si *storage.SiteInfo, now int64) float64 {
if days > 0 {
days-- // 跳过第一天
}
return math.Pow(config.WeightDailyDecay, float64(days))
return math.Pow(config.WeightDailyDecay(), float64(days))
}
// urlTimeMul 根据该 URL 的摘要抓取时间计算时间衰减倍数(30 天内不衰减)。
@@ -820,7 +822,7 @@ func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 {
if days <= 30 {
return 1.0
}
return math.Pow((2+config.WeightDailyDecay)/3, float64(days))
return math.Pow((2+config.WeightDailyDecay())/3, float64(days))
}
// badURL 返回 URL 的"劣质"评分(0~0.9)。
@@ -1193,7 +1195,7 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
atomic.AddInt64(&s.rowCount, 1)
}
s.memMu.Unlock()
if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold) {
if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold()) {
go s.Flush()
}
w.Write([]byte("ok"))
@@ -1208,14 +1210,15 @@ func (s *Server) handleFlush(w http.ResponseWriter, r *http.Request) {
// lowThreshold returns the minimum weight a new entry must reach for a keyword
// that already holds a large number of entries.
//
// It loads the entries currently stored for key through s.db.GetIndex (the
// error result is discarded). When fewer entries than the configured per-key
// URL cap are stored it returns -1 (presumably meaning "no threshold" — verify
// against callers). Otherwise it copies every stored weight into a float64
// slice and returns 0.05 times whatever nthLargest yields for the argument
// maxURLsPerKey-1.
//
// NOTE(review): this listing looks like a rendered diff without +/- markers, so
// a superseded line and its replacement appear back to back below; the forms
// that call config.MaxURLsPerKey() appear to be the post-change side — confirm
// against the commit before treating this span as compilable code.
func (s *Server) lowThreshold(key string) float64 {
existing, _ := s.db.GetIndex(key)
if len(existing) < config.MaxURLsPerKey {
// Read the cap once; both the comparison and the nthLargest argument below
// use this local.
maxURLsPerKey := config.MaxURLsPerKey()
if len(existing) < maxURLsPerKey {
return -1
}
// Widen each stored weight to float64 so nthLargest can operate on the slice.
weights := make([]float64, len(existing))
for i, e := range existing {
weights[i] = float64(e.Weight)
}
return nthLargest(weights, config.MaxURLsPerKey-1) * 0.05
return nthLargest(weights, maxURLsPerKey-1) * 0.05
}
// flush 将内存中的索引批量合并写入磁盘,然后清空内存。
@@ -1269,15 +1272,16 @@ func (s *Server) flush() {
// mergeKey merges newEntries with the entries already stored for key and
// returns the final list.
//
// It returns nil when nothing is stored for key and fewer than
// config.MinURLsForNewKey() new entries were supplied. Otherwise the new and
// existing entries are concatenated (new entries first) and passed through
// dedup. With probability 0.02 the result is additionally passed through
// dedupNormalised. The result is passed through trim when its length exceeds
// 1.1 times the per-key URL cap or, independently, with probability 0.02.
// The error result of s.db.GetIndex is discarded.
//
// NOTE(review): this listing looks like a rendered diff without +/- markers, so
// superseded lines and their replacements appear back to back below; the forms
// that call config.*() appear to be the post-change side — confirm against the
// commit before treating this span as compilable code.
func (s *Server) mergeKey(key string, newEntries []storage.IndexEntry) []storage.IndexEntry {
existing, _ := s.db.GetIndex(key)
if len(existing) == 0 && len(newEntries) < config.MinURLsForNewKey {
if len(existing) == 0 && len(newEntries) < config.MinURLsForNewKey() {
return nil
}
// NOTE(review): append may write into newEntries' backing array when it has
// spare capacity — confirm callers do not reuse newEntries afterwards.
merged := dedup(append(newEntries, existing...))
// Random 2% sampling: the normalised dedup pass runs only occasionally.
if rand.Float64() < 0.02 {
merged = dedupNormalised(merged)
}
if float64(len(merged)) > float64(config.MaxURLsPerKey)*1.1 || rand.Float64() < 0.02 {
merged = trim(merged, s.infoSvc, config.MaxURLsPerKey, config.MaxSameDomainPerKey)
// Read the cap once; the overflow check and the trim call both use this local.
maxURLsPerKey := config.MaxURLsPerKey()
// Trim when the list is more than 10% over the cap, or on a random 2% of calls.
if float64(len(merged)) > float64(maxURLsPerKey)*1.1 || rand.Float64() < 0.02 {
merged = trim(merged, s.infoSvc, maxURLsPerKey, config.MaxSameDomainPerKey())
}
return merged
}