up
This commit is contained in:
+22
-18
@@ -50,7 +50,7 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
||||
analyzer: a,
|
||||
mem: make(map[string][]storage.IndexEntry),
|
||||
httpCli: &http.Client{
|
||||
Timeout: time.Duration(config.OnlineSnippetTimeout) * time.Second,
|
||||
Timeout: time.Duration(config.OnlineSnippetTimeout()) * time.Second,
|
||||
},
|
||||
}
|
||||
// 启动定期刷盘 goroutine
|
||||
@@ -60,7 +60,7 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
||||
|
||||
// runPeriodicFlush 每隔 FlushIntervalSeconds 秒触发一次刷盘。
|
||||
func (s *Server) runPeriodicFlush() {
|
||||
ticker := time.NewTicker(time.Duration(config.FlushIntervalSeconds) * time.Second)
|
||||
ticker := time.NewTicker(time.Duration(config.FlushIntervalSeconds()) * time.Second)
|
||||
defer ticker.Stop()
|
||||
for range ticker.C {
|
||||
s.Flush()
|
||||
@@ -514,17 +514,18 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
defVal float64 // 缺省权重(词在索引中条目已满时使用)
|
||||
}
|
||||
tokenIndexes := make([]tokenIndex, 0, len(tokens))
|
||||
maxURLsPerKey := config.MaxURLsPerKey()
|
||||
for _, t := range tokens {
|
||||
entries, _ := s.db.GetIndex(t)
|
||||
// 计算缺省权重:当条目数达到上限时,权重低于第 MaxURLsPerKey 名的条目使用缺省权重
|
||||
defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(config.MaxURLsPerKey)
|
||||
if len(entries) >= config.MaxURLsPerKey {
|
||||
defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(maxURLsPerKey)
|
||||
if len(entries) >= maxURLsPerKey {
|
||||
weights := make([]float64, len(entries))
|
||||
for i, e := range entries {
|
||||
weights[i] = float64(e.Weight)
|
||||
}
|
||||
sort.Sort(sort.Reverse(sort.Float64Slice(weights)))
|
||||
defVal = math.Max(1.0/10000, weights[config.MaxURLsPerKey-1]/2)
|
||||
defVal = math.Max(1.0/10000, weights[maxURLsPerKey-1]/2)
|
||||
}
|
||||
tokenIndexes = append(tokenIndexes, tokenIndex{t, entries, defVal})
|
||||
}
|
||||
@@ -576,7 +577,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
rel *= vp
|
||||
}
|
||||
// 反向链接繁荣加分
|
||||
prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight
|
||||
prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight()
|
||||
bad := badURL(u)
|
||||
adjust := s.infoSvc.Adjust(netloc(u))
|
||||
// 基础分数 = 相关性 × 繁荣值 × URL质量 × 人工调整
|
||||
@@ -659,7 +660,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
repMul = 1 - (h - 0.5)
|
||||
}
|
||||
// 连续词出现越多,乘以 config.ConsecutiveKeyWeight(>1)加成
|
||||
consMul := math.Pow(config.ConsecutiveKeyWeight, float64(consecutive))
|
||||
consMul := math.Pow(config.ConsecutiveKeyWeight(), float64(consecutive))
|
||||
candidates[i].scoreVec[0] *= repMul * consMul
|
||||
candidates[i].scoreVec[5] = repMul
|
||||
candidates[i].scoreVec[8] = consMul
|
||||
@@ -729,7 +730,7 @@ func (s *Server) getSnippet(rawURL string) *snippetInfo {
|
||||
snip := buildSnippet(entry)
|
||||
return snip
|
||||
}
|
||||
if !config.UseOnlineSnippet {
|
||||
if !config.UseOnlineSnippet() {
|
||||
return nil
|
||||
}
|
||||
// 在线抓取(不使用 robots.txt,适用于搜索摘要场景)
|
||||
@@ -737,7 +738,7 @@ func (s *Server) getSnippet(rawURL string) *snippetInfo {
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
req.Header.Set("User-Agent", config.SpiderName)
|
||||
req.Header.Set("User-Agent", config.SpiderName())
|
||||
resp, err := s.httpCli.Do(req)
|
||||
if err != nil || resp.StatusCode != 200 {
|
||||
return nil
|
||||
@@ -785,7 +786,8 @@ func languageMultiplier(si *storage.SiteInfo) float64 {
|
||||
}
|
||||
chinese := si.Languages["zh"] / total
|
||||
weird := (total - si.Languages["zh"] - si.Languages["en"] - si.Languages["ja"]) / total
|
||||
return 1 + chinese*config.LanguageWeight - weird*config.LanguageWeight
|
||||
languageWeight := config.LanguageWeight()
|
||||
return 1 + chinese*languageWeight - weird*languageWeight
|
||||
}
|
||||
|
||||
// timeMul 根据网站最后访问时间计算时间衰减倍数(越久远衰减越多)。
|
||||
@@ -807,7 +809,7 @@ func timeMul(si *storage.SiteInfo, now int64) float64 {
|
||||
if days > 0 {
|
||||
days-- // 跳过第一天
|
||||
}
|
||||
return math.Pow(config.WeightDailyDecay, float64(days))
|
||||
return math.Pow(config.WeightDailyDecay(), float64(days))
|
||||
}
|
||||
|
||||
// urlTimeMul 根据该 URL 的摘要抓取时间计算时间衰减倍数(30 天内不衰减)。
|
||||
@@ -820,7 +822,7 @@ func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 {
|
||||
if days <= 30 {
|
||||
return 1.0
|
||||
}
|
||||
return math.Pow((2+config.WeightDailyDecay)/3, float64(days))
|
||||
return math.Pow((2+config.WeightDailyDecay())/3, float64(days))
|
||||
}
|
||||
|
||||
// badURL 返回 URL 的"劣质"评分(0~0.9)。
|
||||
@@ -1193,7 +1195,7 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
|
||||
atomic.AddInt64(&s.rowCount, 1)
|
||||
}
|
||||
s.memMu.Unlock()
|
||||
if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold) {
|
||||
if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold()) {
|
||||
go s.Flush()
|
||||
}
|
||||
w.Write([]byte("ok"))
|
||||
@@ -1208,14 +1210,15 @@ func (s *Server) handleFlush(w http.ResponseWriter, r *http.Request) {
|
||||
// lowThreshold 返回某关键词在已有大量条目时,新条目所需的最低权重阈值。
|
||||
func (s *Server) lowThreshold(key string) float64 {
|
||||
existing, _ := s.db.GetIndex(key)
|
||||
if len(existing) < config.MaxURLsPerKey {
|
||||
maxURLsPerKey := config.MaxURLsPerKey()
|
||||
if len(existing) < maxURLsPerKey {
|
||||
return -1
|
||||
}
|
||||
weights := make([]float64, len(existing))
|
||||
for i, e := range existing {
|
||||
weights[i] = float64(e.Weight)
|
||||
}
|
||||
return nthLargest(weights, config.MaxURLsPerKey-1) * 0.05
|
||||
return nthLargest(weights, maxURLsPerKey-1) * 0.05
|
||||
}
|
||||
|
||||
// flush 将内存中的索引批量合并写入磁盘,然后清空内存。
|
||||
@@ -1269,15 +1272,16 @@ func (s *Server) flush() {
|
||||
// mergeKey 将新条目和磁盘已有条目合并后返回最终列表。
|
||||
func (s *Server) mergeKey(key string, newEntries []storage.IndexEntry) []storage.IndexEntry {
|
||||
existing, _ := s.db.GetIndex(key)
|
||||
if len(existing) == 0 && len(newEntries) < config.MinURLsForNewKey {
|
||||
if len(existing) == 0 && len(newEntries) < config.MinURLsForNewKey() {
|
||||
return nil
|
||||
}
|
||||
merged := dedup(append(newEntries, existing...))
|
||||
if rand.Float64() < 0.02 {
|
||||
merged = dedupNormalised(merged)
|
||||
}
|
||||
if float64(len(merged)) > float64(config.MaxURLsPerKey)*1.1 || rand.Float64() < 0.02 {
|
||||
merged = trim(merged, s.infoSvc, config.MaxURLsPerKey, config.MaxSameDomainPerKey)
|
||||
maxURLsPerKey := config.MaxURLsPerKey()
|
||||
if float64(len(merged)) > float64(maxURLsPerKey)*1.1 || rand.Float64() < 0.02 {
|
||||
merged = trim(merged, s.infoSvc, maxURLsPerKey, config.MaxSameDomainPerKey())
|
||||
}
|
||||
return merged
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user