加上中文注释
This commit is contained in:
+130
-90
@@ -1,35 +1,36 @@
|
||||
// Package search implements the user-facing search HTTP server.
|
||||
// search 包对外提供 HTTP 搜索服务,接收查询请求并返回按多因子排序的搜索结果。
|
||||
package search
|
||||
|
||||
import (
|
||||
"container/heap"
|
||||
"encoding/json"
|
||||
"log"
|
||||
"math"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
"container/heap" // 堆结构(域名交错排序)
|
||||
"encoding/json" // JSON 序列化(响应输出)
|
||||
"log" // 日志
|
||||
"math" // 数学运算(Log、幂)
|
||||
"net/http" // HTTP 服务端
|
||||
"net/url" // URL 解析
|
||||
"regexp" // 正则表达式(site: 过滤语法)
|
||||
"sort" // 排序
|
||||
"strings" // 字符串操作
|
||||
"sync" // 互斥锁(保护并发切片写入)
|
||||
"time" // 时间戳
|
||||
|
||||
"sese-engine/analyzer"
|
||||
"sese-engine/config"
|
||||
"sese-engine/info"
|
||||
"sese-engine/parser"
|
||||
"sese-engine/storage"
|
||||
"sese-engine/analyzer" // 分词和语种检测
|
||||
"sese-engine/config" // 排序权重配置
|
||||
"sese-engine/info" // info 服务
|
||||
"sese-engine/parser" // HTML 解析(在线摘要)
|
||||
"sese-engine/storage" // 持久化存储
|
||||
)
|
||||
|
||||
// Server is the search HTTP server.
|
||||
// Server 是搜索 HTTP 服务器。
|
||||
type Server struct {
|
||||
db *storage.DB
|
||||
infoSvc *info.Service
|
||||
analyzer *analyzer.Analyzer
|
||||
httpCli *http.Client // for online snippet fetching
|
||||
httpCli *http.Client // 在线摘要抓取(无 robots.txt 检查)
|
||||
}
|
||||
|
||||
// New creates a search Server.
|
||||
// New 创建一个 search Server。
|
||||
func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
||||
return &Server{
|
||||
db: db,
|
||||
@@ -41,51 +42,59 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
||||
}
|
||||
}
|
||||
|
||||
// Handler returns the http.Handler.
|
||||
// Handler 返回 HTTP 路由处理器。
|
||||
func (s *Server) Handler() http.Handler {
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/search", s.handleSearch)
|
||||
return mux
|
||||
}
|
||||
|
||||
// ListenAndServe starts the search server.
|
||||
// ListenAndServe 启动搜索服务器。
|
||||
func (s *Server) ListenAndServe(addr string) error {
|
||||
log.Printf("[search] listening on %s", addr)
|
||||
return http.ListenAndServe(addr, s.Handler())
|
||||
}
|
||||
|
||||
// ---- search handler ----
|
||||
// ---- 搜索处理器 ----
|
||||
|
||||
// searchResponse 是搜索 API 的 JSON 响应结构。
|
||||
type searchResponse struct {
|
||||
Tokens []string `json:"tokens"`
|
||||
Counts map[string]int `json:"counts"`
|
||||
Results []searchResult `json:"results"`
|
||||
Total int `json:"total"`
|
||||
Tokens []string `json:"tokens"` // 查询的分词结果
|
||||
Counts map[string]int `json:"counts"` // 每个词在索引中出现的 URL 数量
|
||||
Results []searchResult `json:"results"` // 排序后的搜索结果列表
|
||||
Total int `json:"total"` // 候选 URL 总数(指定 site: 过滤时为过滤后的数量)
|
||||
}
|
||||
|
||||
// searchResult 是单条搜索结果。
|
||||
type searchResult struct {
|
||||
Score float64 `json:"score"`
|
||||
URL string `json:"url"`
|
||||
Snippet *snippetInfo `json:"snippet,omitempty"`
|
||||
Relevance map[string]float64 `json:"relevance"`
|
||||
DomainCount int `json:"domain_count"`
|
||||
Factors map[string]float64 `json:"factors,omitempty"`
|
||||
Score float64 `json:"score"` // 综合排序分数
|
||||
URL string `json:"url"` // 页面 URL
|
||||
Snippet *snippetInfo `json:"snippet,omitempty"` // 摘要信息(标题/描述/正文)
|
||||
Relevance map[string]float64 `json:"relevance"` // 每个关键词在该 URL 下的权重
|
||||
DomainCount int `json:"domain_count"` // 该 URL 所属域名的总候选数
|
||||
Factors map[string]float64 `json:"factors,omitempty"` // 各排序因子的详细分数
|
||||
}
|
||||
|
||||
// snippetInfo 封装页面摘要的标题、描述和正文片段。
|
||||
type snippetInfo struct {
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
Text string `json:"text"`
|
||||
Title string `json:"title"` // 页面标题
|
||||
Description string `json:"description"` // meta description
|
||||
Text string `json:"text"` // 正文前 256 字符
|
||||
}
|
||||
|
||||
// siteRe 用于匹配 site: 过滤语法的正则(支持 site:example.com 语法)。
|
||||
var siteRe = regexp.MustCompile(`^site:(.+)$`)
|
||||
|
||||
// handleSearch 处理 GET /search 请求。
|
||||
// 参数:q(查询词),qh(URL 编码的查询词),slice(分页范围,格式 "from:to")。
|
||||
func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Access-Control-Allow-Origin", "*")
|
||||
w.Header().Set("Access-Control-Allow-Origin", "*") // 允许跨域
|
||||
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||
|
||||
// 获取查询词
|
||||
q := r.URL.Query().Get("q")
|
||||
if q == "" {
|
||||
// qh:URL 编码的查询词(用于含特殊字符的查询)
|
||||
if qh := r.URL.Query().Get("qh"); qh != "" {
|
||||
decoded, err := url.PathUnescape(qh)
|
||||
if err == nil {
|
||||
@@ -94,7 +103,7 @@ func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
}
|
||||
|
||||
// Parse slice param "0:10"
|
||||
// 解析分页参数(格式 "0:10")
|
||||
sliceStr := r.URL.Query().Get("slice")
|
||||
sliceFrom, sliceTo := 0, 10
|
||||
if sliceStr != "" {
|
||||
@@ -108,29 +117,30 @@ func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
}
|
||||
|
||||
// Parse tokens and site filter
|
||||
// 解析查询分词,并提取 site: 过滤条件
|
||||
var tokens []string
|
||||
var siteFilter string
|
||||
for _, part := range strings.Fields(q) {
|
||||
if m := siteRe.FindStringSubmatch(part); len(m) > 1 {
|
||||
siteFilter = m[1]
|
||||
siteFilter = m[1] // site:example.com 提取目标主机名
|
||||
} else {
|
||||
segs := s.analyzer.Segment(part, false)
|
||||
for _, t := range segs {
|
||||
if !s.infoSvc.IsBlocked(t) {
|
||||
if !s.infoSvc.IsBlocked(t) { // 过滤屏蔽词
|
||||
tokens = append(tokens, t)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// 最多保留 20 个词(避免查询过于宽泛)
|
||||
if len(tokens) > 20 {
|
||||
tokens = tokens[:20]
|
||||
}
|
||||
|
||||
results, total := s.query(tokens, sliceFrom, sliceTo, siteFilter)
|
||||
|
||||
// Count per keyword
|
||||
// 统计每个词命中的 URL 数量(供前端展示)
|
||||
counts := make(map[string]int, len(tokens))
|
||||
for _, t := range tokens {
|
||||
entries, _ := s.db.GetIndex(t)
|
||||
@@ -146,21 +156,23 @@ func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
|
||||
json.NewEncoder(w).Encode(resp)
|
||||
}
|
||||
|
||||
// query executes the multi-keyword search and returns ranked results.
|
||||
// query 执行多关键词搜索,返回排序后的结果列表。
|
||||
// 搜索流程:加载倒排索引 → 构建 URL 候选集 → 多因子评分 → 域名交错重排 → 截取分页。
|
||||
func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]searchResult, int) {
|
||||
if len(tokens) == 0 {
|
||||
return nil, 0
|
||||
}
|
||||
|
||||
// Load inverted index for each token
|
||||
// 加载每个词对应的倒排索引条目
|
||||
type tokenIndex struct {
|
||||
token string
|
||||
entries []storage.IndexEntry
|
||||
defVal float64
|
||||
defVal float64 // 缺省权重(词在索引中条目已满时使用)
|
||||
}
|
||||
tokenIndexes := make([]tokenIndex, 0, len(tokens))
|
||||
for _, t := range tokens {
|
||||
entries, _ := s.db.GetIndex(t)
|
||||
// 计算缺省权重:当条目数达到上限时,权重低于第 MaxURLsPerKey 名的条目使用缺省权重
|
||||
defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(config.MaxURLsPerKey)
|
||||
if len(entries) >= config.MaxURLsPerKey {
|
||||
weights := make([]float64, len(entries))
|
||||
@@ -173,7 +185,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
tokenIndexes = append(tokenIndexes, tokenIndex{t, entries, defVal})
|
||||
}
|
||||
|
||||
// Build URL → per-token weights map
|
||||
// 构建 URL → (词 → 权重) 映射,收集所有候选 URL
|
||||
urlWeights := make(map[string]map[string]float64)
|
||||
for _, ti := range tokenIndexes {
|
||||
for _, e := range ti.entries {
|
||||
@@ -184,7 +196,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
}
|
||||
}
|
||||
|
||||
// Site filter
|
||||
// site: 过滤
|
||||
total := len(urlWeights)
|
||||
if siteFilter != "" {
|
||||
filtered := make(map[string]map[string]float64)
|
||||
@@ -198,15 +210,16 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
total = len(urlWeights)
|
||||
}
|
||||
|
||||
// Build default value map
|
||||
// 构建每个词对应的缺省权重 map
|
||||
defVals := make(map[string]float64, len(tokenIndexes))
|
||||
for _, ti := range tokenIndexes {
|
||||
defVals[ti.token] = ti.defVal
|
||||
}
|
||||
|
||||
// Compute relevance + initial score for each URL
|
||||
// 计算每个 URL 的相关性和初始分数
|
||||
candidates := make([]candidate, 0, len(urlWeights))
|
||||
for u, vs := range urlWeights {
|
||||
// 词权重相乘(贝叶斯概率近似),缺省权重填充缺失词
|
||||
rel := 1.0
|
||||
for _, ti := range tokenIndexes {
|
||||
vp := vs[ti.token]
|
||||
@@ -218,34 +231,37 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
}
|
||||
rel *= vp
|
||||
}
|
||||
// 反向链接繁荣加分
|
||||
prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight
|
||||
bad := badURL(u)
|
||||
adjust := s.infoSvc.Adjust(netloc(u))
|
||||
// 基础分数 = 相关性 × 繁荣值 × URL质量 × 人工调整
|
||||
score := rel * prosper * (1 - bad) * adjust * 0.1
|
||||
|
||||
// 12 维分数向量:分别记录各项因子,供后续多阶段调整
|
||||
var vec [12]float64
|
||||
vec[0] = score
|
||||
vec[1] = rel
|
||||
vec[2] = prosper
|
||||
vec[3] = 1 - bad
|
||||
vec[4] = 1 // language multiplier placeholder
|
||||
vec[5] = 1 // repetition placeholder
|
||||
vec[6] = adjust
|
||||
vec[7] = 1 // time multiplier placeholder
|
||||
vec[8] = 1 // consecutive keyword placeholder
|
||||
vec[9] = 1 // keyword content placeholder
|
||||
vec[10] = 1 // URL time placeholder
|
||||
vec[11] = 0.1
|
||||
vec[0] = score // 0: 综合分数
|
||||
vec[1] = rel // 1: 相关性
|
||||
vec[2] = prosper // 2: 繁荣值
|
||||
vec[3] = 1 - bad // 3: URL 质量
|
||||
vec[4] = 1 // 4: 语种倍数(待填充)
|
||||
vec[5] = 1 // 5: 重复惩罚(待填充)
|
||||
vec[6] = adjust // 6: 人工调整
|
||||
vec[7] = 1 // 7: 网站时间衰减(待填充)
|
||||
vec[8] = 1 // 8: 连续词加成(待填充)
|
||||
vec[9] = 1 // 9: 关键词内容(预留)
|
||||
vec[10] = 1 // 10: URL 时间衰减(待填充)
|
||||
vec[11] = 0.1 // 11: 常数因子
|
||||
|
||||
candidates = append(candidates, candidate{u, rel, vec})
|
||||
}
|
||||
|
||||
// Early relevance threshold
|
||||
// 初步排序
|
||||
sort.Slice(candidates, func(i, j int) bool {
|
||||
return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
|
||||
})
|
||||
|
||||
// Apply site info factors to top 256
|
||||
// 阶段一:加载网站信息,计算语种倍数和时间衰减(Top 256 并发)
|
||||
now := time.Now().Unix()
|
||||
limit256 := 256
|
||||
if len(candidates) < 256 {
|
||||
@@ -264,6 +280,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
timeMul := timeMul(siteInfo, now)
|
||||
urlTimeMul := urlTimeMul(s.db, c.url, now)
|
||||
|
||||
// 更新综合分数和各项因子
|
||||
c.scoreVec[0] = c.scoreVec[0] * 10 * langMul * timeMul * urlTimeMul
|
||||
c.scoreVec[4] = langMul
|
||||
c.scoreVec[7] = timeMul
|
||||
@@ -276,7 +293,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
|
||||
})
|
||||
|
||||
// Apply consecutive-keyword and repetition bonuses to top 80
|
||||
// 阶段二:连续词加成和标题重复惩罚(Top 80)
|
||||
limit80 := 80
|
||||
if len(candidates) < 80 {
|
||||
limit80 = len(candidates)
|
||||
@@ -289,7 +306,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
}
|
||||
}
|
||||
|
||||
// Repetition penaliser
|
||||
// 重复惩罚:与前序结果标题相似度过高则降权
|
||||
for i := 0; i < limit80; i++ {
|
||||
h := repetitionSimilarity(titles, i)
|
||||
consecutive := consecutiveCount(titles[i], tokens)
|
||||
@@ -297,6 +314,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
if h > 0.5 {
|
||||
repMul = 1 - (h - 0.5)
|
||||
}
|
||||
// 连续词出现越多,乘以 config.ConsecutiveKeyWeight(>1)加成
|
||||
consMul := math.Pow(config.ConsecutiveKeyWeight, float64(consecutive))
|
||||
candidates[i].scoreVec[0] *= repMul * consMul
|
||||
candidates[i].scoreVec[5] = repMul
|
||||
@@ -307,10 +325,10 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
|
||||
})
|
||||
|
||||
// Re-rank: interleave domains
|
||||
// 阶段三:域名交错重排(使结果更丰富多样)
|
||||
reranked := rerank(candidates, from, to)
|
||||
|
||||
// Fetch snippets and build output
|
||||
// 并发获取摘要
|
||||
results := make([]searchResult, 0, len(reranked))
|
||||
var snippetMu sync.Mutex
|
||||
var snippetWg sync.WaitGroup
|
||||
@@ -321,21 +339,21 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
defer snippetWg.Done()
|
||||
snip := s.getSnippet(cand.url)
|
||||
r := searchResult{
|
||||
Score: cand.scoreVec[0],
|
||||
URL: unescapeURL(cand.url),
|
||||
Score: cand.scoreVec[0],
|
||||
URL: unescapeURL(cand.url),
|
||||
Snippet: snip,
|
||||
Relevance: make(map[string]float64),
|
||||
DomainCount: 0,
|
||||
Factors: map[string]float64{
|
||||
"relevance": cand.scoreVec[1],
|
||||
"backlink": cand.scoreVec[2],
|
||||
"url_quality": cand.scoreVec[3],
|
||||
"language": cand.scoreVec[4],
|
||||
"repetition": cand.scoreVec[5],
|
||||
"adjust": cand.scoreVec[6],
|
||||
"site_time": cand.scoreVec[7],
|
||||
"consecutive": cand.scoreVec[8],
|
||||
"url_time": cand.scoreVec[10],
|
||||
"relevance": cand.scoreVec[1],
|
||||
"backlink": cand.scoreVec[2],
|
||||
"url_quality": cand.scoreVec[3],
|
||||
"language": cand.scoreVec[4],
|
||||
"repetition": cand.scoreVec[5],
|
||||
"adjust": cand.scoreVec[6],
|
||||
"site_time": cand.scoreVec[7],
|
||||
"consecutive": cand.scoreVec[8],
|
||||
"url_time": cand.scoreVec[10],
|
||||
},
|
||||
}
|
||||
for _, ti := range tokenIndexes {
|
||||
@@ -348,7 +366,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
}
|
||||
snippetWg.Wait()
|
||||
|
||||
// Preserve order (goroutines may reorder)
|
||||
// 保持 rerank 的原始顺序(并发写入打乱了顺序)
|
||||
urlOrder := make(map[string]int)
|
||||
for i, c := range reranked {
|
||||
urlOrder[c.url] = i
|
||||
@@ -360,9 +378,9 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
||||
return results, total
|
||||
}
|
||||
|
||||
// getSnippet fetches (or caches) a snippet for a URL.
|
||||
// getSnippet 获取某 URL 的摘要,优先从缓存读取,缓存未命中则在线抓取。
|
||||
func (s *Server) getSnippet(rawURL string) *snippetInfo {
|
||||
// Try cache first
|
||||
// 优先读缓存
|
||||
if entry, err := s.db.GetSnippet(rawURL); err == nil {
|
||||
snip := buildSnippet(entry)
|
||||
return snip
|
||||
@@ -370,7 +388,7 @@ func (s *Server) getSnippet(rawURL string) *snippetInfo {
|
||||
if !config.UseOnlineSnippet {
|
||||
return nil
|
||||
}
|
||||
// Fetch online with a simple HTTP client (no robots.txt check for search snippets)
|
||||
// 在线抓取(不使用 robots.txt,适用于搜索摘要场景)
|
||||
req, err := http.NewRequest("GET", rawURL, nil)
|
||||
if err != nil {
|
||||
return nil
|
||||
@@ -409,8 +427,10 @@ func buildSnippet(entry *storage.SnippetEntry) *snippetInfo {
|
||||
}
|
||||
}
|
||||
|
||||
// ---- scoring helpers ----
|
||||
// ---- 评分辅助函数 ----
|
||||
|
||||
// languageMultiplier 根据网站的语种分布计算语种倍数。
|
||||
// 中文占比越高且无关语言占比越低,则倍数越高(加分);否则降权。
|
||||
func languageMultiplier(si *storage.SiteInfo) float64 {
|
||||
if si == nil || len(si.Languages) == 0 {
|
||||
return 1.0
|
||||
@@ -424,27 +444,29 @@ func languageMultiplier(si *storage.SiteInfo) float64 {
|
||||
return 1 + chinese*config.LanguageWeight - weird*config.LanguageWeight
|
||||
}
|
||||
|
||||
// timeMul 根据网站最后访问时间计算时间衰减倍数(越久远衰减越多)。
|
||||
func timeMul(si *storage.SiteInfo, now int64) float64 {
|
||||
if si == nil {
|
||||
return 1.0
|
||||
}
|
||||
t := si.LastVisitTime
|
||||
if t == 0 {
|
||||
t = 1648000000
|
||||
t = 1648000000 // 默认时间戳(2022 年初)
|
||||
}
|
||||
days := (now - t) / (3600 * 24)
|
||||
if days < 0 {
|
||||
days = 0
|
||||
}
|
||||
if days > 180 {
|
||||
days = 180
|
||||
days = 180 // 最多衰减到约半年前
|
||||
}
|
||||
if days > 0 {
|
||||
days--
|
||||
days-- // 跳过第一天
|
||||
}
|
||||
return math.Pow(config.WeightDailyDecay, float64(days))
|
||||
}
|
||||
|
||||
// urlTimeMul 根据该 URL 的摘要抓取时间计算时间衰减倍数(30 天内不衰减)。
|
||||
func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 {
|
||||
entry, err := db.GetSnippet(rawURL)
|
||||
if err != nil || entry == nil {
|
||||
@@ -457,6 +479,7 @@ func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 {
|
||||
return math.Pow((2+config.WeightDailyDecay)/3, float64(days))
|
||||
}
|
||||
|
||||
// badURL 返回 URL 的"劣质"评分(0~0.9)。
|
||||
func badURL(u string) float64 {
|
||||
s := math.Max(0, float64(len(u)-30)/200.0)
|
||||
if strings.Contains(u, ".htm") || strings.Contains(u, ".php") {
|
||||
@@ -471,6 +494,7 @@ func badURL(u string) float64 {
|
||||
return math.Min(s, 0.9)
|
||||
}
|
||||
|
||||
// netloc 从 URL 提取主机名。
|
||||
func netloc(rawURL string) string {
|
||||
parts := strings.SplitN(rawURL, "/", 4)
|
||||
if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" {
|
||||
@@ -479,6 +503,7 @@ func netloc(rawURL string) string {
|
||||
return rawURL
|
||||
}
|
||||
|
||||
// matchSite 判断主机名是否匹配 site: 过滤模式(支持子域名)。
|
||||
func matchSite(host, pattern string) bool {
|
||||
if host == pattern {
|
||||
return true
|
||||
@@ -489,6 +514,7 @@ func matchSite(host, pattern string) bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// consecutiveCount 统计标题中连续词对出现的次数(用于连续词加成)。
|
||||
func consecutiveCount(title string, tokens []string) int {
|
||||
c := 0
|
||||
for i := 0; i < len(tokens)-1; i++ {
|
||||
@@ -499,6 +525,8 @@ func consecutiveCount(title string, tokens []string) int {
|
||||
return c
|
||||
}
|
||||
|
||||
// repetitionSimilarity 计算标题与前序所有标题的最大相似度(基于编辑距离)。
|
||||
// 相似度 > 0.5 的结果将被降权。
|
||||
func repetitionSimilarity(titles []string, idx int) float64 {
|
||||
if idx == 0 {
|
||||
return 0
|
||||
@@ -520,6 +548,7 @@ func repetitionSimilarity(titles []string, idx int) float64 {
|
||||
return best
|
||||
}
|
||||
|
||||
// levenshtein 计算两个字符串的编辑距离(动态规划)。
|
||||
func levenshtein(a, b string) int {
|
||||
ra := []rune(a)
|
||||
rb := []rune(b)
|
||||
@@ -562,13 +591,16 @@ func min3(a, b, c int) int {
|
||||
return c
|
||||
}
|
||||
|
||||
// rerank interleaves results from different domains.
|
||||
// ---- 域名交错重排 ----
|
||||
|
||||
// domainHeap 是 rerank 用来对候选结果按域名交错排列的堆,使不同域名的 URL 交替出现。
|
||||
// 每个域名的第二次出现分数乘以 1/8,第三次 1/64,以此类推,确保结果多样性。
|
||||
type domainHeap []rerankItem
|
||||
|
||||
type rerankItem struct {
|
||||
score float64
|
||||
url string
|
||||
domainMul float64
|
||||
domainMul float64 // 域名衰减倍数
|
||||
vec [12]float64
|
||||
}
|
||||
|
||||
@@ -584,26 +616,29 @@ func (h *domainHeap) Pop() interface{} {
|
||||
return x
|
||||
}
|
||||
|
||||
// candidate 是候选 URL 的内部表示。
|
||||
type candidate struct {
|
||||
url string
|
||||
relevance float64
|
||||
scoreVec [12]float64
|
||||
}
|
||||
|
||||
// rerank 对候选列表进行域名交错重排,返回分页范围内的结果。
|
||||
func rerank(candidates []candidate, from, to int) []candidate {
|
||||
// 按域名分组
|
||||
domainItems := make(map[string][]candidate)
|
||||
for _, c := range candidates {
|
||||
h := netloc(c.url)
|
||||
domainItems[h] = append(domainItems[h], c)
|
||||
}
|
||||
|
||||
// 每个域名的 URL 列表取最后一个(分数最高)放入堆,其余保留
|
||||
h := &domainHeap{}
|
||||
heap.Init(h)
|
||||
domainMul := make(map[string]float64)
|
||||
|
||||
for domain, items := range domainItems {
|
||||
domainMul[domain] = 1.0
|
||||
// Sort items within domain
|
||||
sort.Slice(items, func(i, j int) bool {
|
||||
return items[i].scoreVec[0] < items[j].scoreVec[0]
|
||||
})
|
||||
@@ -612,6 +647,7 @@ func rerank(candidates []candidate, from, to int) []candidate {
|
||||
heap.Push(h, rerankItem{top.scoreVec[0], top.url, domainMul[domain], top.scoreVec})
|
||||
}
|
||||
|
||||
// 从堆中依次弹出得分最高的条目(受域名衰减影响),直到取够
|
||||
var result []candidate
|
||||
for h.Len() > 0 && len(result) < to {
|
||||
item := heap.Pop(h).(rerankItem)
|
||||
@@ -619,7 +655,7 @@ func rerank(candidates []candidate, from, to int) []candidate {
|
||||
result = append(result, candidate{url: item.url, scoreVec: item.vec})
|
||||
}
|
||||
domain := netloc(item.url)
|
||||
domainMul[domain] /= 8
|
||||
domainMul[domain] /= 8 // 该域名的下一次出现衰减到 1/8
|
||||
remaining := domainItems[domain]
|
||||
if len(remaining) > 0 {
|
||||
next := remaining[len(remaining)-1]
|
||||
@@ -630,8 +666,9 @@ func rerank(candidates []candidate, from, to int) []candidate {
|
||||
return result
|
||||
}
|
||||
|
||||
// ---- misc ----
|
||||
// ---- 杂项辅助函数 ----
|
||||
|
||||
// readBodyLimited 从 HTTP 响应体读取最多 limit 字节(用于限制在线摘要抓取大小)。
|
||||
func readBodyLimited(resp *http.Response, limit int64) string {
|
||||
data := make([]byte, 0, limit)
|
||||
buf := make([]byte, 4096)
|
||||
@@ -652,6 +689,7 @@ func readBodyLimited(resp *http.Response, limit int64) string {
|
||||
return string(data)
|
||||
}
|
||||
|
||||
// truncate 将字符串截断到最多 n 个字节(按字节切片,可能从多字节 UTF-8 字符中间截断)。
|
||||
func truncate(s string, n int) string {
|
||||
if len(s) <= n {
|
||||
return s
|
||||
@@ -659,6 +697,7 @@ func truncate(s string, n int) string {
|
||||
return s[:n]
|
||||
}
|
||||
|
||||
// unescapeURL 对 URL 进行解码(%XX 转义)。
|
||||
func unescapeURL(u string) string {
|
||||
decoded, err := url.PathUnescape(u)
|
||||
if err != nil {
|
||||
@@ -667,6 +706,7 @@ func unescapeURL(u string) string {
|
||||
return decoded
|
||||
}
|
||||
|
||||
// atoi 手写字符串转整数(不含负数和浮点)。
|
||||
func atoi(s string) int {
|
||||
n := 0
|
||||
for _, c := range s {
|
||||
|
||||
Reference in New Issue
Block a user