fix 分页bug,加入手动刷盘
This commit is contained in:
+78
-14
@@ -5,6 +5,7 @@ package search
|
||||
import (
|
||||
"container/heap" // 堆结构(域名交错排序)
|
||||
"encoding/json" // JSON 序列化(响应输出)
|
||||
"io" // 代理响应复制
|
||||
"log" // 日志
|
||||
"math" // 数学运算(Log、幂)
|
||||
"net/http" // HTTP 服务端
|
||||
@@ -25,18 +26,21 @@ import (
|
||||
|
||||
// Server 是搜索 HTTP 服务器。
|
||||
type Server struct {
|
||||
db *storage.DB
|
||||
infoSvc *info.Service
|
||||
analyzer *analyzer.Analyzer
|
||||
httpCli *http.Client // 在线摘要抓取(无 robots.txt 检查)
|
||||
db *storage.DB
|
||||
infoSvc *info.Service
|
||||
analyzer *analyzer.Analyzer
|
||||
httpCli *http.Client // 在线摘要抓取(无 robots.txt 检查)
|
||||
harvesterURL string // 收获服务器地址(如 "http://localhost:5000")
|
||||
}
|
||||
|
||||
// New 创建一个 search Server。
|
||||
func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
||||
// harvesterURL 为收获服务器的地址,用于代理刷盘和状态查询。
|
||||
func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer, harvesterURL string) *Server {
|
||||
return &Server{
|
||||
db: db,
|
||||
infoSvc: infoSvc,
|
||||
analyzer: a,
|
||||
db: db,
|
||||
infoSvc: infoSvc,
|
||||
analyzer: a,
|
||||
harvesterURL: harvesterURL,
|
||||
httpCli: &http.Client{
|
||||
Timeout: time.Duration(config.OnlineSnippetTimeout) * time.Second,
|
||||
},
|
||||
@@ -50,6 +54,7 @@ func (s *Server) Handler() http.Handler {
|
||||
mux.HandleFunc("/admin/recent", s.handleAdminRecent)
|
||||
mux.HandleFunc("/admin/stats", s.handleAdminStats)
|
||||
mux.HandleFunc("/admin/priority", s.handleAdminPriority)
|
||||
mux.HandleFunc("/admin/flush", s.handleAdminFlush)
|
||||
return mux
|
||||
}
|
||||
|
||||
@@ -199,6 +204,19 @@ func (s *Server) handleAdminStats(w http.ResponseWriter, r *http.Request) {
|
||||
"domains": domainsMap,
|
||||
"languages": langsMap,
|
||||
}
|
||||
|
||||
// 从 harvester 代理获取未刷盘数据条数
|
||||
if s.harvesterURL != "" {
|
||||
req, _ := http.NewRequest(http.MethodGet, s.harvesterURL+"/admin/pending", nil)
|
||||
if proxyResp, err := s.httpCli.Do(req); err == nil {
|
||||
defer proxyResp.Body.Close()
|
||||
var pendingResp map[string]int64
|
||||
if err := json.NewDecoder(proxyResp.Body).Decode(&pendingResp); err == nil {
|
||||
resp["pending"] = pendingResp["pending"]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
json.NewEncoder(w).Encode(resp)
|
||||
}
|
||||
|
||||
@@ -282,6 +300,33 @@ func (s *Server) handleAdminPriority(w http.ResponseWriter, r *http.Request) {
|
||||
}
|
||||
}
|
||||
|
||||
// handleAdminFlush 代理到 harvester 的 /flush 接口,执行刷盘。
|
||||
func (s *Server) handleAdminFlush(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Access-Control-Allow-Origin", "*")
|
||||
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||
|
||||
if r.Method != http.MethodGet && r.Method != http.MethodPost {
|
||||
http.Error(w, `{"error":"method not allowed"}`, 405)
|
||||
return
|
||||
}
|
||||
|
||||
// 代理请求到 harvester
|
||||
proxyURL := s.harvesterURL + "/flush"
|
||||
req, err := http.NewRequest(http.MethodGet, proxyURL, nil)
|
||||
if err != nil {
|
||||
http.Error(w, `{"error":"`+err.Error()+`"}`, 500)
|
||||
return
|
||||
}
|
||||
resp, err := s.httpCli.Do(req)
|
||||
if err != nil {
|
||||
http.Error(w, `{"error":"harvester unreachable: `+err.Error()+`"}`, 502)
|
||||
return
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
w.WriteHeader(resp.StatusCode)
|
||||
io.Copy(w, resp.Body)
|
||||
}
|
||||
|
||||
// ---- 搜索处理器 ----
|
||||
|
||||
// searchResponse 是搜索 API 的 JSON 响应结构。
|
||||
@@ -874,13 +919,28 @@ func rerank(candidates []candidate, from, to int) []candidate {
|
||||
heap.Push(h, rerankItem{top.scoreVec[0], top.url, domainMul[domain], top.scoreVec})
|
||||
}
|
||||
|
||||
// 从堆中依次弹出得分最高的条目(受域名衰减影响),直到取够
|
||||
// 从堆中依次弹出得分最高的条目(受域名衰减影响),直到取够 to 条,
|
||||
// 然后截取 [from:to] 段返回。
|
||||
var result []candidate
|
||||
for h.Len() > 0 && len(result) < to {
|
||||
item := heap.Pop(h).(rerankItem)
|
||||
if len(result) >= from {
|
||||
result = append(result, candidate{url: item.url, scoreVec: item.vec})
|
||||
for len(result) < to {
|
||||
if h.Len() == 0 {
|
||||
// 堆为空时,将所有域名剩余条目依次推入堆(每域一条)
|
||||
anyPushed := false
|
||||
for domain, items := range domainItems {
|
||||
if len(items) == 0 {
|
||||
continue
|
||||
}
|
||||
next := items[len(items)-1]
|
||||
domainItems[domain] = items[:len(items)-1]
|
||||
heap.Push(h, rerankItem{next.scoreVec[0], next.url, domainMul[domain], next.scoreVec})
|
||||
anyPushed = true
|
||||
}
|
||||
if !anyPushed {
|
||||
break // 所有域名都没有剩余条目,结束
|
||||
}
|
||||
}
|
||||
item := heap.Pop(h).(rerankItem)
|
||||
result = append(result, candidate{url: item.url, scoreVec: item.vec})
|
||||
domain := netloc(item.url)
|
||||
domainMul[domain] /= 8 // 该域名的下一次出现衰减到 1/8
|
||||
remaining := domainItems[domain]
|
||||
@@ -890,7 +950,11 @@ func rerank(candidates []candidate, from, to int) []candidate {
|
||||
heap.Push(h, rerankItem{next.scoreVec[0], next.url, domainMul[domain], next.scoreVec})
|
||||
}
|
||||
}
|
||||
return result
|
||||
// 截取分页段
|
||||
if from >= len(result) {
|
||||
return nil
|
||||
}
|
||||
return result[from:]
|
||||
}
|
||||
|
||||
// ---- 杂项辅助函数 ----
|
||||
|
||||
Reference in New Issue
Block a user