fix 分页bug，加入手动刷盘

2026-04-08 19:56:26 +08:00
parent 6637dff254
commit d14c9caa56
3 changed files with 175 additions and 21 deletions
@@ -5,6 +5,7 @@ package search
 import (
 	"container/heap" // 堆结构（域名交错排序）
 	"encoding/json"  // JSON 序列化（响应输出）
+	"io"            // 代理响应复制
 	"log"          // 日志
 	"math"         // 数学运算（Log、幂）
 	"net/http"     // HTTP 服务端
@@ -25,18 +26,21 @@ import (

 // Server 是搜索 HTTP 服务器。
 type Server struct {
-	db       *storage.DB
-	infoSvc  *info.Service
-	analyzer *analyzer.Analyzer
-	httpCli  *http.Client // 在线摘要抓取（无 robots.txt 检查）
+	db           *storage.DB
+	infoSvc      *info.Service
+	analyzer     *analyzer.Analyzer
+	httpCli      *http.Client // HTTP client for online snippet fetching (no robots.txt check); also used for the harvester proxy calls
+	harvesterURL string       // harvester base URL (e.g. "http://localhost:5000"); when empty, handleAdminStats skips the pending-count lookup
 }

 // New 创建一个 search Server。
-func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
+// harvesterURL 为收获服务器的地址，用于代理刷盘和状态查询。
+func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer, harvesterURL string) *Server {
 	return &Server{
-		db:       db,
-		infoSvc:  infoSvc,
-		analyzer: a,
+		db:           db,
+		infoSvc:      infoSvc,
+		analyzer:     a,
+		harvesterURL: harvesterURL,
 		httpCli: &http.Client{
 			Timeout: time.Duration(config.OnlineSnippetTimeout) * time.Second,
 		},
@@ -50,6 +54,7 @@ func (s *Server) Handler() http.Handler {
 	mux.HandleFunc("/admin/recent", s.handleAdminRecent)
 	mux.HandleFunc("/admin/stats", s.handleAdminStats)
 	mux.HandleFunc("/admin/priority", s.handleAdminPriority)
+	mux.HandleFunc("/admin/flush", s.handleAdminFlush)
 	return mux
 }

@@ -199,6 +204,19 @@ func (s *Server) handleAdminStats(w http.ResponseWriter, r *http.Request) {
 		"domains":      domainsMap,
 		"languages":    langsMap,
 	}
+
+	// 从 harvester 代理获取未刷盘数据条数
+	if s.harvesterURL != "" {
+		req, _ := http.NewRequest(http.MethodGet, s.harvesterURL+"/admin/pending", nil)
+		if proxyResp, err := s.httpCli.Do(req); err == nil {
+			defer proxyResp.Body.Close()
+			var pendingResp map[string]int64
+			if err := json.NewDecoder(proxyResp.Body).Decode(&pendingResp); err == nil {
+				resp["pending"] = pendingResp["pending"]
+			}
+		}
+	}
+
 	json.NewEncoder(w).Encode(resp)
 }

@@ -282,6 +300,33 @@ func (s *Server) handleAdminPriority(w http.ResponseWriter, r *http.Request) {
 	}
 }

+// handleAdminFlush proxies a flush request to the harvester's /flush endpoint and relays the upstream status code and body to the client.
+func (s *Server) handleAdminFlush(w http.ResponseWriter, r *http.Request) {
+	w.Header().Set("Access-Control-Allow-Origin", "*")
+	w.Header().Set("Content-Type", "application/json; charset=utf-8")
+	// GET and POST are both accepted; the upstream request built below is always a GET.
+	if r.Method != http.MethodGet && r.Method != http.MethodPost {
+		http.Error(w, `{"error":"method not allowed"}`, 405) // NOTE(review): http.Error resets Content-Type to text/plain, so this JSON body is not labelled as JSON
+		return
+	}
+	// NOTE(review): unlike handleAdminStats, an empty harvesterURL is not checked here; the URL becomes "/flush" — presumably Do then fails and yields the 502 below, confirm.
+	// Build the proxied request to the harvester.
+	proxyURL := s.harvesterURL + "/flush"
+	req, err := http.NewRequest(http.MethodGet, proxyURL, nil) // NOTE(review): not bound to r.Context(), so a client disconnect does not cancel the upstream call
+	if err != nil {
+		http.Error(w, `{"error":"`+err.Error()+`"}`, 500) // NOTE(review): err text is spliced in unescaped; a quote or backslash in it yields invalid JSON
+		return
+	}
+	resp, err := s.httpCli.Do(req)
+	if err != nil {
+		http.Error(w, `{"error":"harvester unreachable: `+err.Error()+`"}`, 502) // NOTE(review): same unescaped-splice caveat as the 500 path
+		return
+	}
+	defer resp.Body.Close()
+	w.WriteHeader(resp.StatusCode) // forward the harvester's status code unchanged
+	io.Copy(w, resp.Body)          // stream the harvester's body to the client; copy errors are ignored
+}
+
 // ---- 搜索处理器 ----

 // searchResponse 是搜索 API 的 JSON 响应结构。
@@ -874,13 +919,28 @@ func rerank(candidates []candidate, from, to int) []candidate {
 		heap.Push(h, rerankItem{top.scoreVec[0], top.url, domainMul[domain], top.scoreVec})
 	}

-	// 从堆中依次弹出得分最高的条目（受域名衰减影响），直到取够
+	// 从堆中依次弹出得分最高的条目（受域名衰减影响），直到取够 to 条，
+	// 然后截取 [from:to] 段返回。
 	var result []candidate
-	for h.Len() > 0 && len(result) < to {
-		item := heap.Pop(h).(rerankItem)
-		if len(result) >= from {
-			result = append(result, candidate{url: item.url, scoreVec: item.vec})
+	for len(result) < to {
+		if h.Len() == 0 {
+			// 堆为空时，将所有域名剩余条目依次推入堆（每域一条）
+			anyPushed := false
+			for domain, items := range domainItems {
+				if len(items) == 0 {
+					continue
+				}
+				next := items[len(items)-1]
+				domainItems[domain] = items[:len(items)-1]
+				heap.Push(h, rerankItem{next.scoreVec[0], next.url, domainMul[domain], next.scoreVec})
+				anyPushed = true
+			}
+			if !anyPushed {
+				break // 所有域名都没有剩余条目，结束
+			}
 		}
+		item := heap.Pop(h).(rerankItem)
+		result = append(result, candidate{url: item.url, scoreVec: item.vec})
 		domain := netloc(item.url)
 		domainMul[domain] /= 8 // 该域名的下一次出现衰减到 1/8
 		remaining := domainItems[domain]
@@ -890,7 +950,11 @@ func rerank(candidates []candidate, from, to int) []candidate {
 			heap.Push(h, rerankItem{next.scoreVec[0], next.url, domainMul[domain], next.scoreVec})
 		}
 	}
-	return result
+	// 截取分页段
+	if from >= len(result) {
+		return nil
+	}
+	return result[from:]
 }

 // ---- 杂项辅助函数 ----