Signed-off-by: 吴文峰 <kevin@lmve.net>

This commit is contained in:
2026-04-08 17:29:39 +08:00
commit 6c2f5ad978
15 changed files with 3651 additions and 0 deletions
+693
View File
@@ -0,0 +1,693 @@
// Package search implements the user-facing search HTTP server.
package search
import (
"container/heap"
"encoding/json"
"log"
"math"
"net/http"
"net/url"
"regexp"
"sort"
"strings"
"sync"
"time"
"sese-engine/analyzer"
"sese-engine/config"
"sese-engine/info"
"sese-engine/parser"
"sese-engine/storage"
)
// Server is the search HTTP server. It bundles the dependencies that the
// search handler and the ranking pipeline in this file call into.
type Server struct {
	// db is queried for inverted-index entries, cached snippets and site info,
	// and receives snippets fetched online.
	db *storage.DB
	// infoSvc supplies the token block list plus the per-URL prosper and
	// per-host adjust factors used while scoring.
	infoSvc *info.Service
	// analyzer segments each query term into index tokens.
	analyzer *analyzer.Analyzer
	httpCli *http.Client // for online snippet fetching
}
// New creates a search Server wired to the given storage, info service and
// analyzer. The HTTP client used for online snippet fetching gets a timeout of
// config.OnlineSnippetTimeout seconds.
func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
	fetchTimeout := time.Duration(config.OnlineSnippetTimeout) * time.Second
	srv := &Server{db: db, infoSvc: infoSvc, analyzer: a}
	srv.httpCli = &http.Client{Timeout: fetchTimeout}
	return srv
}
// Handler returns the http.Handler that routes "/search" to the search
// endpoint of this server.
func (s *Server) Handler() http.Handler {
	routes := http.NewServeMux()
	routes.Handle("/search", http.HandlerFunc(s.handleSearch))
	return routes
}
// ListenAndServe starts the search server on addr and blocks until the
// underlying http.Server stops, returning its error.
//
// An explicit http.Server is used instead of http.ListenAndServe so that a
// header-read timeout can be set; without one a client that never finishes
// sending its request headers keeps a connection open indefinitely. No write
// timeout is set because a search may legitimately wait on online snippet
// fetches.
func (s *Server) ListenAndServe(addr string) error {
	log.Printf("[search] listening on %s", addr)
	srv := &http.Server{
		Addr:              addr,
		Handler:           s.Handler(),
		ReadHeaderTimeout: 10 * time.Second,
	}
	return srv.ListenAndServe()
}
// ---- search handler ----
// searchResponse is the JSON document written by the /search endpoint.
type searchResponse struct {
	// Tokens are the query tokens that were actually searched (after
	// segmentation, block-list filtering and the 20-token cap).
	Tokens []string `json:"tokens"`
	// Counts maps each token to the number of index entries stored for it.
	Counts map[string]int `json:"counts"`
	// Results is the requested page of ranked results.
	Results []searchResult `json:"results"`
	// Total is the number of candidate URLs (after any site filter).
	Total int `json:"total"`
}
// searchResult is one ranked hit in a searchResponse.
type searchResult struct {
	// Score is the final ranking score of the URL.
	Score float64 `json:"score"`
	// URL is the hit's URL with percent-escapes decoded where possible.
	URL string `json:"url"`
	// Snippet is omitted from the JSON when no snippet is available.
	Snippet *snippetInfo `json:"snippet,omitempty"`
	// Relevance holds the stored index weight of the URL for each query token
	// (0 when the URL is not indexed under that token).
	Relevance map[string]float64 `json:"relevance"`
	// DomainCount is always 0 in the code visible in this file.
	DomainCount int `json:"domain_count"`
	// Factors exposes the individual multipliers that contributed to Score.
	Factors map[string]float64 `json:"factors,omitempty"`
}
// snippetInfo is the JSON form of a page snippet: the title, description and
// text copied from a stored snippet entry.
type snippetInfo struct {
	Title       string `json:"title"`
	Description string `json:"description"`
	Text        string `json:"text"`
}
// siteRe matches a whole query term of the form "site:<value>" and captures
// the non-empty <value> that is used as the site filter.
var siteRe = regexp.MustCompile(`^site:(.+)$`)
// handleSearch serves the /search endpoint.
//
// Query parameters:
//   - q:     the search query; whitespace-separated terms, where a term of the
//     form "site:<value>" sets the site filter instead of being searched.
//   - qh:    fallback query used only when q is empty; it is path-unescaped
//     once more before use.
//   - slice: result window as "from:to". It is honoured only when
//     from >= 0, to > from and to-from <= 20; otherwise 0:10 is used.
//
// The response is a JSON-encoded searchResponse.
func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Access-Control-Allow-Origin", "*")
	w.Header().Set("Content-Type", "application/json; charset=utf-8")

	// Parse the query string once instead of on every parameter lookup.
	params := r.URL.Query()

	q := params.Get("q")
	if q == "" {
		if qh := params.Get("qh"); qh != "" {
			if decoded, err := url.PathUnescape(qh); err == nil {
				q = decoded
			}
		}
	}

	// Parse slice param "0:10"
	sliceFrom, sliceTo := 0, 10
	if sliceStr := params.Get("slice"); sliceStr != "" {
		parts := strings.SplitN(sliceStr, ":", 2)
		if len(parts) == 2 {
			a := atoi(parts[0])
			b := atoi(parts[1])
			if a >= 0 && b > a && b-a <= 20 {
				sliceFrom, sliceTo = a, b
			}
		}
	}

	// Parse tokens and site filter
	var tokens []string
	var siteFilter string
	for _, part := range strings.Fields(q) {
		if m := siteRe.FindStringSubmatch(part); len(m) > 1 {
			// When several site: terms are given, the last one wins.
			siteFilter = m[1]
			continue
		}
		for _, t := range s.analyzer.Segment(part, false) {
			if !s.infoSvc.IsBlocked(t) {
				tokens = append(tokens, t)
			}
		}
	}
	if len(tokens) > 20 {
		tokens = tokens[:20]
	}

	results, total := s.query(tokens, sliceFrom, sliceTo, siteFilter)

	// Count per keyword
	counts := make(map[string]int, len(tokens))
	for _, t := range tokens {
		// Best effort: the lookup error is ignored and whatever entries were
		// returned are counted.
		entries, _ := s.db.GetIndex(t)
		counts[t] = len(entries)
	}

	resp := searchResponse{
		Tokens:  tokens,
		Counts:  counts,
		Results: results,
		Total:   total,
	}
	// The status line may already be on the wire, so an encoding/write failure
	// can only be logged, not reported to the client.
	if err := json.NewEncoder(w).Encode(resp); err != nil {
		log.Printf("[search] encoding response: %v", err)
	}
}
// query executes the multi-keyword search and returns ranked results.
//
// tokens are the index tokens to search, [from, to) is the window of the
// re-ranked list to return, and siteFilter (when non-empty) restricts
// candidates to hosts accepted by matchSite. The second return value is the
// number of candidate URLs after the site filter. With no tokens it returns
// (nil, 0).
//
// Ranking proceeds in stages: a base score for every candidate, site/URL
// freshness and language factors for the top 256, title repetition and
// consecutive-keyword factors for the top 80, then domain interleaving via
// rerank. Snippets are fetched concurrently for the returned window only.
func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]searchResult, int) {
	if len(tokens) == 0 {
		return nil, 0
	}

	// Load inverted index for each token, together with the default weight
	// assumed for URLs that are not listed under that token.
	type tokenIndex struct {
		token   string
		entries []storage.IndexEntry
		defVal  float64
	}
	tokenIndexes := make([]tokenIndex, 0, len(tokens))
	for _, t := range tokens {
		// Best effort: the lookup error is ignored and whatever entries were
		// returned are used.
		entries, _ := s.db.GetIndex(t)
		defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(config.MaxURLsPerKey)
		if len(entries) >= config.MaxURLsPerKey {
			// Full posting list: use half of the weight at rank
			// MaxURLsPerKey (sorted descending) as the default, floored at
			// 1/10000.
			weights := make([]float64, len(entries))
			for i, e := range entries {
				weights[i] = float64(e.Weight)
			}
			sort.Sort(sort.Reverse(sort.Float64Slice(weights)))
			defVal = math.Max(1.0/10000, weights[config.MaxURLsPerKey-1]/2)
		}
		tokenIndexes = append(tokenIndexes, tokenIndex{t, entries, defVal})
	}

	// Build URL → per-token weights map
	urlWeights := make(map[string]map[string]float64)
	for _, ti := range tokenIndexes {
		for _, e := range ti.entries {
			if urlWeights[e.URL] == nil {
				urlWeights[e.URL] = make(map[string]float64)
			}
			urlWeights[e.URL][ti.token] = float64(e.Weight)
		}
	}

	// Site filter
	total := len(urlWeights)
	if siteFilter != "" {
		filtered := make(map[string]map[string]float64)
		for u, vs := range urlWeights {
			if matchSite(netloc(u), siteFilter) {
				filtered[u] = vs
			}
		}
		urlWeights = filtered
		total = len(urlWeights)
	}

	// Build default value map
	defVals := make(map[string]float64, len(tokenIndexes))
	for _, ti := range tokenIndexes {
		defVals[ti.token] = ti.defVal
	}

	// Compute relevance + initial score for each URL
	candidates := make([]candidate, 0, len(urlWeights))
	for u, vs := range urlWeights {
		rel := 1.0
		for _, ti := range tokenIndexes {
			vp := vs[ti.token]
			if vp == 0 {
				vp = defVals[ti.token]
			}
			// Weights above 0.06 are compressed logarithmically.
			if vp > 0.06 {
				vp = math.Log((vp-0.06)*40+1)/40 + 0.06
			}
			rel *= vp
		}
		prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight
		bad := badURL(u)
		adjust := s.infoSvc.Adjust(netloc(u))
		score := rel * prosper * (1 - bad) * adjust * 0.1

		var vec [12]float64
		vec[0] = score
		vec[1] = rel
		vec[2] = prosper
		vec[3] = 1 - bad
		vec[4] = 1 // language multiplier placeholder
		vec[5] = 1 // repetition placeholder
		vec[6] = adjust
		vec[7] = 1  // time multiplier placeholder
		vec[8] = 1  // consecutive keyword placeholder
		vec[9] = 1  // keyword content placeholder
		vec[10] = 1 // URL time placeholder
		vec[11] = 0.1
		candidates = append(candidates, candidate{u, rel, vec})
	}

	// byScoreDesc orders candidates by their current score, highest first.
	byScoreDesc := func(i, j int) bool {
		return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
	}

	// Order by the base score so the more expensive factors below are only
	// applied to the best candidates.
	sort.Slice(candidates, byScoreDesc)

	// Apply site info factors to top 256
	now := time.Now().Unix()
	limit256 := 256
	if len(candidates) < 256 {
		limit256 = len(candidates)
	}
	var wg sync.WaitGroup
	for i := 0; i < limit256; i++ {
		wg.Add(1)
		// Each goroutine touches only its own candidates[idx].
		go func(idx int) {
			defer wg.Done()
			c := &candidates[idx]
			siteInfo, _ := s.db.GetSiteInfo(netloc(c.url))
			langFactor := languageMultiplier(siteInfo)
			siteTimeFactor := timeMul(siteInfo, now)
			urlTimeFactor := urlTimeMul(s.db, c.url, now)
			// The factor 10 cancels the 0.1 in the base score.
			c.scoreVec[0] = c.scoreVec[0] * 10 * langFactor * siteTimeFactor * urlTimeFactor
			c.scoreVec[4] = langFactor
			c.scoreVec[7] = siteTimeFactor
			c.scoreVec[10] = urlTimeFactor
		}(i)
	}
	wg.Wait()
	sort.Slice(candidates, byScoreDesc)

	// Apply consecutive-keyword and repetition bonuses to top 80
	limit80 := 80
	if len(candidates) < 80 {
		limit80 = len(candidates)
	}
	titles := make([]string, limit80)
	for i := 0; i < limit80; i++ {
		if snippet, err := s.db.GetSnippet(candidates[i].url); err == nil {
			titles[i] = snippet.Title
		}
	}
	// Repetition penaliser
	for i := 0; i < limit80; i++ {
		h := repetitionSimilarity(titles, i)
		consecutive := consecutiveCount(titles[i], tokens)
		repMul := 1.0
		if h > 0.5 {
			repMul = 1 - (h - 0.5)
		}
		consMul := math.Pow(config.ConsecutiveKeyWeight, float64(consecutive))
		candidates[i].scoreVec[0] *= repMul * consMul
		candidates[i].scoreVec[5] = repMul
		candidates[i].scoreVec[8] = consMul
	}
	sort.Slice(candidates, byScoreDesc)

	// Re-rank: interleave domains
	reranked := rerank(candidates, from, to)

	// Fetch snippets and build output. Every goroutine writes only its own
	// slot, which keeps the re-ranked order without a mutex or a re-sort.
	// (Re-sorting by URL afterwards is unreliable because the emitted URL is
	// unescaped and no longer matches the raw URL of the candidate.)
	results := make([]searchResult, len(reranked))
	var snippetWg sync.WaitGroup
	for i, c := range reranked {
		snippetWg.Add(1)
		go func(idx int, cand candidate) {
			defer snippetWg.Done()
			r := searchResult{
				Score:       cand.scoreVec[0],
				URL:         unescapeURL(cand.url),
				Snippet:     s.getSnippet(cand.url),
				Relevance:   make(map[string]float64),
				DomainCount: 0,
				Factors: map[string]float64{
					"relevance":   cand.scoreVec[1],
					"backlink":    cand.scoreVec[2],
					"url_quality": cand.scoreVec[3],
					"language":    cand.scoreVec[4],
					"repetition":  cand.scoreVec[5],
					"adjust":      cand.scoreVec[6],
					"site_time":   cand.scoreVec[7],
					"consecutive": cand.scoreVec[8],
					"url_time":    cand.scoreVec[10],
				},
			}
			// urlWeights is only read from here on, so concurrent access is safe.
			for _, ti := range tokenIndexes {
				r.Relevance[ti.token] = urlWeights[cand.url][ti.token]
			}
			results[idx] = r
		}(i, c)
	}
	snippetWg.Wait()
	return results, total
}
// getSnippet returns the snippet for rawURL, or nil when none is available.
//
// A stored snippet is used when the lookup succeeds. Otherwise, and only if
// config.UseOnlineSnippet is set, the page is fetched over HTTP, parsed, stored
// for later requests and returned. Any fetch failure, a non-200 status or a
// non-HTML content type yields nil.
func (s *Server) getSnippet(rawURL string) *snippetInfo {
	// Try cache first
	if entry, err := s.db.GetSnippet(rawURL); err == nil {
		return buildSnippet(entry)
	}
	if !config.UseOnlineSnippet {
		return nil
	}

	// Fetch online with a simple HTTP client (no robots.txt check for search snippets)
	req, err := http.NewRequest("GET", rawURL, nil)
	if err != nil {
		return nil
	}
	req.Header.Set("User-Agent", config.SpiderName)
	resp, err := s.httpCli.Do(req)
	if err != nil {
		return nil
	}
	// Close the body on every path from here on, including the non-200 and
	// non-HTML early returns; otherwise the connection leaks.
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil
	}
	if ct := resp.Header.Get("Content-Type"); !strings.Contains(ct, "text/html") {
		return nil
	}

	body := readBodyLimited(resp, 60000)
	// resp.Request.URL is the final URL after any redirects. The fourth
	// result of ParseHTML is not needed here.
	title, desc, text, _ := parser.ParseHTML(body, resp.Request.URL.String())
	entry := &storage.SnippetEntry{
		Title:       title,
		Description: truncate(desc, 256),
		Text:        truncate(text, 256),
		Timestamp:   time.Now().Unix(),
	}
	// Caching is best effort: a failed write must not hide the snippet we
	// already have in hand.
	_ = s.db.SetSnippet(rawURL, entry)
	return buildSnippet(entry)
}
// buildSnippet converts a stored snippet entry into its response form. It
// returns nil for a nil entry or for one whose title, description and text are
// all empty.
func buildSnippet(entry *storage.SnippetEntry) *snippetInfo {
	if entry == nil {
		return nil
	}
	hasContent := entry.Title != "" || entry.Description != "" || entry.Text != ""
	if !hasContent {
		return nil
	}
	out := new(snippetInfo)
	out.Title = entry.Title
	out.Description = entry.Description
	out.Text = entry.Text
	return out
}
// ---- scoring helpers ----
// languageMultiplier turns a site's language distribution into a score
// multiplier: the share of "zh" raises it and the share of everything other
// than "zh", "en" and "ja" lowers it, both scaled by config.LanguageWeight.
//
// It returns the neutral value 1.0 when si is nil, has no language data, or
// when the language values sum to zero (which would otherwise divide by zero
// and yield NaN).
func languageMultiplier(si *storage.SiteInfo) float64 {
	if si == nil || len(si.Languages) == 0 {
		return 1.0
	}
	total := 0.0
	for _, v := range si.Languages {
		total += v
	}
	if total == 0 {
		return 1.0
	}
	chinese := si.Languages["zh"] / total
	weird := (total - si.Languages["zh"] - si.Languages["en"] - si.Languages["ja"]) / total
	return 1 + chinese*config.LanguageWeight - weird*config.LanguageWeight
}
// timeMul returns config.WeightDailyDecay raised to the age, in days, of the
// site's last visit relative to now (Unix seconds). A nil si yields 1.0.
//
// A zero LastVisitTime is replaced by the fixed timestamp 1648000000. The age
// is clamped to [0, 180] and then reduced by one when positive, so the exponent
// ranges from 0 to 179.
func timeMul(si *storage.SiteInfo, now int64) float64 {
	if si == nil {
		return 1.0
	}
	lastVisit := si.LastVisitTime
	if lastVisit == 0 {
		lastVisit = 1648000000
	}
	age := (now - lastVisit) / (3600 * 24)
	switch {
	case age < 0:
		age = 0
	case age > 180:
		age = 180
	}
	if age > 0 {
		age--
	}
	return math.Pow(config.WeightDailyDecay, float64(age))
}
// urlTimeMul returns a freshness multiplier based on the timestamp of the
// stored snippet for rawURL, with now given in Unix seconds.
//
// It is 1.0 when no snippet can be loaded or the snippet is at most 30 days
// old. Older snippets get ((2+config.WeightDailyDecay)/3) raised to the full
// age in days (not the excess over 30).
func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 {
	const secondsPerDay = 3600 * 24
	entry, err := db.GetSnippet(rawURL)
	if err == nil && entry != nil {
		if age := (now - entry.Timestamp) / secondsPerDay; age > 30 {
			return math.Pow((2+config.WeightDailyDecay)/3, float64(age))
		}
	}
	return 1.0
}
// badURL returns a penalty in [0, 0.9] describing how unattractive a URL looks.
//
// The base penalty grows with every byte beyond 30 (1/200 per byte). Each of
// the following then moves the penalty part of the way towards 1:
//   - the URL contains ".htm" or ".php" (30% of the remaining distance),
//   - it has more than two slashes once trailing slashes are removed (10%),
//   - it is shorter than 5 bytes or its fifth byte is ':' as in "http:" (30%).
func badURL(u string) float64 {
	hasPageExt := strings.Contains(u, ".htm") || strings.Contains(u, ".php")
	deepPath := strings.Count(strings.TrimRight(u, "/"), "/") > 2
	shortScheme := len(u) < 5 || u[4] == ':'

	penalty := math.Max(0, float64(len(u)-30)/200.0)
	if hasPageExt {
		penalty += (1 - penalty) * 0.3
	}
	if deepPath {
		penalty += (1 - penalty) * 0.1
	}
	if shortScheme {
		penalty += (1 - penalty) * 0.3
	}
	return math.Min(penalty, 0.9)
}
// netloc extracts the authority part (everything between the "//" and the next
// "/") from a URL that starts with "http://" or "https://". Any other input is
// returned unchanged.
func netloc(rawURL string) string {
	var rest string
	switch {
	case strings.HasPrefix(rawURL, "http://"):
		rest = rawURL[len("http://"):]
	case strings.HasPrefix(rawURL, "https://"):
		rest = rawURL[len("https://"):]
	default:
		return rawURL
	}
	if slash := strings.IndexByte(rest, '/'); slash >= 0 {
		return rest[:slash]
	}
	return rest
}
// matchSite reports whether host is exactly pattern or a subdomain of it, i.e.
// ends in "." followed by pattern.
func matchSite(host, pattern string) bool {
	return host == pattern || strings.HasSuffix(host, "."+pattern)
}
// consecutiveCount returns how many adjacent token pairs (tokens[i] directly
// followed by tokens[i+1]) occur verbatim in title.
func consecutiveCount(title string, tokens []string) int {
	hits := 0
	for next := 1; next < len(tokens); next++ {
		pair := tokens[next-1] + tokens[next]
		if strings.Contains(title, pair) {
			hits++
		}
	}
	return hits
}
// repetitionSimilarity returns the highest similarity, in [0, 1], between
// titles[idx] and any earlier non-empty title. Similarity is
// 1 - levenshtein/longer-length, with both quantities measured in runes.
//
// It returns 0 when idx is the first element or out of range, when the title
// itself is empty, or when there is no earlier non-empty title.
//
// The length is counted in runes on purpose: levenshtein works on runes, so
// normalising by byte length would make unrelated multi-byte (e.g. CJK) titles
// look about two-thirds similar.
func repetitionSimilarity(titles []string, idx int) float64 {
	if idx <= 0 || idx >= len(titles) {
		return 0
	}
	t := titles[idx]
	if t == "" {
		return 0
	}
	tLen := len([]rune(t))
	best := 0.0
	for _, prev := range titles[:idx] {
		if prev == "" {
			continue
		}
		longest := tLen
		if n := len([]rune(prev)); n > longest {
			longest = n
		}
		sim := 1 - float64(levenshtein(t, prev))/float64(longest)
		if sim > best {
			best = sim
		}
	}
	return best
}

// levenshtein returns the edit distance between a and b, counted in runes
// (insertions, deletions and substitutions each cost 1).
func levenshtein(a, b string) int {
	ra := []rune(a)
	rb := []rune(b)
	la, lb := len(ra), len(rb)
	if la == 0 {
		return lb
	}
	if lb == 0 {
		return la
	}
	// Only two rows of the DP table are kept: prev is row i-1, curr is row i.
	prev := make([]int, lb+1)
	curr := make([]int, lb+1)
	for j := range prev {
		prev[j] = j
	}
	for i := 1; i <= la; i++ {
		curr[0] = i
		for j := 1; j <= lb; j++ {
			cost := 1
			if ra[i-1] == rb[j-1] {
				cost = 0
			}
			curr[j] = min3(curr[j-1]+1, prev[j]+1, prev[j-1]+cost)
		}
		prev, curr = curr, prev
	}
	// After the final swap the last computed row lives in prev.
	return prev[lb]
}

// min3 returns the smallest of three ints.
func min3(a, b, c int) int {
	m := a
	if b < m {
		m = b
	}
	if c < m {
		m = c
	}
	return m
}
// rerankItem is one entry of the heap used while interleaving domains: a URL,
// its score and score vector, and the multiplier currently applied to its
// domain.
type rerankItem struct {
	score     float64
	url       string
	domainMul float64
	vec       [12]float64
}

// domainHeap is a heap of rerankItems ordered by score*domainMul, largest
// first. Its methods have the shape required by container/heap.
type domainHeap []rerankItem

// Len returns the number of items in the heap.
func (h domainHeap) Len() int {
	return len(h)
}

// Less reports whether item i has a strictly larger effective score
// (score*domainMul) than item j.
func (h domainHeap) Less(i, j int) bool {
	left := h[i].score * h[i].domainMul
	right := h[j].score * h[j].domainMul
	return left > right
}

// Swap exchanges items i and j.
func (h domainHeap) Swap(i, j int) {
	h[j], h[i] = h[i], h[j]
}

// Push appends x, which must be a rerankItem.
func (h *domainHeap) Push(x interface{}) {
	item := x.(rerankItem)
	*h = append(*h, item)
}

// Pop removes and returns the last item of the underlying slice.
func (h *domainHeap) Pop() interface{} {
	items := *h
	last := len(items) - 1
	popped := items[last]
	*h = items[:last]
	return popped
}
// candidate is a URL moving through the ranking pipeline.
//
// scoreVec slots as filled in by query: 0 final score, 1 relevance,
// 2 backlink (prosper) factor, 3 URL quality (1 - badURL), 4 language factor,
// 5 repetition factor, 6 adjust factor, 7 site time factor, 8 consecutive
// keyword factor, 9 keyword content placeholder (always 1), 10 URL time factor,
// 11 the constant 0.1.
type candidate struct {
	url       string
	relevance float64 // product of the per-token weights
	scoreVec  [12]float64
}
// rerank interleaves results from different domains and returns the window
// [from, to) of the interleaved order.
//
// Candidates are grouped by netloc. The best remaining candidate of every
// domain competes in a heap on score*domainMul; each time a domain wins a
// position its multiplier is divided by 8, so further results from the same
// domain are pushed down.
//
// The returned candidates carry only url and scoreVec; the relevance field is
// not preserved. The result is nil when the window is empty.
func rerank(candidates []candidate, from, to int) []candidate {
	domainItems := make(map[string][]candidate)
	for _, c := range candidates {
		host := netloc(c.url)
		domainItems[host] = append(domainItems[host], c)
	}

	h := &domainHeap{}
	heap.Init(h)
	domainMul := make(map[string]float64, len(domainItems))
	for domain, items := range domainItems {
		domainMul[domain] = 1.0
		// Sort ascending so the best remaining item is always the last one
		// and can be taken off the end cheaply.
		sort.Slice(items, func(i, j int) bool {
			return items[i].scoreVec[0] < items[j].scoreVec[0]
		})
		top := items[len(items)-1]
		domainItems[domain] = items[:len(items)-1]
		heap.Push(h, rerankItem{top.scoreVec[0], top.url, domainMul[domain], top.scoreVec})
	}

	var result []candidate
	// ranked counts every position handed out, including the ones skipped
	// before from. Counting via len(result) would never get past from.
	for ranked := 0; h.Len() > 0 && ranked < to; ranked++ {
		item := heap.Pop(h).(rerankItem)
		if ranked >= from {
			result = append(result, candidate{url: item.url, scoreVec: item.vec})
		}
		domain := netloc(item.url)
		domainMul[domain] /= 8
		if remaining := domainItems[domain]; len(remaining) > 0 {
			next := remaining[len(remaining)-1]
			domainItems[domain] = remaining[:len(remaining)-1]
			heap.Push(h, rerankItem{next.scoreVec[0], next.url, domainMul[domain], next.scoreVec})
		}
	}
	return result
}
// ---- misc ----
// readBodyLimited reads resp.Body until it ends, a read fails, or limit bytes
// have been collected, and returns what was read as a string. The result is
// never longer than limit bytes; a non-positive limit yields "". Read errors
// (including io.EOF) simply end the read. The body is not closed.
func readBodyLimited(resp *http.Response, limit int64) string {
	if limit <= 0 {
		return ""
	}
	data := make([]byte, 0, limit)
	buf := make([]byte, 4096)
	for int64(len(data)) < limit {
		n, err := resp.Body.Read(buf)
		if n > 0 {
			// Keep only as much of this chunk as still fits under the limit.
			if room := limit - int64(len(data)); int64(n) > room {
				n = int(room)
			}
			data = append(data, buf[:n]...)
		}
		if err != nil {
			break
		}
	}
	return string(data)
}
// truncate shortens s to at most n bytes without splitting a multi-byte UTF-8
// character: if byte n falls inside a character, the cut moves back to where
// that character starts. Strings of n bytes or fewer are returned unchanged,
// and a non-positive n yields "".
func truncate(s string, n int) string {
	if len(s) <= n {
		return s
	}
	if n <= 0 {
		return ""
	}
	cut := n
	// A byte of the form 10xxxxxx is a UTF-8 continuation byte, meaning the
	// cut would land inside a character. A character has at most three
	// continuation bytes, so never back off further than that (this also keeps
	// malformed input from being truncated to nothing).
	for back := 0; back < 3 && cut > 0 && s[cut]&0xC0 == 0x80; back++ {
		cut--
	}
	return s[:cut]
}
// unescapeURL decodes percent-escapes in u with url.PathUnescape and falls back
// to returning u unchanged when it contains an invalid escape.
func unescapeURL(u string) string {
	if decoded, err := url.PathUnescape(u); err == nil {
		return decoded
	}
	return u
}
// atoi parses the leading run of ASCII digits in s as a non-negative decimal
// number. Parsing stops at the first non-digit, so "12abc" gives 12 and "",
// "-5" or "x" give 0. Values too large for int saturate at the maximum int
// instead of wrapping around.
func atoi(s string) int {
	const maxInt = int(^uint(0) >> 1)
	n := 0
	for _, c := range s {
		if c < '0' || c > '9' {
			return n
		}
		d := int(c - '0')
		// n*10+d would overflow exactly when n exceeds (maxInt-d)/10.
		if n > (maxInt-d)/10 {
			return maxInt
		}
		n = n*10 + d
	}
	return n
}
// max returns the larger of a and b.
func max(a, b int) int {
	if b > a {
		return b
	}
	return a
}
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}