// Package search implements the user-facing search HTTP server. package search import ( "container/heap" "encoding/json" "log" "math" "net/http" "net/url" "regexp" "sort" "strings" "sync" "time" "sese-engine/analyzer" "sese-engine/config" "sese-engine/info" "sese-engine/parser" "sese-engine/storage" ) // Server is the search HTTP server. type Server struct { db *storage.DB infoSvc *info.Service analyzer *analyzer.Analyzer httpCli *http.Client // for online snippet fetching } // New creates a search Server. func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server { return &Server{ db: db, infoSvc: infoSvc, analyzer: a, httpCli: &http.Client{ Timeout: time.Duration(config.OnlineSnippetTimeout) * time.Second, }, } } // Handler returns the http.Handler. func (s *Server) Handler() http.Handler { mux := http.NewServeMux() mux.HandleFunc("/search", s.handleSearch) return mux } // ListenAndServe starts the search server. func (s *Server) ListenAndServe(addr string) error { log.Printf("[search] listening on %s", addr) return http.ListenAndServe(addr, s.Handler()) } // ---- search handler ---- type searchResponse struct { Tokens []string `json:"tokens"` Counts map[string]int `json:"counts"` Results []searchResult `json:"results"` Total int `json:"total"` } type searchResult struct { Score float64 `json:"score"` URL string `json:"url"` Snippet *snippetInfo `json:"snippet,omitempty"` Relevance map[string]float64 `json:"relevance"` DomainCount int `json:"domain_count"` Factors map[string]float64 `json:"factors,omitempty"` } type snippetInfo struct { Title string `json:"title"` Description string `json:"description"` Text string `json:"text"` } var siteRe = regexp.MustCompile(`^site:(.+)$`) func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) { w.Header().Set("Access-Control-Allow-Origin", "*") w.Header().Set("Content-Type", "application/json; charset=utf-8") q := r.URL.Query().Get("q") if q == "" { if qh := r.URL.Query().Get("qh"); qh != "" { decoded, err := url.PathUnescape(qh) if err == nil { q = decoded } } } // Parse slice param "0:10" sliceStr := r.URL.Query().Get("slice") sliceFrom, sliceTo := 0, 10 if sliceStr != "" { parts := strings.SplitN(sliceStr, ":", 2) if len(parts) == 2 { a := atoi(parts[0]) b := atoi(parts[1]) if a >= 0 && b > a && b-a <= 20 { sliceFrom, sliceTo = a, b } } } // Parse tokens and site filter var tokens []string var siteFilter string for _, part := range strings.Fields(q) { if m := siteRe.FindStringSubmatch(part); len(m) > 1 { siteFilter = m[1] } else { segs := s.analyzer.Segment(part, false) for _, t := range segs { if !s.infoSvc.IsBlocked(t) { tokens = append(tokens, t) } } } } if len(tokens) > 20 { tokens = tokens[:20] } results, total := s.query(tokens, sliceFrom, sliceTo, siteFilter) // Count per keyword counts := make(map[string]int, len(tokens)) for _, t := range tokens { entries, _ := s.db.GetIndex(t) counts[t] = len(entries) } resp := searchResponse{ Tokens: tokens, Counts: counts, Results: results, Total: total, } json.NewEncoder(w).Encode(resp) } // query executes the multi-keyword search and returns ranked results. func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]searchResult, int) { if len(tokens) == 0 { return nil, 0 } // Load inverted index for each token type tokenIndex struct { token string entries []storage.IndexEntry defVal float64 } tokenIndexes := make([]tokenIndex, 0, len(tokens)) for _, t := range tokens { entries, _ := s.db.GetIndex(t) defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(config.MaxURLsPerKey) if len(entries) >= config.MaxURLsPerKey { weights := make([]float64, len(entries)) for i, e := range entries { weights[i] = float64(e.Weight) } sort.Sort(sort.Reverse(sort.Float64Slice(weights))) defVal = math.Max(1.0/10000, weights[config.MaxURLsPerKey-1]/2) } tokenIndexes = append(tokenIndexes, tokenIndex{t, entries, defVal}) } // Build URL → per-token weights map urlWeights := make(map[string]map[string]float64) for _, ti := range tokenIndexes { for _, e := range ti.entries { if urlWeights[e.URL] == nil { urlWeights[e.URL] = make(map[string]float64) } urlWeights[e.URL][ti.token] = float64(e.Weight) } } // Site filter total := len(urlWeights) if siteFilter != "" { filtered := make(map[string]map[string]float64) for u, vs := range urlWeights { h := netloc(u) if matchSite(h, siteFilter) { filtered[u] = vs } } urlWeights = filtered total = len(urlWeights) } // Build default value map defVals := make(map[string]float64, len(tokenIndexes)) for _, ti := range tokenIndexes { defVals[ti.token] = ti.defVal } // Compute relevance + initial score for each URL candidates := make([]candidate, 0, len(urlWeights)) for u, vs := range urlWeights { rel := 1.0 for _, ti := range tokenIndexes { vp := vs[ti.token] if vp == 0 { vp = defVals[ti.token] } if vp > 0.06 { vp = math.Log((vp-0.06)*40+1)/40 + 0.06 } rel *= vp } prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight bad := badURL(u) adjust := s.infoSvc.Adjust(netloc(u)) score := rel * prosper * (1 - bad) * adjust * 0.1 var vec [12]float64 vec[0] = score vec[1] = rel vec[2] = prosper vec[3] = 1 - bad vec[4] = 1 // language multiplier placeholder vec[5] = 1 // repetition placeholder vec[6] = adjust vec[7] = 1 // time multiplier placeholder vec[8] = 1 // consecutive keyword placeholder vec[9] = 1 // keyword content placeholder vec[10] = 1 // URL time placeholder vec[11] = 0.1 candidates = append(candidates, candidate{u, rel, vec}) } // Early relevance threshold sort.Slice(candidates, func(i, j int) bool { return candidates[i].scoreVec[0] > candidates[j].scoreVec[0] }) // Apply site info factors to top 256 now := time.Now().Unix() limit256 := 256 if len(candidates) < 256 { limit256 = len(candidates) } var wg sync.WaitGroup for i := 0; i < limit256; i++ { wg.Add(1) go func(idx int) { defer wg.Done() c := &candidates[idx] h := netloc(c.url) siteInfo, _ := s.db.GetSiteInfo(h) langMul := languageMultiplier(siteInfo) timeMul := timeMul(siteInfo, now) urlTimeMul := urlTimeMul(s.db, c.url, now) c.scoreVec[0] = c.scoreVec[0] * 10 * langMul * timeMul * urlTimeMul c.scoreVec[4] = langMul c.scoreVec[7] = timeMul c.scoreVec[10] = urlTimeMul }(i) } wg.Wait() sort.Slice(candidates, func(i, j int) bool { return candidates[i].scoreVec[0] > candidates[j].scoreVec[0] }) // Apply consecutive-keyword and repetition bonuses to top 80 limit80 := 80 if len(candidates) < 80 { limit80 = len(candidates) } titles := make([]string, limit80) for i := 0; i < limit80; i++ { if snippet, err := s.db.GetSnippet(candidates[i].url); err == nil { titles[i] = snippet.Title } } // Repetition penaliser for i := 0; i < limit80; i++ { h := repetitionSimilarity(titles, i) consecutive := consecutiveCount(titles[i], tokens) repMul := 1.0 if h > 0.5 { repMul = 1 - (h - 0.5) } consMul := math.Pow(config.ConsecutiveKeyWeight, float64(consecutive)) candidates[i].scoreVec[0] *= repMul * consMul candidates[i].scoreVec[5] = repMul candidates[i].scoreVec[8] = consMul } sort.Slice(candidates, func(i, j int) bool { return candidates[i].scoreVec[0] > candidates[j].scoreVec[0] }) // Re-rank: interleave domains reranked := rerank(candidates, from, to) // Fetch snippets and build output results := make([]searchResult, 0, len(reranked)) var snippetMu sync.Mutex var snippetWg sync.WaitGroup for _, c := range reranked { snippetWg.Add(1) go func(cand candidate) { defer snippetWg.Done() snip := s.getSnippet(cand.url) r := searchResult{ Score: cand.scoreVec[0], URL: unescapeURL(cand.url), Snippet: snip, Relevance: make(map[string]float64), DomainCount: 0, Factors: map[string]float64{ "relevance": cand.scoreVec[1], "backlink": cand.scoreVec[2], "url_quality": cand.scoreVec[3], "language": cand.scoreVec[4], "repetition": cand.scoreVec[5], "adjust": cand.scoreVec[6], "site_time": cand.scoreVec[7], "consecutive": cand.scoreVec[8], "url_time": cand.scoreVec[10], }, } for _, ti := range tokenIndexes { r.Relevance[ti.token] = urlWeights[cand.url][ti.token] } snippetMu.Lock() results = append(results, r) snippetMu.Unlock() }(c) } snippetWg.Wait() // Preserve order (goroutines may reorder) urlOrder := make(map[string]int) for i, c := range reranked { urlOrder[c.url] = i } sort.Slice(results, func(i, j int) bool { return urlOrder[results[i].URL] < urlOrder[results[j].URL] }) return results, total } // getSnippet fetches (or caches) a snippet for a URL. func (s *Server) getSnippet(rawURL string) *snippetInfo { // Try cache first if entry, err := s.db.GetSnippet(rawURL); err == nil { snip := buildSnippet(entry) return snip } if !config.UseOnlineSnippet { return nil } // Fetch online with a simple HTTP client (no robots.txt check for search snippets) req, err := http.NewRequest("GET", rawURL, nil) if err != nil { return nil } req.Header.Set("User-Agent", config.SpiderName) resp, err := s.httpCli.Do(req) if err != nil || resp.StatusCode != 200 { return nil } defer resp.Body.Close() ct := resp.Header.Get("Content-Type") if !strings.Contains(ct, "text/html") { return nil } body := readBodyLimited(resp, 60000) title, desc, text, _ := parser.ParseHTML(body, resp.Request.URL.String()) entry := &storage.SnippetEntry{ Title: title, Description: truncate(desc, 256), Text: truncate(text, 256), Timestamp: time.Now().Unix(), } _ = s.db.SetSnippet(rawURL, entry) return buildSnippet(entry) } func buildSnippet(entry *storage.SnippetEntry) *snippetInfo { if entry == nil || (entry.Title == "" && entry.Description == "" && entry.Text == "") { return nil } return &snippetInfo{ Title: entry.Title, Description: entry.Description, Text: entry.Text, } } // ---- scoring helpers ---- func languageMultiplier(si *storage.SiteInfo) float64 { if si == nil || len(si.Languages) == 0 { return 1.0 } total := 0.0 for _, v := range si.Languages { total += v } chinese := si.Languages["zh"] / total weird := (total - si.Languages["zh"] - si.Languages["en"] - si.Languages["ja"]) / total return 1 + chinese*config.LanguageWeight - weird*config.LanguageWeight } func timeMul(si *storage.SiteInfo, now int64) float64 { if si == nil { return 1.0 } t := si.LastVisitTime if t == 0 { t = 1648000000 } days := (now - t) / (3600 * 24) if days < 0 { days = 0 } if days > 180 { days = 180 } if days > 0 { days-- } return math.Pow(config.WeightDailyDecay, float64(days)) } func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 { entry, err := db.GetSnippet(rawURL) if err != nil || entry == nil { return 1.0 } days := (now - entry.Timestamp) / (3600 * 24) if days <= 30 { return 1.0 } return math.Pow((2+config.WeightDailyDecay)/3, float64(days)) } func badURL(u string) float64 { s := math.Max(0, float64(len(u)-30)/200.0) if strings.Contains(u, ".htm") || strings.Contains(u, ".php") { s += (1 - s) * 0.3 } if strings.Count(strings.TrimRight(u, "/"), "/") > 2 { s += (1 - s) * 0.1 } if len(u) < 5 || u[4] == ':' { s += (1 - s) * 0.3 } return math.Min(s, 0.9) } func netloc(rawURL string) string { parts := strings.SplitN(rawURL, "/", 4) if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" { return parts[2] } return rawURL } func matchSite(host, pattern string) bool { if host == pattern { return true } if strings.HasSuffix(host, "."+pattern) { return true } return false } func consecutiveCount(title string, tokens []string) int { c := 0 for i := 0; i < len(tokens)-1; i++ { if strings.Contains(title, tokens[i]+tokens[i+1]) { c++ } } return c } func repetitionSimilarity(titles []string, idx int) float64 { if idx == 0 { return 0 } t := titles[idx] if t == "" { return 0 } best := 0.0 for _, prev := range titles[:idx] { if prev == "" { continue } sim := 1 - float64(levenshtein(t, prev))/float64(max(len(t), len(prev))) if sim > best { best = sim } } return best } func levenshtein(a, b string) int { ra := []rune(a) rb := []rune(b) la, lb := len(ra), len(rb) if la == 0 { return lb } if lb == 0 { return la } prev := make([]int, lb+1) curr := make([]int, lb+1) for j := 0; j <= lb; j++ { prev[j] = j } for i := 1; i <= la; i++ { curr[0] = i for j := 1; j <= lb; j++ { cost := 1 if ra[i-1] == rb[j-1] { cost = 0 } curr[j] = min3(curr[j-1]+1, prev[j]+1, prev[j-1]+cost) } prev, curr = curr, prev } return prev[lb] } func min3(a, b, c int) int { if a < b { if a < c { return a } return c } if b < c { return b } return c } // rerank interleaves results from different domains. type domainHeap []rerankItem type rerankItem struct { score float64 url string domainMul float64 vec [12]float64 } func (h domainHeap) Len() int { return len(h) } func (h domainHeap) Less(i, j int) bool { return h[i].score*h[i].domainMul > h[j].score*h[j].domainMul } func (h domainHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] } func (h *domainHeap) Push(x interface{}) { *h = append(*h, x.(rerankItem)) } func (h *domainHeap) Pop() interface{} { old := *h n := len(old) x := old[n-1] *h = old[:n-1] return x } type candidate struct { url string relevance float64 scoreVec [12]float64 } func rerank(candidates []candidate, from, to int) []candidate { domainItems := make(map[string][]candidate) for _, c := range candidates { h := netloc(c.url) domainItems[h] = append(domainItems[h], c) } h := &domainHeap{} heap.Init(h) domainMul := make(map[string]float64) for domain, items := range domainItems { domainMul[domain] = 1.0 // Sort items within domain sort.Slice(items, func(i, j int) bool { return items[i].scoreVec[0] < items[j].scoreVec[0] }) top := items[len(items)-1] domainItems[domain] = items[:len(items)-1] heap.Push(h, rerankItem{top.scoreVec[0], top.url, domainMul[domain], top.scoreVec}) } var result []candidate for h.Len() > 0 && len(result) < to { item := heap.Pop(h).(rerankItem) if len(result) >= from { result = append(result, candidate{url: item.url, scoreVec: item.vec}) } domain := netloc(item.url) domainMul[domain] /= 8 remaining := domainItems[domain] if len(remaining) > 0 { next := remaining[len(remaining)-1] domainItems[domain] = remaining[:len(remaining)-1] heap.Push(h, rerankItem{next.scoreVec[0], next.url, domainMul[domain], next.scoreVec}) } } return result } // ---- misc ---- func readBodyLimited(resp *http.Response, limit int64) string { data := make([]byte, 0, limit) buf := make([]byte, 4096) var total int64 for { n, err := resp.Body.Read(buf) if n > 0 { data = append(data, buf[:n]...) total += int64(n) if total >= limit { break } } if err != nil { break } } return string(data) } func truncate(s string, n int) string { if len(s) <= n { return s } return s[:n] } func unescapeURL(u string) string { decoded, err := url.PathUnescape(u) if err != nil { return u } return decoded } func atoi(s string) int { n := 0 for _, c := range s { if c < '0' || c > '9' { return n } n = n*10 + int(c-'0') } return n } func max(a, b int) int { if a > b { return a } return b } func min(a, b int) int { if a < b { return a } return b }