// sese-engine-go/search/server.go

// Package search implements the user-facing search HTTP server.
package search

import (
	"container/heap"
	"encoding/json"
	"log"
	"math"
	"net/http"
	"net/url"
	"regexp"
	"sort"
	"strings"
	"sync"
	"time"

	"sese-engine/analyzer"
	"sese-engine/config"
	"sese-engine/info"
	"sese-engine/parser"
	"sese-engine/storage"
)

// Server is the search HTTP server. It bundles the storage handle, the info
// service and the text analyzer that the /search handler relies on, plus an
// HTTP client used when a snippet has to be fetched from the live page.
type Server struct {
	// db is queried for inverted-index entries, cached snippets and site info.
	db       *storage.DB
	// infoSvc supplies token blocking and the Prosper/Adjust ranking factors.
	infoSvc  *info.Service
	// analyzer segments query words into index tokens.
	analyzer *analyzer.Analyzer
	httpCli  *http.Client // for online snippet fetching
}

// New builds a search Server around the given storage handle, info service
// and analyzer. The server owns a dedicated HTTP client for online snippet
// fetching whose timeout is config.OnlineSnippetTimeout seconds.
func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
	snippetTimeout := time.Duration(config.OnlineSnippetTimeout) * time.Second

	srv := &Server{db: db, infoSvc: infoSvc, analyzer: a}
	srv.httpCli = &http.Client{Timeout: snippetTimeout}
	return srv
}

// Handler returns an http.Handler that routes /search to the search endpoint.
func (s *Server) Handler() http.Handler {
	routes := http.NewServeMux()
	routes.Handle("/search", http.HandlerFunc(s.handleSearch))
	return routes
}

// ListenAndServe starts the search server on addr and blocks until the
// listener fails; the returned error is always non-nil.
//
// An explicit http.Server is used instead of http.ListenAndServe so that a
// header-read timeout can be set: without it a client that never finishes
// sending its request headers holds a connection open indefinitely. No
// read/write timeout is imposed on the handler itself, because a search may
// legitimately wait on online snippet fetches.
func (s *Server) ListenAndServe(addr string) error {
	log.Printf("[search] listening on %s", addr)
	srv := &http.Server{
		Addr:              addr,
		Handler:           s.Handler(),
		ReadHeaderTimeout: 10 * time.Second,
	}
	return srv.ListenAndServe()
}

// ---- search handler ----

// searchResponse is the JSON body written by the /search handler.
type searchResponse struct {
	// Tokens are the query tokens actually searched (after segmentation,
	// block-list filtering and the 20-token cap).
	Tokens  []string            `json:"tokens"`
	// Counts maps each token to the number of entries in its inverted index.
	Counts  map[string]int      `json:"counts"`
	// Results is the requested page of ranked results.
	Results []searchResult      `json:"results"`
	// Total is the candidate count reported by query.
	Total   int                 `json:"total"`
}

// searchResult is one ranked hit in a searchResponse.
type searchResult struct {
	// Score is the final ranking score (slot 0 of the candidate score vector).
	Score       float64            `json:"score"`
	// URL is the result URL after percent-unescaping.
	URL         string             `json:"url"`
	// Snippet holds title/description/text; omitted from JSON when nil.
	Snippet     *snippetInfo       `json:"snippet,omitempty"`
	// Relevance maps each query token to this URL's index weight for it
	// (zero when the URL is not in that token's index).
	Relevance   map[string]float64 `json:"relevance"`
	// DomainCount is currently always set to 0 by query.
	DomainCount int                `json:"domain_count"`
	// Factors exposes the individual multipliers that produced Score.
	Factors     map[string]float64 `json:"factors,omitempty"`
}

// snippetInfo is the page summary attached to a search result. Its fields
// are copied from a storage.SnippetEntry by buildSnippet.
type snippetInfo struct {
	Title       string `json:"title"`
	Description string `json:"description"`
	Text        string `json:"text"`
}

// siteRe matches a whole query word of the form "site:<pattern>" and captures
// the non-empty pattern after the colon; handleSearch uses it to extract the
// site filter from the query.
var siteRe = regexp.MustCompile(`^site:(.+)$`)

// handleSearch serves /search and writes a searchResponse as JSON.
//
// Query parameters:
//   - q:     the search text.
//   - qh:    fallback search text, used only when q is empty; it is
//     path-unescaped once more after normal query decoding.
//   - slice: result window "from:to" (default 0:10). It is accepted only when
//     from >= 0, to > from and the window spans at most 20 results; otherwise
//     the default window is used.
//
// A whitespace-separated word of the form "site:<pattern>" sets the site
// filter (the last one wins); every other word is segmented by the analyzer
// and blocked tokens are dropped. At most 20 tokens are searched.
func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Access-Control-Allow-Origin", "*")
	w.Header().Set("Content-Type", "application/json; charset=utf-8")

	// r.URL.Query() re-parses the raw query on every call, so parse it once.
	params := r.URL.Query()

	q := params.Get("q")
	if q == "" {
		if qh := params.Get("qh"); qh != "" {
			if decoded, err := url.PathUnescape(qh); err == nil {
				q = decoded
			}
		}
	}

	// Parse slice param "0:10".
	sliceFrom, sliceTo := 0, 10
	if sliceStr := params.Get("slice"); sliceStr != "" {
		parts := strings.SplitN(sliceStr, ":", 2)
		if len(parts) == 2 {
			a := atoi(parts[0])
			b := atoi(parts[1])
			if a >= 0 && b > a && b-a <= 20 {
				sliceFrom, sliceTo = a, b
			}
		}
	}

	// Parse tokens and site filter.
	var tokens []string
	var siteFilter string
	for _, part := range strings.Fields(q) {
		if m := siteRe.FindStringSubmatch(part); len(m) > 1 {
			siteFilter = m[1]
			continue
		}
		for _, t := range s.analyzer.Segment(part, false) {
			if !s.infoSvc.IsBlocked(t) {
				tokens = append(tokens, t)
			}
		}
	}

	// Bound the work a single request can trigger.
	if len(tokens) > 20 {
		tokens = tokens[:20]
	}

	results, total := s.query(tokens, sliceFrom, sliceTo, siteFilter)

	// Count per keyword.
	counts := make(map[string]int, len(tokens))
	for _, t := range tokens {
		// Best effort: on a lookup error the token is reported with whatever
		// entries were returned (none, presumably — TODO confirm).
		entries, _ := s.db.GetIndex(t)
		counts[t] = len(entries)
	}

	resp := searchResponse{
		Tokens:  tokens,
		Counts:  counts,
		Results: results,
		Total:   total,
	}
	// Headers (and possibly part of the body) are already on the wire, so the
	// only useful reaction to an encode/write failure is to record it.
	if err := json.NewEncoder(w).Encode(resp); err != nil {
		log.Printf("[search] writing response: %v", err)
	}
}

// query executes the multi-keyword search and returns the requested page of
// ranked results together with the number of candidate URLs.
//
// tokens are the already-segmented keywords; an empty slice yields (nil, 0).
// from and to select the half-open window [from, to) of the final,
// domain-interleaved ranking. siteFilter, when non-empty, keeps only URLs
// whose host matches it (see matchSite). The returned count is the number of
// distinct URLs present in at least one token's index, after site filtering.
//
// Ranking proceeds in stages so that the expensive factors are only computed
// for the most promising candidates:
//  1. every candidate gets relevance * backlink * URL-quality * adjust;
//  2. the top 256 additionally get language, site-time and URL-time factors;
//  3. the top 80 additionally get title repetition and consecutive-keyword
//     factors;
//  4. rerank interleaves domains and cuts out the requested window;
//  5. snippets for the window are fetched concurrently.
func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]searchResult, int) {
	if len(tokens) == 0 {
		return nil, 0
	}

	// Load the inverted index for each token.
	type tokenIndex struct {
		token   string
		entries []storage.IndexEntry
		defVal  float64 // weight assumed for URLs missing from this token's index
	}
	tokenIndexes := make([]tokenIndex, 0, len(tokens))
	for _, t := range tokens {
		// Lookup errors are treated like an empty posting list.
		entries, _ := s.db.GetIndex(t)
		defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(config.MaxURLsPerKey)
		if len(entries) >= config.MaxURLsPerKey {
			// The posting list is full, so URLs that did not make it in may
			// still be relevant: assume half of the weakest retained weight.
			weights := make([]float64, len(entries))
			for i, e := range entries {
				weights[i] = float64(e.Weight)
			}
			sort.Sort(sort.Reverse(sort.Float64Slice(weights)))
			defVal = math.Max(1.0/10000, weights[config.MaxURLsPerKey-1]/2)
		}
		tokenIndexes = append(tokenIndexes, tokenIndex{t, entries, defVal})
	}

	// Build URL -> per-token weights map.
	urlWeights := make(map[string]map[string]float64)
	for _, ti := range tokenIndexes {
		for _, e := range ti.entries {
			if urlWeights[e.URL] == nil {
				urlWeights[e.URL] = make(map[string]float64)
			}
			urlWeights[e.URL][ti.token] = float64(e.Weight)
		}
	}

	// Site filter.
	if siteFilter != "" {
		filtered := make(map[string]map[string]float64)
		for u, vs := range urlWeights {
			if matchSite(netloc(u), siteFilter) {
				filtered[u] = vs
			}
		}
		urlWeights = filtered
	}
	total := len(urlWeights)

	// Stage 1: relevance and initial score for every URL.
	candidates := make([]candidate, 0, len(urlWeights))
	for u, vs := range urlWeights {
		rel := 1.0
		for _, ti := range tokenIndexes {
			vp := vs[ti.token]
			if vp == 0 {
				vp = ti.defVal
			}
			// Dampen large weights logarithmically above 0.06.
			if vp > 0.06 {
				vp = math.Log((vp-0.06)*40+1)/40 + 0.06
			}
			rel *= vp
		}
		prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight
		bad := badURL(u)
		adjust := s.infoSvc.Adjust(netloc(u))
		score := rel * prosper * (1 - bad) * adjust * 0.1

		var vec [12]float64
		vec[0] = score
		vec[1] = rel
		vec[2] = prosper
		vec[3] = 1 - bad
		vec[4] = 1  // language multiplier placeholder
		vec[5] = 1  // repetition placeholder
		vec[6] = adjust
		vec[7] = 1  // time multiplier placeholder
		vec[8] = 1  // consecutive keyword placeholder
		vec[9] = 1  // keyword content placeholder
		vec[10] = 1 // URL time placeholder
		vec[11] = 0.1

		candidates = append(candidates, candidate{u, rel, vec})
	}

	// sortByScore orders candidates by descending current score.
	sortByScore := func() {
		sort.Slice(candidates, func(i, j int) bool {
			return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
		})
	}
	sortByScore()

	// Stage 2: apply site info factors to the top 256. The factor 10 cancels
	// the 0.1 applied in stage 1 for these candidates only.
	now := time.Now().Unix()
	limit256 := 256
	if len(candidates) < 256 {
		limit256 = len(candidates)
	}

	var wg sync.WaitGroup
	for i := 0; i < limit256; i++ {
		wg.Add(1)
		// Each goroutine touches only its own candidates[idx], so no locking
		// is required.
		go func(idx int) {
			defer wg.Done()
			c := &candidates[idx]
			// A failed lookup leaves siteInfo nil, which the helpers treat as
			// "no information" (multiplier 1).
			siteInfo, _ := s.db.GetSiteInfo(netloc(c.url))
			langMul := languageMultiplier(siteInfo)
			siteTimeMul := timeMul(siteInfo, now)
			pageTimeMul := urlTimeMul(s.db, c.url, now)

			c.scoreVec[0] = c.scoreVec[0] * 10 * langMul * siteTimeMul * pageTimeMul
			c.scoreVec[4] = langMul
			c.scoreVec[7] = siteTimeMul
			c.scoreVec[10] = pageTimeMul
		}(i)
	}
	wg.Wait()
	sortByScore()

	// Stage 3: consecutive-keyword and repetition factors for the top 80,
	// both derived from the cached page titles.
	limit80 := 80
	if len(candidates) < 80 {
		limit80 = len(candidates)
	}

	titles := make([]string, limit80)
	for i := 0; i < limit80; i++ {
		if snippet, err := s.db.GetSnippet(candidates[i].url); err == nil {
			titles[i] = snippet.Title
		}
	}

	for i := 0; i < limit80; i++ {
		h := repetitionSimilarity(titles, i)
		consecutive := consecutiveCount(titles[i], tokens)
		// Titles that closely resemble a higher-ranked title are penalised.
		repMul := 1.0
		if h > 0.5 {
			repMul = 1 - (h - 0.5)
		}
		consMul := math.Pow(config.ConsecutiveKeyWeight, float64(consecutive))
		candidates[i].scoreVec[0] *= repMul * consMul
		candidates[i].scoreVec[5] = repMul
		candidates[i].scoreVec[8] = consMul
	}
	sortByScore()

	// Stage 4: re-rank, interleaving domains.
	reranked := rerank(candidates, from, to)

	// Stage 5: fetch snippets concurrently and build the output. Every
	// goroutine writes to its own slot, which keeps the reranked order without
	// a mutex or a follow-up sort. (Sorting by URL afterwards is unreliable
	// because the emitted URL is unescaped and may no longer equal, or be
	// unique among, the raw candidate URLs.)
	results := make([]searchResult, len(reranked))
	var snippetWg sync.WaitGroup
	for i := range reranked {
		snippetWg.Add(1)
		go func(idx int) {
			defer snippetWg.Done()
			cand := reranked[idx]

			// urlWeights is only read from here on, so concurrent access is safe.
			relevance := make(map[string]float64, len(tokenIndexes))
			for _, ti := range tokenIndexes {
				relevance[ti.token] = urlWeights[cand.url][ti.token]
			}

			results[idx] = searchResult{
				Score:       cand.scoreVec[0],
				URL:         unescapeURL(cand.url),
				Snippet:     s.getSnippet(cand.url),
				Relevance:   relevance,
				DomainCount: 0,
				Factors: map[string]float64{
					"relevance":   cand.scoreVec[1],
					"backlink":    cand.scoreVec[2],
					"url_quality": cand.scoreVec[3],
					"language":    cand.scoreVec[4],
					"repetition":  cand.scoreVec[5],
					"adjust":      cand.scoreVec[6],
					"site_time":   cand.scoreVec[7],
					"consecutive": cand.scoreVec[8],
					"url_time":    cand.scoreVec[10],
				},
			}
		}(i)
	}
	snippetWg.Wait()

	return results, total
}

// getSnippet returns the snippet for rawURL, or nil when none is available.
//
// The stored snippet is used whenever the lookup succeeds, even if it turns
// out to be empty (buildSnippet then yields nil). Otherwise, and only when
// config.UseOnlineSnippet is set, the page is fetched with the server's HTTP
// client, parsed, stored for later requests, and returned. Any fetch problem
// (request error, non-200 status, non-HTML content type) yields nil.
func (s *Server) getSnippet(rawURL string) *snippetInfo {
	// Try cache first.
	if entry, err := s.db.GetSnippet(rawURL); err == nil {
		return buildSnippet(entry)
	}
	if !config.UseOnlineSnippet {
		return nil
	}

	// Fetch online with a simple HTTP client (no robots.txt check for search snippets).
	req, err := http.NewRequest(http.MethodGet, rawURL, nil)
	if err != nil {
		return nil
	}
	req.Header.Set("User-Agent", config.SpiderName)
	resp, err := s.httpCli.Do(req)
	if err != nil {
		return nil
	}
	// Close the body on every path from here on, including the status and
	// content-type rejections below; otherwise the connection leaks.
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return nil
	}
	if ct := resp.Header.Get("Content-Type"); !strings.Contains(ct, "text/html") {
		return nil
	}

	body := readBodyLimited(resp, 60000)
	// Parse relative to the final URL so redirects are taken into account.
	title, desc, text, _ := parser.ParseHTML(body, resp.Request.URL.String())
	entry := &storage.SnippetEntry{
		Title:       title,
		Description: truncate(desc, 256),
		Text:        truncate(text, 256),
		Timestamp:   time.Now().Unix(),
	}
	// Caching is best effort: a failed write only means the page is fetched
	// again next time.
	_ = s.db.SetSnippet(rawURL, entry)
	return buildSnippet(entry)
}

// buildSnippet converts a stored snippet entry into its API representation.
// It returns nil for a nil entry or one whose title, description and text are
// all empty.
func buildSnippet(entry *storage.SnippetEntry) *snippetInfo {
	if entry == nil {
		return nil
	}
	hasContent := entry.Title != "" || entry.Description != "" || entry.Text != ""
	if !hasContent {
		return nil
	}
	out := new(snippetInfo)
	out.Title = entry.Title
	out.Description = entry.Description
	out.Text = entry.Text
	return out
}

// ---- scoring helpers ----

// languageMultiplier derives a score multiplier from a site's language
// statistics.
//
// The share of "zh" raises the multiplier and the share of anything other
// than "zh", "en" or "ja" lowers it, both scaled by config.LanguageWeight.
// It returns the neutral value 1 when si is nil, when no language data is
// recorded, or when the recorded values sum to zero (which would otherwise
// divide 0 by 0 and poison the score with NaN).
func languageMultiplier(si *storage.SiteInfo) float64 {
	if si == nil || len(si.Languages) == 0 {
		return 1.0
	}
	total := 0.0
	for _, v := range si.Languages {
		total += v
	}
	if total == 0 {
		return 1.0
	}
	chinese := si.Languages["zh"] / total
	weird := (total - si.Languages["zh"] - si.Languages["en"] - si.Languages["ja"]) / total
	return 1 + chinese*config.LanguageWeight - weird*config.LanguageWeight
}

// timeMul returns the site-freshness multiplier for a site: the daily decay
// factor config.WeightDailyDecay raised to the number of whole days since the
// site's last visit, clamped to [0, 180] and then reduced by one when
// positive. A nil si yields 1; a zero last-visit time is replaced by a fixed
// fallback timestamp.
func timeMul(si *storage.SiteInfo, now int64) float64 {
	if si == nil {
		return 1.0
	}

	const (
		secondsPerDay = 3600 * 24
		fallbackVisit = 1648000000
		maxDays       = 180
	)

	last := si.LastVisitTime
	if last == 0 {
		last = fallbackVisit
	}

	days := (now - last) / secondsPerDay
	switch {
	case days < 0:
		days = 0
	case days > maxDays:
		days = maxDays
	}
	if days > 0 {
		days--
	}
	return math.Pow(config.WeightDailyDecay, float64(days))
}

// urlTimeMul returns the page-freshness multiplier for rawURL, based on the
// timestamp of its stored snippet. Pages without a stored snippet, or whose
// snippet is at most 30 days old, get 1; older pages get
// ((2+config.WeightDailyDecay)/3) raised to the age in whole days.
func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 {
	entry, err := db.GetSnippet(rawURL)
	if err == nil && entry != nil {
		ageDays := (now - entry.Timestamp) / (3600 * 24)
		if ageDays > 30 {
			return math.Pow((2+config.WeightDailyDecay)/3, float64(ageDays))
		}
	}
	return 1.0
}

// badURL scores how "low quality" a URL looks, from 0 (fine) up to 0.9.
//
// The base penalty grows with length beyond 30 bytes. It is then moved a
// fraction of the remaining distance towards 1 for each of: containing ".htm"
// or ".php" (0.3), having more than two slashes once trailing slashes are
// stripped (0.1), and being shorter than 5 bytes or having ':' as its fifth
// byte, as in "http:" (0.3).
func badURL(u string) float64 {
	penalty := math.Max(0, float64(len(u)-30)/200.0)

	// bump moves penalty the given fraction of the way towards 1.
	bump := func(frac float64) {
		penalty += (1 - penalty) * frac
	}

	hasScriptExt := strings.Contains(u, ".htm") || strings.Contains(u, ".php")
	if hasScriptExt {
		bump(0.3)
	}
	if slashes := strings.Count(strings.TrimRight(u, "/"), "/"); slashes > 2 {
		bump(0.1)
	}
	if len(u) < 5 || u[4] == ':' {
		bump(0.3)
	}
	return math.Min(penalty, 0.9)
}

// netloc extracts the host part (everything between the scheme's "//" and the
// next "/") from an http or https URL. Any other input is returned unchanged.
func netloc(rawURL string) string {
	for _, scheme := range [...]string{"http://", "https://"} {
		if !strings.HasPrefix(rawURL, scheme) {
			continue
		}
		rest := rawURL[len(scheme):]
		if end := strings.IndexByte(rest, '/'); end >= 0 {
			return rest[:end]
		}
		return rest
	}
	return rawURL
}

// matchSite reports whether host is exactly pattern or a subdomain of it
// (that is, host ends with "." followed by pattern).
func matchSite(host, pattern string) bool {
	return host == pattern || strings.HasSuffix(host, "."+pattern)
}

// consecutiveCount returns how many adjacent token pairs appear in title
// directly concatenated, in query order.
func consecutiveCount(title string, tokens []string) int {
	hits := 0
	for i := 1; i < len(tokens); i++ {
		pair := tokens[i-1] + tokens[i]
		if strings.Contains(title, pair) {
			hits++
		}
	}
	return hits
}

// repetitionSimilarity returns the highest similarity between titles[idx] and
// any non-empty title before it, in [0, 1] (1 means identical).
//
// Similarity is 1 - editDistance/longerLength. Both quantities are measured
// in runes: levenshtein works on runes, so dividing by a byte length would
// overstate the similarity of multi-byte (for example CJK) titles.
// It returns 0 for the first title, an empty title, or an out-of-range idx.
func repetitionSimilarity(titles []string, idx int) float64 {
	if idx <= 0 || idx >= len(titles) {
		return 0
	}
	t := titles[idx]
	if t == "" {
		return 0
	}
	tLen := len([]rune(t))

	best := 0.0
	for _, prev := range titles[:idx] {
		if prev == "" {
			continue
		}
		longest := len([]rune(prev))
		if tLen > longest {
			longest = tLen
		}
		sim := 1 - float64(levenshtein(t, prev))/float64(longest)
		if sim > best {
			best = sim
		}
	}
	return best
}

// levenshtein returns the edit distance between a and b, counted in runes:
// the minimum number of single-rune insertions, deletions and substitutions
// that turn one string into the other.
func levenshtein(a, b string) int {
	src, dst := []rune(a), []rune(b)
	switch {
	case len(src) == 0:
		return len(dst)
	case len(dst) == 0:
		return len(src)
	}

	// Two rolling rows of the DP table: above is row i-1, row is row i.
	above := make([]int, len(dst)+1)
	row := make([]int, len(dst)+1)
	for j := range above {
		above[j] = j
	}

	for i, sr := range src {
		row[0] = i + 1
		for j, dr := range dst {
			substitute := above[j]
			if sr != dr {
				substitute++
			}
			row[j+1] = min3(row[j]+1, above[j+1]+1, substitute)
		}
		above, row = row, above
	}
	return above[len(dst)]
}

// min3 returns the smallest of three ints.
func min3(a, b, c int) int {
	smallest := a
	if b < smallest {
		smallest = b
	}
	if c < smallest {
		smallest = c
	}
	return smallest
}

// domainHeap is a max-heap of rerankItem ordered by score*domainMul. It
// provides the methods of heap.Interface and is the priority queue behind
// rerank's domain interleaving.
type domainHeap []rerankItem

// rerankItem is one heap entry: a URL with its score, the multiplier its
// domain had when the entry was pushed, and the full score vector.
type rerankItem struct {
	score     float64
	url       string
	domainMul float64
	vec       [12]float64
}

// Len returns the number of queued items.
func (h domainHeap) Len() int {
	return len(h)
}

// Less orders items so that the largest effective score comes first.
func (h domainHeap) Less(i, j int) bool {
	left := h[i].score * h[i].domainMul
	right := h[j].score * h[j].domainMul
	return left > right
}

// Swap exchanges the items at i and j.
func (h domainHeap) Swap(i, j int) {
	h[j], h[i] = h[i], h[j]
}

// Push appends x, which must be a rerankItem.
func (h *domainHeap) Push(x interface{}) {
	*h = append(*h, x.(rerankItem))
}

// Pop removes and returns the last item of the underlying slice.
func (h *domainHeap) Pop() interface{} {
	items := *h
	last := len(items) - 1
	tail := items[last]
	*h = items[:last]
	return tail
}

// candidate is a URL under consideration during ranking.
//
// scoreVec slots, as filled in by query: 0 current score, 1 relevance,
// 2 backlink factor, 3 URL quality (1 - badURL), 4 language, 5 repetition,
// 6 adjust, 7 site time, 8 consecutive keywords, 9 keyword content
// (placeholder, always 1), 10 URL time, 11 the constant 0.1.
//
// NOTE: query and rerank build candidates with positional and partial
// literals, so the field order must not change.
type candidate struct {
	url       string
	relevance float64
	scoreVec  [12]float64
}

// rerank interleaves results from different domains and returns the window
// [from, to) of the resulting order.
//
// Candidates are grouped by host. The best remaining candidate of every host
// sits in a max-heap keyed by score*domainMul; each time a host's candidate
// is taken, that host's multiplier is divided by 8 before its next candidate
// is queued, so a single domain cannot dominate the top of the list.
//
// The first `from` picks are skipped by counting picks, not by inspecting the
// length of the output, so pages after the first are populated correctly.
// Returned candidates carry only url and scoreVec. The result is nil when the
// window is empty.
func rerank(candidates []candidate, from, to int) []candidate {
	domainItems := make(map[string][]candidate)
	for _, c := range candidates {
		host := netloc(c.url)
		domainItems[host] = append(domainItems[host], c)
	}

	h := &domainHeap{}
	heap.Init(h)
	domainMul := make(map[string]float64, len(domainItems))

	for domain, items := range domainItems {
		domainMul[domain] = 1.0
		// Ascending order, so the best candidate is popped off the end.
		sort.Slice(items, func(i, j int) bool {
			return items[i].scoreVec[0] < items[j].scoreVec[0]
		})
		top := items[len(items)-1]
		domainItems[domain] = items[:len(items)-1]
		heap.Push(h, rerankItem{top.scoreVec[0], top.url, domainMul[domain], top.scoreVec})
	}

	var result []candidate
	for rank := 0; h.Len() > 0 && rank < to; rank++ {
		item := heap.Pop(h).(rerankItem)
		if rank >= from {
			result = append(result, candidate{url: item.url, scoreVec: item.vec})
		}

		// Demote the domain, then queue its next-best candidate (if any).
		domain := netloc(item.url)
		domainMul[domain] /= 8
		if remaining := domainItems[domain]; len(remaining) > 0 {
			next := remaining[len(remaining)-1]
			domainItems[domain] = remaining[:len(remaining)-1]
			heap.Push(h, rerankItem{next.scoreVec[0], next.url, domainMul[domain], next.scoreVec})
		}
	}
	return result
}

// ---- misc ----

// readBodyLimited reads at most limit bytes from resp.Body and returns them
// as a string. Reading stops at the limit, at EOF, or at the first read
// error; whatever was read up to that point is returned. A non-positive
// limit yields "". The body is not closed.
func readBodyLimited(resp *http.Response, limit int64) string {
	if limit <= 0 {
		return ""
	}
	data := make([]byte, 0, limit)
	buf := make([]byte, 4096)
	for remaining := limit; remaining > 0; {
		// Never ask for more than is still allowed, so the limit is exact
		// rather than rounded up to the next 4 KiB chunk.
		chunk := buf
		if remaining < int64(len(chunk)) {
			chunk = chunk[:remaining]
		}
		n, err := resp.Body.Read(chunk)
		data = append(data, chunk[:n]...)
		remaining -= int64(n)
		if err != nil {
			break
		}
	}
	return string(data)
}

// truncate shortens s to at most n bytes without cutting a UTF-8 encoded
// character in half: when byte n falls inside a multi-byte rune, the cut
// moves back to the start of that rune, so the result may be up to three
// bytes shorter than n. A non-positive n yields "". If no rune boundary is
// found within the last four bytes (s is not valid UTF-8 there), s is cut at
// exactly n bytes.
func truncate(s string, n int) string {
	if n <= 0 {
		return ""
	}
	if len(s) <= n {
		return s
	}
	// A UTF-8 rune has at most three continuation bytes (10xxxxxx), so a
	// boundary must lie in n-3..n for valid input.
	for i := n; i >= 0 && i > n-4; i-- {
		if s[i]&0xC0 != 0x80 {
			return s[:i]
		}
	}
	return s[:n]
}

// unescapeURL decodes percent-escapes in u for display. If u contains a
// malformed escape it is returned unchanged.
func unescapeURL(u string) string {
	if plain, err := url.PathUnescape(u); err == nil {
		return plain
	}
	return u
}

// atoi leniently parses the leading run of ASCII digits in s as a
// non-negative decimal number. Parsing stops at the first non-digit, so
// "12abc" yields 12 and "", "-5" or "abc" yield 0. Values too large for an
// int saturate at the maximum int instead of silently wrapping around.
func atoi(s string) int {
	const maxInt = int(^uint(0) >> 1)

	n := 0
	for i := 0; i < len(s); i++ {
		c := s[i]
		if c < '0' || c > '9' {
			break
		}
		d := int(c - '0')
		// n*10+d would overflow exactly when n > (maxInt-d)/10.
		if n > (maxInt-d)/10 {
			return maxInt
		}
		n = n*10 + d
	}
	return n
}

// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}

// min returns the smaller of a and b.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}