694 lines
16 KiB
Go
694 lines
16 KiB
Go
// Package search implements the user-facing search HTTP server.
|
|
package search
|
|
|
|
import (
|
|
"container/heap"
|
|
"encoding/json"
|
|
"log"
|
|
"math"
|
|
"net/http"
|
|
"net/url"
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"sese-engine/analyzer"
|
|
"sese-engine/config"
|
|
"sese-engine/info"
|
|
"sese-engine/parser"
|
|
"sese-engine/storage"
|
|
)
|
|
|
|
// Server is the search HTTP server.
|
|
type Server struct {
|
|
db *storage.DB
|
|
infoSvc *info.Service
|
|
analyzer *analyzer.Analyzer
|
|
httpCli *http.Client // for online snippet fetching
|
|
}
|
|
|
|
// New creates a search Server.
|
|
func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
|
return &Server{
|
|
db: db,
|
|
infoSvc: infoSvc,
|
|
analyzer: a,
|
|
httpCli: &http.Client{
|
|
Timeout: time.Duration(config.OnlineSnippetTimeout) * time.Second,
|
|
},
|
|
}
|
|
}
|
|
|
|
// Handler returns the http.Handler.
|
|
func (s *Server) Handler() http.Handler {
|
|
mux := http.NewServeMux()
|
|
mux.HandleFunc("/search", s.handleSearch)
|
|
return mux
|
|
}
|
|
|
|
// ListenAndServe starts the search server.
|
|
func (s *Server) ListenAndServe(addr string) error {
|
|
log.Printf("[search] listening on %s", addr)
|
|
return http.ListenAndServe(addr, s.Handler())
|
|
}
|
|
|
|
// ---- search handler ----
|
|
|
|
type searchResponse struct {
|
|
Tokens []string `json:"tokens"`
|
|
Counts map[string]int `json:"counts"`
|
|
Results []searchResult `json:"results"`
|
|
Total int `json:"total"`
|
|
}
|
|
|
|
type searchResult struct {
|
|
Score float64 `json:"score"`
|
|
URL string `json:"url"`
|
|
Snippet *snippetInfo `json:"snippet,omitempty"`
|
|
Relevance map[string]float64 `json:"relevance"`
|
|
DomainCount int `json:"domain_count"`
|
|
Factors map[string]float64 `json:"factors,omitempty"`
|
|
}
|
|
|
|
// snippetInfo is the page summary attached to a search result.
type snippetInfo struct {
	// Title is the page title.
	Title string `json:"title"`
	// Description is the page description.
	Description string `json:"description"`
	// Text is an excerpt of the page text.
	Text string `json:"text"`
}
|
|
|
|
// siteRe recognises a whole query word of the form "site:<host>" and
// captures the non-empty host part.
var siteRe = regexp.MustCompile("^site:(.+)$")
|
|
|
|
func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
|
|
w.Header().Set("Access-Control-Allow-Origin", "*")
|
|
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
|
|
|
q := r.URL.Query().Get("q")
|
|
if q == "" {
|
|
if qh := r.URL.Query().Get("qh"); qh != "" {
|
|
decoded, err := url.PathUnescape(qh)
|
|
if err == nil {
|
|
q = decoded
|
|
}
|
|
}
|
|
}
|
|
|
|
// Parse slice param "0:10"
|
|
sliceStr := r.URL.Query().Get("slice")
|
|
sliceFrom, sliceTo := 0, 10
|
|
if sliceStr != "" {
|
|
parts := strings.SplitN(sliceStr, ":", 2)
|
|
if len(parts) == 2 {
|
|
a := atoi(parts[0])
|
|
b := atoi(parts[1])
|
|
if a >= 0 && b > a && b-a <= 20 {
|
|
sliceFrom, sliceTo = a, b
|
|
}
|
|
}
|
|
}
|
|
|
|
// Parse tokens and site filter
|
|
var tokens []string
|
|
var siteFilter string
|
|
for _, part := range strings.Fields(q) {
|
|
if m := siteRe.FindStringSubmatch(part); len(m) > 1 {
|
|
siteFilter = m[1]
|
|
} else {
|
|
segs := s.analyzer.Segment(part, false)
|
|
for _, t := range segs {
|
|
if !s.infoSvc.IsBlocked(t) {
|
|
tokens = append(tokens, t)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if len(tokens) > 20 {
|
|
tokens = tokens[:20]
|
|
}
|
|
|
|
results, total := s.query(tokens, sliceFrom, sliceTo, siteFilter)
|
|
|
|
// Count per keyword
|
|
counts := make(map[string]int, len(tokens))
|
|
for _, t := range tokens {
|
|
entries, _ := s.db.GetIndex(t)
|
|
counts[t] = len(entries)
|
|
}
|
|
|
|
resp := searchResponse{
|
|
Tokens: tokens,
|
|
Counts: counts,
|
|
Results: results,
|
|
Total: total,
|
|
}
|
|
json.NewEncoder(w).Encode(resp)
|
|
}
|
|
|
|
// query executes the multi-keyword search and returns ranked results.
|
|
func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]searchResult, int) {
|
|
if len(tokens) == 0 {
|
|
return nil, 0
|
|
}
|
|
|
|
// Load inverted index for each token
|
|
type tokenIndex struct {
|
|
token string
|
|
entries []storage.IndexEntry
|
|
defVal float64
|
|
}
|
|
tokenIndexes := make([]tokenIndex, 0, len(tokens))
|
|
for _, t := range tokens {
|
|
entries, _ := s.db.GetIndex(t)
|
|
defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(config.MaxURLsPerKey)
|
|
if len(entries) >= config.MaxURLsPerKey {
|
|
weights := make([]float64, len(entries))
|
|
for i, e := range entries {
|
|
weights[i] = float64(e.Weight)
|
|
}
|
|
sort.Sort(sort.Reverse(sort.Float64Slice(weights)))
|
|
defVal = math.Max(1.0/10000, weights[config.MaxURLsPerKey-1]/2)
|
|
}
|
|
tokenIndexes = append(tokenIndexes, tokenIndex{t, entries, defVal})
|
|
}
|
|
|
|
// Build URL → per-token weights map
|
|
urlWeights := make(map[string]map[string]float64)
|
|
for _, ti := range tokenIndexes {
|
|
for _, e := range ti.entries {
|
|
if urlWeights[e.URL] == nil {
|
|
urlWeights[e.URL] = make(map[string]float64)
|
|
}
|
|
urlWeights[e.URL][ti.token] = float64(e.Weight)
|
|
}
|
|
}
|
|
|
|
// Site filter
|
|
total := len(urlWeights)
|
|
if siteFilter != "" {
|
|
filtered := make(map[string]map[string]float64)
|
|
for u, vs := range urlWeights {
|
|
h := netloc(u)
|
|
if matchSite(h, siteFilter) {
|
|
filtered[u] = vs
|
|
}
|
|
}
|
|
urlWeights = filtered
|
|
total = len(urlWeights)
|
|
}
|
|
|
|
// Build default value map
|
|
defVals := make(map[string]float64, len(tokenIndexes))
|
|
for _, ti := range tokenIndexes {
|
|
defVals[ti.token] = ti.defVal
|
|
}
|
|
|
|
// Compute relevance + initial score for each URL
|
|
candidates := make([]candidate, 0, len(urlWeights))
|
|
for u, vs := range urlWeights {
|
|
rel := 1.0
|
|
for _, ti := range tokenIndexes {
|
|
vp := vs[ti.token]
|
|
if vp == 0 {
|
|
vp = defVals[ti.token]
|
|
}
|
|
if vp > 0.06 {
|
|
vp = math.Log((vp-0.06)*40+1)/40 + 0.06
|
|
}
|
|
rel *= vp
|
|
}
|
|
prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight
|
|
bad := badURL(u)
|
|
adjust := s.infoSvc.Adjust(netloc(u))
|
|
score := rel * prosper * (1 - bad) * adjust * 0.1
|
|
|
|
var vec [12]float64
|
|
vec[0] = score
|
|
vec[1] = rel
|
|
vec[2] = prosper
|
|
vec[3] = 1 - bad
|
|
vec[4] = 1 // language multiplier placeholder
|
|
vec[5] = 1 // repetition placeholder
|
|
vec[6] = adjust
|
|
vec[7] = 1 // time multiplier placeholder
|
|
vec[8] = 1 // consecutive keyword placeholder
|
|
vec[9] = 1 // keyword content placeholder
|
|
vec[10] = 1 // URL time placeholder
|
|
vec[11] = 0.1
|
|
|
|
candidates = append(candidates, candidate{u, rel, vec})
|
|
}
|
|
|
|
// Early relevance threshold
|
|
sort.Slice(candidates, func(i, j int) bool {
|
|
return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
|
|
})
|
|
|
|
// Apply site info factors to top 256
|
|
now := time.Now().Unix()
|
|
limit256 := 256
|
|
if len(candidates) < 256 {
|
|
limit256 = len(candidates)
|
|
}
|
|
|
|
var wg sync.WaitGroup
|
|
for i := 0; i < limit256; i++ {
|
|
wg.Add(1)
|
|
go func(idx int) {
|
|
defer wg.Done()
|
|
c := &candidates[idx]
|
|
h := netloc(c.url)
|
|
siteInfo, _ := s.db.GetSiteInfo(h)
|
|
langMul := languageMultiplier(siteInfo)
|
|
timeMul := timeMul(siteInfo, now)
|
|
urlTimeMul := urlTimeMul(s.db, c.url, now)
|
|
|
|
c.scoreVec[0] = c.scoreVec[0] * 10 * langMul * timeMul * urlTimeMul
|
|
c.scoreVec[4] = langMul
|
|
c.scoreVec[7] = timeMul
|
|
c.scoreVec[10] = urlTimeMul
|
|
}(i)
|
|
}
|
|
wg.Wait()
|
|
|
|
sort.Slice(candidates, func(i, j int) bool {
|
|
return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
|
|
})
|
|
|
|
// Apply consecutive-keyword and repetition bonuses to top 80
|
|
limit80 := 80
|
|
if len(candidates) < 80 {
|
|
limit80 = len(candidates)
|
|
}
|
|
|
|
titles := make([]string, limit80)
|
|
for i := 0; i < limit80; i++ {
|
|
if snippet, err := s.db.GetSnippet(candidates[i].url); err == nil {
|
|
titles[i] = snippet.Title
|
|
}
|
|
}
|
|
|
|
// Repetition penaliser
|
|
for i := 0; i < limit80; i++ {
|
|
h := repetitionSimilarity(titles, i)
|
|
consecutive := consecutiveCount(titles[i], tokens)
|
|
repMul := 1.0
|
|
if h > 0.5 {
|
|
repMul = 1 - (h - 0.5)
|
|
}
|
|
consMul := math.Pow(config.ConsecutiveKeyWeight, float64(consecutive))
|
|
candidates[i].scoreVec[0] *= repMul * consMul
|
|
candidates[i].scoreVec[5] = repMul
|
|
candidates[i].scoreVec[8] = consMul
|
|
}
|
|
|
|
sort.Slice(candidates, func(i, j int) bool {
|
|
return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
|
|
})
|
|
|
|
// Re-rank: interleave domains
|
|
reranked := rerank(candidates, from, to)
|
|
|
|
// Fetch snippets and build output
|
|
results := make([]searchResult, 0, len(reranked))
|
|
var snippetMu sync.Mutex
|
|
var snippetWg sync.WaitGroup
|
|
|
|
for _, c := range reranked {
|
|
snippetWg.Add(1)
|
|
go func(cand candidate) {
|
|
defer snippetWg.Done()
|
|
snip := s.getSnippet(cand.url)
|
|
r := searchResult{
|
|
Score: cand.scoreVec[0],
|
|
URL: unescapeURL(cand.url),
|
|
Snippet: snip,
|
|
Relevance: make(map[string]float64),
|
|
DomainCount: 0,
|
|
Factors: map[string]float64{
|
|
"relevance": cand.scoreVec[1],
|
|
"backlink": cand.scoreVec[2],
|
|
"url_quality": cand.scoreVec[3],
|
|
"language": cand.scoreVec[4],
|
|
"repetition": cand.scoreVec[5],
|
|
"adjust": cand.scoreVec[6],
|
|
"site_time": cand.scoreVec[7],
|
|
"consecutive": cand.scoreVec[8],
|
|
"url_time": cand.scoreVec[10],
|
|
},
|
|
}
|
|
for _, ti := range tokenIndexes {
|
|
r.Relevance[ti.token] = urlWeights[cand.url][ti.token]
|
|
}
|
|
snippetMu.Lock()
|
|
results = append(results, r)
|
|
snippetMu.Unlock()
|
|
}(c)
|
|
}
|
|
snippetWg.Wait()
|
|
|
|
// Preserve order (goroutines may reorder)
|
|
urlOrder := make(map[string]int)
|
|
for i, c := range reranked {
|
|
urlOrder[c.url] = i
|
|
}
|
|
sort.Slice(results, func(i, j int) bool {
|
|
return urlOrder[results[i].URL] < urlOrder[results[j].URL]
|
|
})
|
|
|
|
return results, total
|
|
}
|
|
|
|
// getSnippet fetches (or caches) a snippet for a URL.
|
|
func (s *Server) getSnippet(rawURL string) *snippetInfo {
|
|
// Try cache first
|
|
if entry, err := s.db.GetSnippet(rawURL); err == nil {
|
|
snip := buildSnippet(entry)
|
|
return snip
|
|
}
|
|
if !config.UseOnlineSnippet {
|
|
return nil
|
|
}
|
|
// Fetch online with a simple HTTP client (no robots.txt check for search snippets)
|
|
req, err := http.NewRequest("GET", rawURL, nil)
|
|
if err != nil {
|
|
return nil
|
|
}
|
|
req.Header.Set("User-Agent", config.SpiderName)
|
|
resp, err := s.httpCli.Do(req)
|
|
if err != nil || resp.StatusCode != 200 {
|
|
return nil
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
ct := resp.Header.Get("Content-Type")
|
|
if !strings.Contains(ct, "text/html") {
|
|
return nil
|
|
}
|
|
body := readBodyLimited(resp, 60000)
|
|
title, desc, text, _ := parser.ParseHTML(body, resp.Request.URL.String())
|
|
entry := &storage.SnippetEntry{
|
|
Title: title,
|
|
Description: truncate(desc, 256),
|
|
Text: truncate(text, 256),
|
|
Timestamp: time.Now().Unix(),
|
|
}
|
|
_ = s.db.SetSnippet(rawURL, entry)
|
|
return buildSnippet(entry)
|
|
}
|
|
|
|
func buildSnippet(entry *storage.SnippetEntry) *snippetInfo {
|
|
if entry == nil || (entry.Title == "" && entry.Description == "" && entry.Text == "") {
|
|
return nil
|
|
}
|
|
return &snippetInfo{
|
|
Title: entry.Title,
|
|
Description: entry.Description,
|
|
Text: entry.Text,
|
|
}
|
|
}
|
|
|
|
// ---- scoring helpers ----
|
|
|
|
func languageMultiplier(si *storage.SiteInfo) float64 {
|
|
if si == nil || len(si.Languages) == 0 {
|
|
return 1.0
|
|
}
|
|
total := 0.0
|
|
for _, v := range si.Languages {
|
|
total += v
|
|
}
|
|
chinese := si.Languages["zh"] / total
|
|
weird := (total - si.Languages["zh"] - si.Languages["en"] - si.Languages["ja"]) / total
|
|
return 1 + chinese*config.LanguageWeight - weird*config.LanguageWeight
|
|
}
|
|
|
|
func timeMul(si *storage.SiteInfo, now int64) float64 {
|
|
if si == nil {
|
|
return 1.0
|
|
}
|
|
t := si.LastVisitTime
|
|
if t == 0 {
|
|
t = 1648000000
|
|
}
|
|
days := (now - t) / (3600 * 24)
|
|
if days < 0 {
|
|
days = 0
|
|
}
|
|
if days > 180 {
|
|
days = 180
|
|
}
|
|
if days > 0 {
|
|
days--
|
|
}
|
|
return math.Pow(config.WeightDailyDecay, float64(days))
|
|
}
|
|
|
|
func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 {
|
|
entry, err := db.GetSnippet(rawURL)
|
|
if err != nil || entry == nil {
|
|
return 1.0
|
|
}
|
|
days := (now - entry.Timestamp) / (3600 * 24)
|
|
if days <= 30 {
|
|
return 1.0
|
|
}
|
|
return math.Pow((2+config.WeightDailyDecay)/3, float64(days))
|
|
}
|
|
|
|
// badURL returns a penalty in [0, 0.9] for unattractive URLs. The base
// penalty grows with every byte beyond 30; further penalties are folded
// into the remaining headroom when the URL contains ".htm" or ".php",
// has more than two slashes (ignoring trailing ones), or is shorter than
// five bytes or has ':' as its fifth byte (as in "http:").
func badURL(u string) float64 {
	hasLegacyExt := strings.Contains(u, ".htm") || strings.Contains(u, ".php")
	isDeep := strings.Count(strings.TrimRight(u, "/"), "/") > 2
	shortOrPlainHTTP := len(u) < 5 || u[4] == ':'

	penalty := math.Max(0, float64(len(u)-30)/200.0)
	if hasLegacyExt {
		penalty += (1 - penalty) * 0.3
	}
	if isDeep {
		penalty += (1 - penalty) * 0.1
	}
	if shortOrPlainHTTP {
		penalty += (1 - penalty) * 0.3
	}
	return math.Min(penalty, 0.9)
}
|
|
|
|
// netloc extracts the host part ("example.com[:port]") from an http:// or
// https:// URL. Any other input is returned unchanged.
func netloc(rawURL string) string {
	segments := strings.SplitN(rawURL, "/", 4)
	if len(segments) < 3 || segments[1] != "" {
		return rawURL
	}
	switch segments[0] {
	case "http:", "https:":
		return segments[2]
	}
	return rawURL
}
|
|
|
|
// matchSite reports whether host is pattern itself or a subdomain of it
// (that is, host ends in "."+pattern).
func matchSite(host, pattern string) bool {
	return host == pattern || strings.HasSuffix(host, "."+pattern)
}
|
|
|
|
// consecutiveCount returns how many adjacent token pairs (tokens[i]
// directly followed by tokens[i+1], with nothing in between) occur in title.
func consecutiveCount(title string, tokens []string) int {
	hits := 0
	for i := 1; i < len(tokens); i++ {
		pair := tokens[i-1] + tokens[i]
		if strings.Contains(title, pair) {
			hits++
		}
	}
	return hits
}
|
|
|
|
// repetitionSimilarity returns the highest similarity between titles[idx]
// and any earlier non-empty title, where similarity is
// 1 - editDistance/max(length) with both distance and length measured in
// runes. The result is 0 for the first title, an empty title, or an idx
// outside the slice.
//
// Lengths are counted in runes, not bytes, so they match the rune-based
// edit distance. Normalising by byte length would make any two multi-byte
// (e.g. CJK) titles look at least two-thirds similar.
func repetitionSimilarity(titles []string, idx int) float64 {
	if idx <= 0 || idx >= len(titles) {
		return 0
	}
	current := []rune(titles[idx])
	if len(current) == 0 {
		return 0
	}

	best := 0.0
	for _, earlier := range titles[:idx] {
		if earlier == "" {
			continue
		}
		prev := []rune(earlier)
		longest := len(current)
		if len(prev) > longest {
			longest = len(prev)
		}
		sim := 1 - float64(levenshteinRunes(current, prev))/float64(longest)
		if sim > best {
			best = sim
		}
	}
	return best
}

// levenshtein returns the edit distance between a and b, counted in runes
// (insertions, deletions and substitutions each cost 1).
func levenshtein(a, b string) int {
	return levenshteinRunes([]rune(a), []rune(b))
}

// levenshteinRunes computes the edit distance between two rune slices
// using two rolling rows of the dynamic-programming table.
func levenshteinRunes(ra, rb []rune) int {
	la, lb := len(ra), len(rb)
	if la == 0 {
		return lb
	}
	if lb == 0 {
		return la
	}

	prev := make([]int, lb+1)
	curr := make([]int, lb+1)
	for j := range prev {
		prev[j] = j
	}
	for i := 1; i <= la; i++ {
		curr[0] = i
		for j := 1; j <= lb; j++ {
			cost := 1
			if ra[i-1] == rb[j-1] {
				cost = 0
			}
			curr[j] = min3(curr[j-1]+1, prev[j]+1, prev[j-1]+cost)
		}
		// The finished row becomes the "previous" row for the next rune.
		prev, curr = curr, prev
	}
	return prev[lb]
}

// min3 returns the smallest of a, b and c.
func min3(a, b, c int) int {
	if a < b {
		if a < c {
			return a
		}
		return c
	}
	if b < c {
		return b
	}
	return c
}
|
|
|
|
// rerankItem is one entry of the heap used while interleaving hosts:
// a candidate's score and score vector together with the multiplier of its
// host at the moment it was pushed.
type rerankItem struct {
	score     float64
	url       string
	domainMul float64
	vec       [12]float64
}

// domainHeap implements heap.Interface over rerankItems. It is a max-heap
// on score*domainMul, so heap.Pop yields the entry with the highest
// effective score.
type domainHeap []rerankItem

// Len returns the number of items in the heap.
func (h domainHeap) Len() int {
	return len(h)
}

// Less orders items by descending effective score (score*domainMul).
func (h domainHeap) Less(i, j int) bool {
	left := h[i].score * h[i].domainMul
	right := h[j].score * h[j].domainMul
	return left > right
}

// Swap exchanges the items at positions i and j.
func (h domainHeap) Swap(i, j int) {
	h[i], h[j] = h[j], h[i]
}

// Push appends x, which must be a rerankItem; intended to be called via
// heap.Push.
func (h *domainHeap) Push(x interface{}) {
	*h = append(*h, x.(rerankItem))
}

// Pop removes and returns the last item; intended to be called via
// heap.Pop.
func (h *domainHeap) Pop() interface{} {
	items := *h
	last := len(items) - 1
	popped := items[last]
	*h = items[:last]
	return popped
}
|
|
|
|
// candidate is a URL under consideration during ranking.
type candidate struct {
	// url is the raw (still escaped) URL from the index.
	url string
	// relevance is the product of the per-token weights for this URL.
	relevance float64
	// scoreVec[0] is the running score; the remaining slots hold the
	// individual factors that were multiplied into it.
	scoreVec [12]float64
}
|
|
|
|
func rerank(candidates []candidate, from, to int) []candidate {
|
|
domainItems := make(map[string][]candidate)
|
|
for _, c := range candidates {
|
|
h := netloc(c.url)
|
|
domainItems[h] = append(domainItems[h], c)
|
|
}
|
|
|
|
h := &domainHeap{}
|
|
heap.Init(h)
|
|
domainMul := make(map[string]float64)
|
|
|
|
for domain, items := range domainItems {
|
|
domainMul[domain] = 1.0
|
|
// Sort items within domain
|
|
sort.Slice(items, func(i, j int) bool {
|
|
return items[i].scoreVec[0] < items[j].scoreVec[0]
|
|
})
|
|
top := items[len(items)-1]
|
|
domainItems[domain] = items[:len(items)-1]
|
|
heap.Push(h, rerankItem{top.scoreVec[0], top.url, domainMul[domain], top.scoreVec})
|
|
}
|
|
|
|
var result []candidate
|
|
for h.Len() > 0 && len(result) < to {
|
|
item := heap.Pop(h).(rerankItem)
|
|
if len(result) >= from {
|
|
result = append(result, candidate{url: item.url, scoreVec: item.vec})
|
|
}
|
|
domain := netloc(item.url)
|
|
domainMul[domain] /= 8
|
|
remaining := domainItems[domain]
|
|
if len(remaining) > 0 {
|
|
next := remaining[len(remaining)-1]
|
|
domainItems[domain] = remaining[:len(remaining)-1]
|
|
heap.Push(h, rerankItem{next.scoreVec[0], next.url, domainMul[domain], next.scoreVec})
|
|
}
|
|
}
|
|
return result
|
|
}
|
|
|
|
// ---- misc ----
|
|
|
|
// readBodyLimited reads resp.Body until it ends, a read error occurs, or
// limit bytes have been collected, and returns what was read as a string.
// The result never exceeds limit bytes. A nil response or body, or a
// non-positive limit, yields "". The body is not closed here.
func readBodyLimited(resp *http.Response, limit int64) string {
	if resp == nil || resp.Body == nil || limit <= 0 {
		return ""
	}

	data := make([]byte, 0, limit)
	buf := make([]byte, 4096)
	for int64(len(data)) < limit {
		// Shrink the final read so the total cannot overshoot limit.
		chunk := buf
		if remaining := limit - int64(len(data)); remaining < int64(len(chunk)) {
			chunk = chunk[:remaining]
		}
		n, err := resp.Body.Read(chunk)
		data = append(data, chunk[:n]...)
		if err != nil {
			// io.EOF and real errors are treated alike: return what we have.
			break
		}
	}
	return string(data)
}
|
|
|
|
// truncate returns s shortened to at most n bytes without cutting a UTF-8
// encoded character in half: when byte n falls inside a multi-byte
// character, the cut moves back to the start of that character. Strings of
// n bytes or fewer are returned unchanged; a non-positive n yields "".
func truncate(s string, n int) string {
	if n <= 0 {
		return ""
	}
	if len(s) <= n {
		return s
	}
	// Bytes of the form 10xxxxxx are UTF-8 continuation bytes; back up
	// until the cut lands on the first byte of a character.
	for n > 0 && s[n]&0xC0 == 0x80 {
		n--
	}
	return s[:n]
}
|
|
|
|
// unescapeURL percent-decodes u with url.PathUnescape. When u contains an
// invalid escape sequence it is returned unchanged.
func unescapeURL(u string) string {
	if plain, err := url.PathUnescape(u); err == nil {
		return plain
	}
	return u
}
|
|
|
|
// atoi parses the leading run of ASCII digits in s as a decimal number.
// Parsing stops at the first non-digit, so "12abc" gives 12, while "",
// "-5" and "abc" all give 0. No overflow checking is performed.
func atoi(s string) int {
	value := 0
	for _, r := range s {
		digit := int(r - '0')
		if digit < 0 || digit > 9 {
			break
		}
		value = value*10 + digit
	}
	return value
}
|
|
|
|
// max returns the larger of the two ints a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
|
|
|
|
// min returns the smaller of the two ints a and b.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|