Signed-off-by: 吴文峰 <kevin@lmve.net>
This commit is contained in:
@@ -0,0 +1,327 @@
|
||||
// Package harvester implements the index-writing server (port 5000).
|
||||
//
|
||||
// It receives (url, keywords) payloads from the crawler, accumulates them in
|
||||
// memory, then flushes to the persistent inverted index when the in-memory
|
||||
// row count exceeds the configured threshold.
|
||||
package harvester
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log"
|
||||
"math/rand"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"sese-engine/config"
|
||||
"sese-engine/info"
|
||||
"sese-engine/storage"
|
||||
)
|
||||
|
||||
// Server is the harvester HTTP server.
|
||||
type Server struct {
|
||||
db *storage.DB
|
||||
|
||||
// in-memory accumulator: keyword → [(weight, url)]
|
||||
mem map[string][]storage.IndexEntry
|
||||
memMu sync.Mutex
|
||||
|
||||
rowCount int64 // approximate total in-memory rows
|
||||
flushMu sync.Mutex // only one flush at a time
|
||||
|
||||
infoSvc *info.Service
|
||||
}
|
||||
|
||||
// New creates a harvester Server.
|
||||
func New(db *storage.DB, infoSvc *info.Service) *Server {
|
||||
return &Server{
|
||||
db: db,
|
||||
mem: make(map[string][]storage.IndexEntry),
|
||||
infoSvc: infoSvc,
|
||||
}
|
||||
}
|
||||
|
||||
// ingestKeyword is a single weighted keyword inside an ingestPayload.
type ingestKeyword struct {
	Word   string  `json:"word"`
	Weight float32 `json:"weight"`
}

// ingestPayload is the JSON body sent by the crawler: one URL together with
// the weighted keywords extracted from it.
type ingestPayload struct {
	URL      string          `json:"url"`
	Keywords []ingestKeyword `json:"keywords"`
}
|
||||
|
||||
// Handler returns the http.Handler for the harvester.
|
||||
func (s *Server) Handler() http.Handler {
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/l", s.handleIngest)
|
||||
return mux
|
||||
}
|
||||
|
||||
func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
var payload ingestPayload
|
||||
if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
|
||||
http.Error(w, "bad json: "+err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
// Sanitise URL
|
||||
payload.URL = strings.ReplaceAll(payload.URL, "\n", "")
|
||||
if payload.URL == "" {
|
||||
http.Error(w, "empty url", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
s.memMu.Lock()
|
||||
for _, kw := range payload.Keywords {
|
||||
key := kw.Word
|
||||
entries := s.mem[key]
|
||||
|
||||
// Threshold-based early discard
|
||||
if len(entries) > 15 {
|
||||
low := s.lowThreshold(key)
|
||||
if float64(kw.Weight) < low {
|
||||
continue
|
||||
}
|
||||
}
|
||||
s.mem[key] = append(entries, storage.IndexEntry{
|
||||
Weight: kw.Weight,
|
||||
URL: payload.URL,
|
||||
})
|
||||
atomic.AddInt64(&s.rowCount, 1)
|
||||
}
|
||||
s.memMu.Unlock()
|
||||
|
||||
// Check if we should flush
|
||||
if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold) {
|
||||
go s.flush()
|
||||
}
|
||||
|
||||
w.Write([]byte("ok"))
|
||||
}
|
||||
|
||||
// lowThreshold returns the minimum weight needed to enter the index for key.
|
||||
func (s *Server) lowThreshold(key string) float64 {
|
||||
existing, _ := s.db.GetIndex(key)
|
||||
if len(existing) < config.MaxURLsPerKey {
|
||||
return -1
|
||||
}
|
||||
// Find the config.MaxURLsPerKey-th highest weight
|
||||
weights := make([]float64, len(existing))
|
||||
for i, e := range existing {
|
||||
weights[i] = float64(e.Weight)
|
||||
}
|
||||
// Partial sort: find threshold at position MaxURLsPerKey-1
|
||||
return nthLargest(weights, config.MaxURLsPerKey-1) * 0.05
|
||||
}
|
||||
|
||||
// flush merges the in-memory accumulator into the persistent index.
|
||||
func (s *Server) flush() {
|
||||
if !s.flushMu.TryLock() {
|
||||
return // another flush is running
|
||||
}
|
||||
defer s.flushMu.Unlock()
|
||||
|
||||
s.memMu.Lock()
|
||||
snapshot := s.mem
|
||||
s.mem = make(map[string][]storage.IndexEntry)
|
||||
atomic.StoreInt64(&s.rowCount, 0)
|
||||
s.memMu.Unlock()
|
||||
|
||||
log.Printf("[harvester] flushing %d keys", len(snapshot))
|
||||
|
||||
items := make([]struct {
|
||||
key string
|
||||
entries []storage.IndexEntry
|
||||
}, 0, len(snapshot))
|
||||
for k, v := range snapshot {
|
||||
items = append(items, struct {
|
||||
key string
|
||||
entries []storage.IndexEntry
|
||||
}{k, v})
|
||||
}
|
||||
rand.Shuffle(len(items), func(i, j int) { items[i], items[j] = items[j], items[i] })
|
||||
|
||||
// Parallel merge
|
||||
type result struct {
|
||||
key string
|
||||
entries []storage.IndexEntry
|
||||
}
|
||||
results := make(chan result, len(items))
|
||||
sem := make(chan struct{}, 8)
|
||||
|
||||
for _, item := range items {
|
||||
sem <- struct{}{}
|
||||
go func(k string, newEntries []storage.IndexEntry) {
|
||||
defer func() { <-sem }()
|
||||
merged := s.mergeKey(k, newEntries)
|
||||
results <- result{k, merged}
|
||||
}(item.key, item.entries)
|
||||
}
|
||||
|
||||
// Collect
|
||||
batch := make(map[string][]storage.IndexEntry, len(items))
|
||||
for range items {
|
||||
r := <-results
|
||||
batch[r.key] = r.entries
|
||||
}
|
||||
|
||||
if err := s.db.BatchSetIndex(batch); err != nil {
|
||||
log.Printf("[harvester] flush write error: %v", err)
|
||||
}
|
||||
log.Printf("[harvester] flush done, %d keys written", len(batch))
|
||||
}
|
||||
|
||||
// mergeKey merges new entries with existing index entries for a key.
|
||||
func (s *Server) mergeKey(key string, newEntries []storage.IndexEntry) []storage.IndexEntry {
|
||||
existing, _ := s.db.GetIndex(key)
|
||||
|
||||
// Discard new key if too few URLs
|
||||
if len(existing) == 0 && len(newEntries) < config.MinURLsForNewKey {
|
||||
return nil
|
||||
}
|
||||
|
||||
merged := dedup(append(newEntries, existing...))
|
||||
|
||||
// Occasional URL normalisation dedup
|
||||
if rand.Float64() < 0.02 {
|
||||
merged = dedupNormalised(merged)
|
||||
}
|
||||
|
||||
// Trim if over limit
|
||||
if float64(len(merged)) > float64(config.MaxURLsPerKey)*1.1 || rand.Float64() < 0.02 {
|
||||
merged = trim(merged, s.infoSvc, config.MaxURLsPerKey, config.MaxSameDomainPerKey)
|
||||
}
|
||||
|
||||
return merged
|
||||
}
|
||||
|
||||
// ---- helpers ----
|
||||
|
||||
func dedup(entries []storage.IndexEntry) []storage.IndexEntry {
|
||||
seen := make(map[string]bool, len(entries))
|
||||
out := make([]storage.IndexEntry, 0, len(entries))
|
||||
for _, e := range entries {
|
||||
if seen[e.URL] {
|
||||
continue
|
||||
}
|
||||
seen[e.URL] = true
|
||||
out = append(out, e)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func dedupNormalised(entries []storage.IndexEntry) []storage.IndexEntry {
|
||||
// Sort by URL length descending, then dedup by normalised URL (strip scheme, trailing slash)
|
||||
sorted := make([]storage.IndexEntry, len(entries))
|
||||
copy(sorted, entries)
|
||||
for i := 0; i < len(sorted)-1; i++ {
|
||||
for j := i + 1; j < len(sorted); j++ {
|
||||
if len(sorted[j].URL) > len(sorted[i].URL) {
|
||||
sorted[i], sorted[j] = sorted[j], sorted[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
seen := make(map[string]bool)
|
||||
out := make([]storage.IndexEntry, 0, len(sorted))
|
||||
for _, e := range sorted {
|
||||
k := normaliseURL(e.URL)
|
||||
if seen[k] {
|
||||
continue
|
||||
}
|
||||
seen[k] = true
|
||||
out = append(out, e)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// normaliseURL strips one leading "https://" or "http://" (whichever matches
// first, checked in that order) and all trailing slashes from u.
func normaliseURL(u string) string {
	rest := u
	switch {
	case strings.HasPrefix(rest, "https://"):
		rest = rest[len("https://"):]
	case strings.HasPrefix(rest, "http://"):
		rest = rest[len("http://"):]
	}
	return strings.TrimRight(rest, "/")
}
|
||||
|
||||
// trim reduces entries to at most limit, keeping at most sameDomainLimit per domain.
|
||||
func trim(entries []storage.IndexEntry, infoSvc *info.Service, limit, sameDomainLimit int) []storage.IndexEntry {
|
||||
// Sort by effective score: weight * (1 + backlink)
|
||||
scored := make([]storage.IndexEntry, len(entries))
|
||||
copy(scored, entries)
|
||||
for i := 0; i < len(scored)-1; i++ {
|
||||
for j := i + 1; j < len(scored); j++ {
|
||||
si := float64(scored[i].Weight) * (1 + infoSvc.Prosper(scored[i].URL))
|
||||
sj := float64(scored[j].Weight) * (1 + infoSvc.Prosper(scored[j].URL))
|
||||
if sj > si {
|
||||
scored[i], scored[j] = scored[j], scored[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Per-domain cap
|
||||
domainCount := make(map[string]int)
|
||||
out := make([]storage.IndexEntry, 0, limit)
|
||||
for _, e := range scored {
|
||||
host := netloc(e.URL)
|
||||
if host == "" {
|
||||
host = e.URL
|
||||
}
|
||||
host = strings.ToLower(host)
|
||||
// Allow homepage URLs regardless of limit
|
||||
isHome := isHomepage(e.URL)
|
||||
if !isHome && domainCount[host] >= sameDomainLimit {
|
||||
continue
|
||||
}
|
||||
domainCount[host]++
|
||||
out = append(out, e)
|
||||
if len(out) >= limit {
|
||||
break
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// isHomepage reports whether u has no path component: after removing a
// leading "https://", then a leading "http://", and any trailing slashes,
// no "/" remains.
func isHomepage(u string) bool {
	rest := u
	for _, scheme := range []string{"https://", "http://"} {
		rest = strings.TrimPrefix(rest, scheme)
	}
	return !strings.Contains(strings.TrimRight(rest, "/"), "/")
}
|
||||
|
||||
// netloc returns the host[:port] part of an "http://" or "https://" URL:
// everything between the scheme separator and the next "/". It returns ""
// for any other input.
func netloc(rawURL string) string {
	for _, scheme := range [...]string{"http://", "https://"} {
		if !strings.HasPrefix(rawURL, scheme) {
			continue
		}
		rest := rawURL[len(scheme):]
		if cut := strings.IndexByte(rest, '/'); cut >= 0 {
			return rest[:cut]
		}
		return rest
	}
	return ""
}
|
||||
|
||||
// nthLargest returns the n-th largest value in a slice (0-indexed, so n == 0
// is the maximum). Duplicates count as separate positions.
//
// It returns 0 when n is out of range, i.e. negative or >= len(values).
// values itself is not modified; the selection works on a copy.
func nthLargest(values []float64, n int) float64 {
	if n < 0 || n >= len(values) {
		return 0
	}
	cp := make([]float64, len(values))
	copy(cp, values)

	// Partial selection sort, descending: after pass i, cp[i] holds the
	// (i+1)-th largest value, so the loop can stop once position n is settled.
	for i := 0; i <= n; i++ {
		maxIdx := i
		for j := i + 1; j < len(cp); j++ {
			if cp[j] > cp[maxIdx] {
				maxIdx = j
			}
		}
		cp[i], cp[maxIdx] = cp[maxIdx], cp[i]
	}
	return cp[n]
}
|
||||
|
||||
// ListenAndServe starts the harvester on the given address.
|
||||
func (s *Server) ListenAndServe(addr string) error {
|
||||
log.Printf("[harvester] listening on %s", addr)
|
||||
return http.ListenAndServe(addr, s.Handler())
|
||||
}
|
||||
Reference in New Issue
Block a user