// Package backlink computes backlink (prosperity) scores for all known domains, // using a PageRank-like algorithm over the site-level link graph. // // It runs every 48 hours and writes savedata/prosper.json. package backlink import ( "encoding/json" "log" "math" "math/rand" "os" "path/filepath" "strings" "time" "sese-engine/storage" ) // Runner runs the backlink calculation loop. type Runner struct { db *storage.DB storagePath string } // New creates a Runner. func New(db *storage.DB, storagePath string) *Runner { return &Runner{db: db, storagePath: storagePath} } // Run loops forever, recalculating every 48 hours. func (r *Runner) Run() { for { // Sleep until next scheduled run (aligned to 2am) now := time.Now() target := time.Date(now.Year(), now.Month(), now.Day(), 2, 0, 0, 0, now.Location()) if !target.After(now) { target = target.Add(48 * time.Hour) } sleep := target.Sub(now) log.Printf("[backlink] next run at %v (in %v)", target.Format(time.RFC3339), sleep.Round(time.Minute)) time.Sleep(sleep) log.Printf("[backlink] starting computation at %v", time.Now().Format(time.RFC3339)) if err := r.compute(); err != nil { log.Printf("[backlink] error: %v", err) } else { log.Printf("[backlink] done") } } } // RunNow runs one computation cycle immediately (for testing / manual trigger). 
func (r *Runner) RunNow() error {
	return r.compute()
}

// ---- computation ----

// siteStats holds per-group site counts gathered in one pass over the
// database. Each map goes from a grouping key to the number of sites that
// share that key.
type siteStats struct {
	subdomainCount map[string]int // keyed by super-domain
	templateCount  map[string]int // keyed by HTML structure value
	sameIPCount    map[string]int // keyed by IP prefix
	serverCount    map[string]int // keyed by server type
}

// compute performs one full scoring cycle and writes the merged result to
// prosper.json in the runner's storage path.
//
// Three aggregation passes run in a fixed order: sites with HTTPS available,
// an "echo" pass seeded with the HTTPS scores, and finally the remaining
// sites. The per-key totals are combined and only totals above 0.16 are
// kept. The only error returned is the one from writing the result file.
func (r *Runner) compute() error {
	stats := r.collectStats()

	servesHTTPS := func(info *storage.SiteInfo) bool {
		return info.HTTPSAvailable != nil && *info.HTTPSAvailable
	}
	httpOnly := func(info *storage.SiteInfo) bool {
		return !servesHTTPS(info)
	}

	// The order of the passes matters: the echo pass consumes the HTTPS
	// scores, and every pass logs and writes diagnostics as it goes.
	httpsScores := r.aggregate(servesHTTPS, stats, "https_backlink")
	echoScores := r.aggregateWithScores(httpsScores, stats, "echo")
	httpScores := r.aggregate(httpOnly, stats, "http_backlink")

	merged := make(map[string]float64)
	for key := range union(httpsScores, httpScores, echoScores) {
		secure, plain := httpsScores[key], httpScores[key]
		// The HTTP-only contribution is bounded by the plain score itself.
		bounded := math.Min(secure*0.5+plain*0.1, plain)
		total := secure + echoScores[key] + bounded
		if total > 0.16 {
			merged[key] = total
		}
	}

	target := filepath.Join(r.storagePath, "prosper.json")
	err := writeJSON(target, merged)
	if err != nil {
		return err
	}
	log.Printf("[backlink] wrote %d entries to %s", len(merged), target)
	return nil
}

// collectStats builds statistics about the site graph.
func (r *Runner) collectStats() *siteStats { stats := &siteStats{ subdomainCount: make(map[string]int), templateCount: make(map[string]int), sameIPCount: make(map[string]int), serverCount: make(map[string]int), } _ = r.db.ForEachSite(func(host string, info *storage.SiteInfo) error { super := superDomain(host) stats.subdomainCount[super]++ if info.HTMLStructure != "" { stats.templateCount[info.HTMLStructure]++ } if len(info.IPs) > 0 { ipStr := ipPrefix(info.IPs) stats.sameIPCount[ipStr]++ } if len(info.ServerTypes) > 0 { s := strings.Join(sortedStrings(info.ServerTypes), ",") stats.serverCount[s]++ } return nil }) // Prune counts below threshold for k, v := range stats.subdomainCount { if v < 4 { delete(stats.subdomainCount, k) } } for k, v := range stats.templateCount { if v < 4 { delete(stats.templateCount, k) } } for k, v := range stats.sameIPCount { if v < 4 { delete(stats.sameIPCount, k) } } return stats } // aggregate computes a backlink score map for sites matching the filter. func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats, desc string) map[string]float64 { log.Printf("[backlink] aggregating: %s", desc) d := make(map[string]float64) ipSource := make(map[string]float64) // Build server type index (top 63 most common) serverTable := buildServerTable(stats.serverCount) type vectorEntry struct { domain string vec []float32 } vectors := make(map[string][]float32) pruneThreshold := 0.02 i := 0 _ = r.db.ForEachSite(func(host string, info *storage.SiteInfo) error { if filter != nil && !filter(info) { return nil } mul := computeMul(host, info, stats) if mul == 0 { return nil } n := len(info.OutLinks) if n == 0 { return nil } w := 1.0 / math.Max(float64(n), 50) xd := make(map[string]float64) for _, link := range info.OutLinks { for _, seg := range decomposeURL(link) { if _, exists := xd[seg]; !exists { xd[seg] = w } else { xd[seg] += w } } } ipStr := ipPrefix(info.IPs) serverType := "" if len(info.ServerTypes) > 0 { serverType = 
info.ServerTypes[0] } serverID := serverTable[serverType] for seg, segW := range xd { fw := math.Min(segW, 0.15) * mul prev := d[seg] d[seg] = prev + fw if prev > 0.2 { if _, sameIP := stats.sameIPCount[ipStr]; ipStr != "" && sameIP { key := seg + "-" + ipStr if ipSource[key] > 0.4 { continue } ipSource[key] += fw } } if prev > 0.21 && !strings.Contains(seg, "/") && serverType != "" { if vectors[seg] == nil { vectors[seg] = make([]float32, 64) } vectors[seg][serverID] += float32(fw) } } i++ if i%200000 == 0 { // Prune low-score entries for k, v := range d { if v < pruneThreshold { delete(d, k) } } pruneThreshold *= 1.1 } if i%400000 == 0 { for k, v := range ipSource { if v < 0.04 { delete(ipSource, k) } } } return nil }) // Vectorised cosine filtering d = vectorFilter(d, vectors, desc) // Prune for k, v := range d { if v <= 0.16 { delete(d, k) } } log.Printf("[backlink] %s: %d entries", desc, len(d)) return d } // aggregateWithScores does a second pass weighted by existing scores. func (r *Runner) aggregateWithScores(scores map[string]float64, stats *siteStats, desc string) map[string]float64 { log.Printf("[backlink] aggregating with scores: %s", desc) d := make(map[string]float64) serverTable := buildServerTable(stats.serverCount) vectors := make(map[string][]float32) _ = r.db.ForEachSite(func(host string, info *storage.SiteInfo) error { score, ok := scores[host] if !ok || strings.Contains(host, "/") { return nil } mul := computeMul(host, info, stats) if mul == 0 { return nil } trueMul := math.Min(2, mul*math.Log2(2+score)) n := len(info.OutLinks) if n == 0 { return nil } w := 1.0 / math.Max(float64(n), 50) xd := make(map[string]float64) for _, link := range info.OutLinks { for _, seg := range decomposeURL(link) { xd[seg] += w } } serverType := "" if len(info.ServerTypes) > 0 { serverType = info.ServerTypes[0] } serverID := serverTable[serverType] for seg, segW := range xd { fw := math.Min(segW, 0.15) * trueMul d[seg] += fw if d[seg] > 0.21 && 
!strings.Contains(seg, "/") && serverType != "" { if vectors[seg] == nil { vectors[seg] = make([]float32, 64) } vectors[seg][serverID] += float32(fw) } } return nil }) d = vectorFilter(d, vectors, desc) for k, v := range d { if v <= 0.16 { delete(d, k) } } return d } // ---- vector cosine filtering ---- func vectorFilter(d map[string]float64, vectors map[string][]float32, desc string) map[string]float64 { // Compute core vector (sum of all) core := make([]float64, 64) for _, vec := range vectors { for j, v := range vec { core[j] += float64(v) } } coreNorm := norm64(core) if coreNorm == 0 { return d } newD := make(map[string]float64, len(d)) for k, v := range d { baseK := strings.Split(k, "/")[0] if v > 0.21 && vectors[baseK] != nil { vec := vectors[baseK] vecNorm := float64(norm32(vec)) if vecNorm == 0 { newD[k] = v continue } cos := dot32_64(vec, core) / (vecNorm * coreNorm) if cos > 1.01 { cos = 1.01 } newV := math.Max(v*(0.25+cos*0.75), 0.21) newD[k] = newV } else { newD[k] = v } } // Save cos map for diagnostics cosMap := make(map[string]float64) for k, vec := range vectors { vn := float64(norm32(vec)) if vn > 0 { cosMap[k] = dot32_64(vec, core) / (vn * coreNorm) } } _ = writeJSON(desc+"_cos.json", cosMap) return newD } // ---- helpers ---- func computeMul(host string, info *storage.SiteInfo, stats *siteStats) float64 { if len(info.OutLinks) == 0 { return 0 } t := info.LastVisitTime if t == 0 { t = 1640000000 } days := (time.Now().Unix() - t) / (3600 * 24) if days > 180 { return 0 } timeMul := math.Pow(0.99, float64(days)) super := superDomain(host) subCount := max(stats.subdomainCount[super], 1) tplCount := 1 if info.HTMLStructure != "" { tplCount = max(stats.templateCount[info.HTMLStructure], 1) } count := max(subCount, int(float64(tplCount)*1.5)) if count > 1000 { if rand.Float64() > 1000.0/float64(count) { return 0 } count = 1000 } domainMul := 1.0 / math.Pow(math.Max(float64(count), 5)/5, 0.6) return timeMul * domainMul } func superDomain(host string) 
string { parts := strings.Split(host, ".") if len(parts) >= 2 { return strings.Join(parts[len(parts)-2:], ".") } return host } func ipPrefix(ips []string) string { if len(ips) == 0 { return "" } sorted := sortedStrings(ips) parts := make([]string, len(sorted)) for i, ip := range sorted { idx := strings.LastIndex(ip, ".") if idx > 0 { parts[i] = ip[:idx] } else { parts[i] = ip } } return strings.Join(parts, ",") } func decomposeURL(rawURL string) []string { u := strings.ToLower(rawURL) if strings.HasPrefix(u, "https://") { u = u[8:] } else if strings.HasPrefix(u, "http://") { u = u[7:] } else { return nil } u = strings.ReplaceAll(u, "?", "/") u = strings.ReplaceAll(u, "#", "/") u = strings.TrimRight(u, "/") if u == "" || u[0] == '/' || u[0] == '%' { return nil } parts := strings.Split(u, "/") var out []string current := parts[0] out = append(out, current) for _, p := range parts[1:] { current = current + "/" + p out = append(out, current) } return out } func buildServerTable(serverCount map[string]int) map[string]int { type kv struct { k string v int } var sorted []kv for k, v := range serverCount { sorted = append(sorted, kv{k, v}) } for i := 0; i < len(sorted)-1; i++ { for j := i + 1; j < len(sorted); j++ { if sorted[j].v > sorted[i].v { sorted[i], sorted[j] = sorted[j], sorted[i] } } } table := make(map[string]int, 63) limit := 63 if len(sorted) < limit { limit = len(sorted) } for i := 0; i < limit; i++ { table[sorted[i].k] = i + 1 } return table } func sortedStrings(s []string) []string { cp := make([]string, len(s)) copy(cp, s) for i := 0; i < len(cp)-1; i++ { for j := i + 1; j < len(cp); j++ { if cp[j] < cp[i] { cp[i], cp[j] = cp[j], cp[i] } } } return cp } func norm64(v []float64) float64 { s := 0.0 for _, x := range v { s += x * x } return math.Sqrt(s) } func norm32(v []float32) float32 { s := float32(0) for _, x := range v { s += x * x } return float32(math.Sqrt(float64(s))) } func dot32_64(a []float32, b []float64) float64 { s := 0.0 for i := range a { s 
+= float64(a[i]) * b[i] } return s } func union(maps ...map[string]float64) map[string]bool { out := make(map[string]bool) for _, m := range maps { for k := range m { out[k] = true } } return out } func writeJSON(path string, data interface{}) error { _ = os.MkdirAll(filepath.Dir(path), 0o755) b, err := json.MarshalIndent(data, "", " ") if err != nil { return err } return os.WriteFile(path, b, 0o644) } func max(a, b int) int { if a > b { return a } return b }