Signed-off-by: 吴文峰 <kevin@lmve.net>

This commit is contained in:
2026-04-08 17:29:39 +08:00
commit 6c2f5ad978
15 changed files with 3651 additions and 0 deletions
+533
View File
@@ -0,0 +1,533 @@
// Package backlink computes backlink (prosperity) scores for all known domains,
// using a PageRank-like algorithm over the site-level link graph.
//
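// It runs every 48 hours (scheduled for 02:00 local time) and writes
// prosper.json into the Runner's storage path. Each aggregation pass also
// writes a "<pass>_cos.json" diagnostics file relative to the working directory.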
package backlink
import (
"encoding/json"
"log"
"math"
"math/rand"
"os"
"path/filepath"
"strings"
"time"
"sese-engine/storage"
)
// Runner runs the backlink calculation loop.
//
// It iterates the site records held in db and writes the resulting score
// file beneath storagePath.
type Runner struct {
// db is the site store that every pass walks with ForEachSite.
db *storage.DB
// storagePath is the directory that receives prosper.json.
storagePath string
}
// New returns a Runner that reads sites from db and writes its output under
// storagePath. Nothing is started; call Run or RunNow to compute scores.
func New(db *storage.DB, storagePath string) *Runner {
	runner := new(Runner)
	runner.db = db
	runner.storagePath = storagePath
	return runner
}
// Run blocks forever, performing one computation per scheduled slot.
//
// Every iteration takes 02:00 local time of the current day as the start
// time or, when that moment is not in the future, the instant 48 hours after
// it. It then sleeps until that time, runs compute and logs the outcome. A
// failed computation is logged and the loop carries on.
func (r *Runner) Run() {
	for {
		now := time.Now()
		year, month, day := now.Date()
		target := time.Date(year, month, day, 2, 0, 0, 0, now.Location())
		if target.Before(now) || target.Equal(now) {
			// Today's slot is gone; schedule the one two days out.
			target = target.Add(48 * time.Hour)
		}

		wait := target.Sub(now)
		log.Printf("[backlink] next run at %v (in %v)", target.Format(time.RFC3339), wait.Round(time.Minute))
		time.Sleep(wait)

		log.Printf("[backlink] starting computation at %v", time.Now().Format(time.RFC3339))
		err := r.compute()
		if err == nil {
			log.Printf("[backlink] done")
			continue
		}
		log.Printf("[backlink] error: %v", err)
	}
}
// RunNow performs a single computation cycle right away, without any
// scheduling (intended for tests and manual triggers). It returns whatever
// error the computation reports.
func (r *Runner) RunNow() error {
	if err := r.compute(); err != nil {
		return err
	}
	return nil
}
// ---- computation ----
// siteStats holds the per-group site counts gathered by collectStats. The
// counts feed computeMul (group-size damping) and buildServerTable.
type siteStats struct {
subdomainCount map[string]int // superDomain(host) → number of sites; groups smaller than 4 are pruned
templateCount map[string]int // HTMLStructure value → number of sites; groups smaller than 4 are pruned
sameIPCount map[string]int // ipPrefix(IPs) → number of sites; groups smaller than 4 are pruned
serverCount map[string]int // sorted, comma-joined ServerTypes → number of sites (not pruned)
}
// compute runs one complete scoring cycle and writes the merged scores to
// prosper.json inside r.storagePath.
//
// Three score maps are built, in this order:
//  1. sites whose HTTPSAvailable flag is set and true ("https_backlink"),
//  2. an "echo" pass over the sites that received a score in step 1,
//  3. all remaining sites ("http_backlink").
//
// For every key the merged value is step1 + echo + min(0.5*step1 + 0.1*step3,
// step3); only values above 0.16 are kept. The only error returned is a
// failure to write the output file.
func (r *Runner) compute() error {
	stats := r.collectStats()

	hasHTTPS := func(info *storage.SiteInfo) bool {
		return info.HTTPSAvailable != nil && *info.HTTPSAvailable
	}
	lacksHTTPS := func(info *storage.SiteInfo) bool {
		return !(info.HTTPSAvailable != nil && *info.HTTPSAvailable)
	}

	// The passes must run in this order: the echo pass consumes the HTTPS scores.
	httpsScores := r.aggregate(hasHTTPS, stats, "https_backlink")
	echoScores := r.aggregateWithScores(httpsScores, stats, "echo")
	httpScores := r.aggregate(lacksHTTPS, stats, "http_backlink")

	merged := make(map[string]float64)
	for key := range union(httpsScores, httpScores, echoScores) {
		secure, echo, plain := httpsScores[key], echoScores[key], httpScores[key]
		total := secure + echo + math.Min(secure*0.5+plain*0.1, plain)
		if total > 0.16 {
			merged[key] = total
		}
	}

	out := filepath.Join(r.storagePath, "prosper.json")
	err := writeJSON(out, merged)
	if err != nil {
		return err
	}
	log.Printf("[backlink] wrote %d entries to %s", len(merged), out)
	return nil
}
// collectStats walks every site once and counts how many sites share each
// super-domain, HTML structure, IP prefix and server-type combination.
//
// After the walk, sub-domain, template and IP groups with fewer than 4
// members are removed, so an absent key in those maps means the group is
// small. Server-type counts are returned unpruned. If the site iteration
// fails, the error is logged and the statistics gathered so far are returned.
func (r *Runner) collectStats() *siteStats {
	stats := &siteStats{
		subdomainCount: make(map[string]int),
		templateCount:  make(map[string]int),
		sameIPCount:    make(map[string]int),
		serverCount:    make(map[string]int),
	}

	err := r.db.ForEachSite(func(host string, info *storage.SiteInfo) error {
		stats.subdomainCount[superDomain(host)]++
		if info.HTMLStructure != "" {
			stats.templateCount[info.HTMLStructure]++
		}
		if len(info.IPs) > 0 {
			stats.sameIPCount[ipPrefix(info.IPs)]++
		}
		if len(info.ServerTypes) > 0 {
			// Sort before joining so the key does not depend on list order.
			key := strings.Join(sortedStrings(info.ServerTypes), ",")
			stats.serverCount[key]++
		}
		return nil
	})
	if err != nil {
		// Previously discarded; surface it so a partial scan is visible.
		log.Printf("[backlink] collecting stats: %v", err)
	}

	// Prune groups below the size threshold.
	const minGroupSize = 4
	prunable := []map[string]int{stats.subdomainCount, stats.templateCount, stats.sameIPCount}
	for _, counts := range prunable {
		for key, n := range counts {
			if n < minGroupSize {
				delete(counts, key)
			}
		}
	}
	return stats
}
// aggregate computes a backlink score map for sites matching the filter.
//
// For every site that passes filter (a nil filter accepts all), has a
// non-zero computeMul multiplier and at least one out-link:
//   - each out-link is expanded by decomposeURL into its host and path
//     prefixes, and every occurrence adds 1/max(len(OutLinks), 50);
//   - the per-target sum is capped at 0.15 and scaled by the multiplier;
//   - for targets already above 0.2, contributions coming from a crowded IP
//     prefix (one present in stats.sameIPCount) are tracked per target and
//     dropped once the tracked sum for that prefix exceeds 0.4;
//   - host-level targets already above 0.21 also accumulate the contribution
//     into a 64-slot vector indexed by the source's server type.
//
// Every 200000 contributing sites, low scores are pruned with a threshold
// that starts at 0.02 and grows by 10% each time; every 400000 sites the IP
// tracking map drops entries below 0.04. Finally the scores go through
// vectorFilter and entries at or below 0.16 are removed. desc labels log
// output and the diagnostics file written by vectorFilter. A site iteration
// error is logged and the scores gathered so far are still returned.
func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats, desc string) map[string]float64 {
	log.Printf("[backlink] aggregating: %s", desc)
	d := make(map[string]float64)
	ipSource := make(map[string]float64)
	// Server type index (top 63 most common → 1..63); anything else reads as slot 0.
	serverTable := buildServerTable(stats.serverCount)
	vectors := make(map[string][]float32)
	pruneThreshold := 0.02
	i := 0

	err := r.db.ForEachSite(func(host string, info *storage.SiteInfo) error {
		if filter != nil && !filter(info) {
			return nil
		}
		mul := computeMul(host, info, stats)
		if mul == 0 {
			return nil
		}
		n := len(info.OutLinks)
		if n == 0 {
			return nil
		}

		w := 1.0 / math.Max(float64(n), 50)
		xd := make(map[string]float64)
		for _, link := range info.OutLinks {
			for _, seg := range decomposeURL(link) {
				xd[seg] += w
			}
		}

		ipStr := ipPrefix(info.IPs)
		_, crowdedIP := stats.sameIPCount[ipStr]
		crowdedIP = crowdedIP && ipStr != ""

		// NOTE(review): serverTable is keyed by the sorted, comma-joined
		// ServerTypes list (see collectStats) while this lookup uses only the
		// first element, so multi-type sites may land in another slot or in
		// slot 0 — confirm whether that is intended.
		serverType := ""
		if len(info.ServerTypes) > 0 {
			serverType = info.ServerTypes[0]
		}
		serverID := serverTable[serverType]

		for seg, segW := range xd {
			fw := math.Min(segW, 0.15) * mul
			prev := d[seg]
			if prev > 0.2 && crowdedIP {
				key := seg + "-" + ipStr
				if ipSource[key] > 0.4 {
					// This IP prefix has used up its allowance for seg. The
					// check runs before the score is touched so the capped
					// contribution is really discarded.
					continue
				}
				ipSource[key] += fw
			}
			d[seg] = prev + fw
			if prev > 0.21 && !strings.Contains(seg, "/") && serverType != "" {
				if vectors[seg] == nil {
					vectors[seg] = make([]float32, 64)
				}
				vectors[seg][serverID] += float32(fw)
			}
		}

		i++
		if i%200000 == 0 {
			// Prune low-score entries; the bar rises as more sites are seen.
			for k, v := range d {
				if v < pruneThreshold {
					delete(d, k)
				}
			}
			pruneThreshold *= 1.1
		}
		if i%400000 == 0 {
			for k, v := range ipSource {
				if v < 0.04 {
					delete(ipSource, k)
				}
			}
		}
		return nil
	})
	if err != nil {
		// Previously discarded; surface it so a partial scan is visible.
		log.Printf("[backlink] %s: iterating sites: %v", desc, err)
	}

	// Vectorised cosine filtering
	d = vectorFilter(d, vectors, desc)
	// Final prune
	for k, v := range d {
		if v <= 0.16 {
			delete(d, k)
		}
	}
	log.Printf("[backlink] %s: %d entries", desc, len(d))
	return d
}
// aggregateWithScores does a second pass weighted by existing scores.
//
// Only sites whose host is a key of scores (and contains no "/") take part.
// Their computeMul multiplier is boosted to min(2, mul*log2(2+score)). As in
// aggregate, each out-link is expanded by decomposeURL, every occurrence adds
// 1/max(len(OutLinks), 50), and the per-target sum is capped at 0.15 before
// scaling. Host-level targets whose running total exceeds 0.21 also feed a
// 64-slot server-type vector. The result goes through vectorFilter and
// entries at or below 0.16 are removed. desc labels log output and the
// diagnostics file. A site iteration error is logged and the scores gathered
// so far are still returned.
func (r *Runner) aggregateWithScores(scores map[string]float64, stats *siteStats, desc string) map[string]float64 {
	log.Printf("[backlink] aggregating with scores: %s", desc)
	d := make(map[string]float64)
	// Server type index (top 63 most common → 1..63); anything else reads as slot 0.
	serverTable := buildServerTable(stats.serverCount)
	vectors := make(map[string][]float32)

	err := r.db.ForEachSite(func(host string, info *storage.SiteInfo) error {
		score, ok := scores[host]
		if !ok || strings.Contains(host, "/") {
			return nil
		}
		mul := computeMul(host, info, stats)
		if mul == 0 {
			return nil
		}
		trueMul := math.Min(2, mul*math.Log2(2+score))
		n := len(info.OutLinks)
		if n == 0 {
			return nil
		}

		w := 1.0 / math.Max(float64(n), 50)
		xd := make(map[string]float64)
		for _, link := range info.OutLinks {
			for _, seg := range decomposeURL(link) {
				xd[seg] += w
			}
		}

		serverType := ""
		if len(info.ServerTypes) > 0 {
			serverType = info.ServerTypes[0]
		}
		serverID := serverTable[serverType]

		for seg, segW := range xd {
			fw := math.Min(segW, 0.15) * trueMul
			d[seg] += fw
			// Unlike aggregate, the threshold is tested on the updated total.
			if d[seg] > 0.21 && !strings.Contains(seg, "/") && serverType != "" {
				if vectors[seg] == nil {
					vectors[seg] = make([]float32, 64)
				}
				vectors[seg][serverID] += float32(fw)
			}
		}
		return nil
	})
	if err != nil {
		// Previously discarded; surface it so a partial scan is visible.
		log.Printf("[backlink] %s: iterating sites: %v", desc, err)
	}

	d = vectorFilter(d, vectors, desc)
	for k, v := range d {
		if v <= 0.16 {
			delete(d, k)
		}
	}
	return d
}
// ---- vector cosine filtering ----
// vectorFilter damps scores of hosts whose server-type vector points away
// from the overall ("core") distribution.
//
// core is the element-wise sum of all vectors. For every key in d whose
// score exceeds 0.21 and whose host part (text before the first "/") has a
// non-zero vector, the score becomes max(score*(0.25+0.75*cos), 0.21), where
// cos is the cosine between that vector and core, clamped to at most 1.01.
// All other scores are copied unchanged. When core is the zero vector, d is
// returned as is and nothing is written.
//
// As a side effect the per-host cosines are written to desc+"_cos.json"
// (relative to the working directory) for diagnostics; a write failure is
// logged and does not affect the result.
func vectorFilter(d map[string]float64, vectors map[string][]float32, desc string) map[string]float64 {
	// Compute core vector (sum of all).
	core := make([]float64, 64)
	for _, vec := range vectors {
		for j, v := range vec {
			core[j] += float64(v)
		}
	}
	coreNorm := norm64(core)
	if coreNorm == 0 {
		return d
	}

	// One cosine per host, shared by the score update and the diagnostics
	// file, instead of recomputing it for every path key of the same host.
	cosMap := make(map[string]float64, len(vectors))
	for host, vec := range vectors {
		if vn := float64(norm32(vec)); vn > 0 {
			cosMap[host] = dot32_64(vec, core) / (vn * coreNorm)
		}
	}

	newD := make(map[string]float64, len(d))
	for k, v := range d {
		newD[k] = v
		if !(v > 0.21) {
			continue
		}
		base := k
		if slash := strings.IndexByte(k, '/'); slash >= 0 {
			base = k[:slash]
		}
		cos, ok := cosMap[base]
		if !ok {
			continue
		}
		if cos > 1.01 {
			cos = 1.01
		}
		newD[k] = math.Max(v*(0.25+cos*0.75), 0.21)
	}

	// Save cos map for diagnostics.
	if err := writeJSON(desc+"_cos.json", cosMap); err != nil {
		log.Printf("[backlink] %s: writing cosine diagnostics: %v", desc, err)
	}
	return newD
}
// ---- helpers ----
// computeMul returns the weight applied to everything a site links to; 0
// means the site is ignored.
//
// The weight is the product of two factors:
//   - age: 0.99 raised to the number of whole days since info.LastVisitTime
//     (a zero timestamp is replaced by 1640000000); sites older than 180 days
//     get 0;
//   - group size: 1 / (max(count, 5)/5)^0.6, where count is the larger of
//     the site's super-domain group size and 1.5x its HTML-template group
//     size, both taken from stats and treated as at least 1.
//
// Groups larger than 1000 are sampled: a site survives with probability
// 1000/count and is then weighted as if count were 1000. Sites without
// out-links get 0.
func computeMul(host string, info *storage.SiteInfo, stats *siteStats) float64 {
	if len(info.OutLinks) == 0 {
		return 0
	}

	lastVisit := info.LastVisitTime
	if lastVisit == 0 {
		lastVisit = 1640000000
	}
	const secondsPerDay = 3600 * 24
	ageDays := (time.Now().Unix() - lastVisit) / secondsPerDay
	if ageDays > 180 {
		return 0
	}
	ageFactor := math.Pow(0.99, float64(ageDays))

	siblings := max(stats.subdomainCount[superDomain(host)], 1)
	lookalikes := 1
	if info.HTMLStructure != "" {
		lookalikes = max(stats.templateCount[info.HTMLStructure], 1)
	}
	groupSize := max(siblings, int(float64(lookalikes)*1.5))

	const sampleCap = 1000
	if groupSize > sampleCap {
		// Keep roughly sampleCap members of an oversized group.
		if rand.Float64() > 1000.0/float64(groupSize) {
			return 0
		}
		groupSize = sampleCap
	}

	groupFactor := 1.0 / math.Pow(math.Max(float64(groupSize), 5)/5, 0.6)
	return ageFactor * groupFactor
}
// superDomain returns the last two dot-separated labels of host, or host
// itself when it contains fewer than two labels. It is purely textual: no
// public-suffix knowledge is applied, so "a.example.co.uk" yields "co.uk".
func superDomain(host string) string {
	last := strings.LastIndexByte(host, '.')
	if last < 0 {
		return host
	}
	// When there is no earlier dot, prev is -1 and the whole host is returned.
	prev := strings.LastIndexByte(host[:last], '.')
	return host[prev+1:]
}
// ipPrefix builds a grouping key from a site's IP list: the addresses are
// sorted, each one is cut at its last "." (dropping the final octet of a
// dotted address; strings without a dot are kept whole), and the pieces are
// joined with ",". An empty list yields "".
func ipPrefix(ips []string) string {
	if len(ips) == 0 {
		return ""
	}
	var key strings.Builder
	for i, ip := range sortedStrings(ips) {
		if i > 0 {
			key.WriteByte(',')
		}
		if dot := strings.LastIndex(ip, "."); dot > 0 {
			ip = ip[:dot]
		}
		key.WriteString(ip)
	}
	return key.String()
}
// decomposeURL turns a link into the list of targets it votes for: the host
// followed by every cumulative path prefix, e.g.
// "https://a.com/x/y" → ["a.com", "a.com/x", "a.com/x/y"].
//
// The URL is lower-cased, must start with "http://" or "https://", has "?"
// and "#" treated as path separators and trailing "/" removed. It returns
// nil for any other scheme, for an empty remainder, and when the remainder
// starts with "/" or "%".
func decomposeURL(rawURL string) []string {
	lowered := strings.ToLower(rawURL)
	var rest string
	switch {
	case strings.HasPrefix(lowered, "https://"):
		rest = lowered[len("https://"):]
	case strings.HasPrefix(lowered, "http://"):
		rest = lowered[len("http://"):]
	default:
		return nil
	}

	rest = strings.NewReplacer("?", "/", "#", "/").Replace(rest)
	rest = strings.TrimRight(rest, "/")
	if rest == "" || rest[0] == '/' || rest[0] == '%' {
		return nil
	}

	// Every prefix ending just before a "/" is a target, as is the full string.
	segs := make([]string, 0, strings.Count(rest, "/")+1)
	for i := 0; i < len(rest); i++ {
		if rest[i] == '/' {
			segs = append(segs, rest[:i])
		}
	}
	return append(segs, rest)
}
// buildServerTable assigns the IDs 1..63 to the most frequent keys of
// serverCount, with 1 being the most frequent. Keys outside the top 63 are
// absent from the result, so looking them up yields 0.
//
// Keys with equal counts are ordered by name, which makes both the IDs and
// the cut-off at 63 independent of map iteration order. Only the entries
// that end up in the table are selected, so the cost is linear in
// len(serverCount) rather than quadratic.
func buildServerTable(serverCount map[string]int) map[string]int {
	type entry struct {
		name  string
		count int
	}
	entries := make([]entry, 0, len(serverCount))
	for name, count := range serverCount {
		entries = append(entries, entry{name, count})
	}

	limit := 63
	if len(entries) < limit {
		limit = len(entries)
	}

	// ranksBefore reports whether a belongs ahead of b in the table.
	ranksBefore := func(a, b entry) bool {
		if a.count != b.count {
			return a.count > b.count
		}
		return a.name < b.name
	}

	table := make(map[string]int, limit)
	for i := 0; i < limit; i++ {
		// Move the best remaining entry into position i.
		best := i
		for j := i + 1; j < len(entries); j++ {
			if ranksBefore(entries[j], entries[best]) {
				best = j
			}
		}
		entries[i], entries[best] = entries[best], entries[i]
		table[entries[i].name] = i + 1
	}
	return table
}
// sortedStrings returns a new slice holding the elements of s in ascending
// order. s itself is left untouched; the result is never nil.
func sortedStrings(s []string) []string {
	out := make([]string, 0, len(s))
	for _, v := range s {
		// Insertion sort: shift larger elements right, then drop v in place.
		pos := len(out)
		out = append(out, v)
		for pos > 0 && out[pos-1] > v {
			out[pos] = out[pos-1]
			pos--
		}
		out[pos] = v
	}
	return out
}
// norm64 returns the Euclidean length of v (0 for an empty slice).
func norm64(v []float64) float64 {
	var sumSquares float64
	for i := range v {
		sumSquares += v[i] * v[i]
	}
	return math.Sqrt(sumSquares)
}
// norm32 returns the Euclidean length of v (0 for an empty slice). The sum
// of squares is accumulated in float32.
func norm32(v []float32) float32 {
	var sumSquares float32
	for i := range v {
		sumSquares += v[i] * v[i]
	}
	root := math.Sqrt(float64(sumSquares))
	return float32(root)
}
// dot32_64 returns the dot product of a and b, computed in float64. It
// iterates over a, so b must be at least as long as a; extra elements of b
// are ignored.
func dot32_64(a []float32, b []float64) float64 {
	var total float64
	for i, x := range a {
		total += float64(x) * b[i]
	}
	return total
}
// union returns the set of keys that occur in at least one of maps. Every
// key in the result maps to true; nil maps contribute nothing and the result
// is never nil.
func union(maps ...map[string]float64) map[string]bool {
	keys := map[string]bool{}
	for i := 0; i < len(maps); i++ {
		for key := range maps[i] {
			keys[key] = true
		}
	}
	return keys
}
// writeJSON marshals data as indented JSON and writes it to path with mode
// 0644, creating the parent directories (mode 0755) first.
//
// It returns the first error encountered: a failure to create the directory,
// to marshal data, or to write the file. The directory is created before
// marshalling, so it may exist even when an error is returned.
func writeJSON(path string, data interface{}) error {
	// Report the directory failure itself rather than letting it resurface
	// later as a less specific write error.
	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
		return err
	}
	b, err := json.MarshalIndent(data, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(path, b, 0o644)
}
// max returns the larger of two ints.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}