534 lines
12 KiB
Go
534 lines
12 KiB
Go
// Package backlink computes backlink (prosperity) scores for all known domains,
// using a PageRank-like algorithm over the site-level link graph.
//
// It runs every 48 hours (aligned to 02:00 local time) and writes prosper.json
// into the configured storage directory.
|
|
package backlink
|
|
|
|
import (
	"encoding/json"
	"log"
	"math"
	"math/rand"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"sese-engine/storage"
)
|
|
|
|
// Runner runs the backlink calculation loop.
|
|
type Runner struct {
|
|
db *storage.DB
|
|
storagePath string
|
|
}
|
|
|
|
// New creates a Runner.
|
|
func New(db *storage.DB, storagePath string) *Runner {
|
|
return &Runner{db: db, storagePath: storagePath}
|
|
}
|
|
|
|
// Run loops forever, recalculating every 48 hours.
|
|
func (r *Runner) Run() {
|
|
for {
|
|
// Sleep until next scheduled run (aligned to 2am)
|
|
now := time.Now()
|
|
target := time.Date(now.Year(), now.Month(), now.Day(), 2, 0, 0, 0, now.Location())
|
|
if !target.After(now) {
|
|
target = target.Add(48 * time.Hour)
|
|
}
|
|
sleep := target.Sub(now)
|
|
log.Printf("[backlink] next run at %v (in %v)", target.Format(time.RFC3339), sleep.Round(time.Minute))
|
|
time.Sleep(sleep)
|
|
|
|
log.Printf("[backlink] starting computation at %v", time.Now().Format(time.RFC3339))
|
|
if err := r.compute(); err != nil {
|
|
log.Printf("[backlink] error: %v", err)
|
|
} else {
|
|
log.Printf("[backlink] done")
|
|
}
|
|
}
|
|
}
|
|
|
|
// RunNow runs one computation cycle immediately (for testing / manual trigger).
|
|
func (r *Runner) RunNow() error {
|
|
return r.compute()
|
|
}
|
|
|
|
// ---- computation ----
|
|
|
|
// siteStats holds occurrence counters gathered over all sites. Each map
// counts how many sites share the given key.
type siteStats struct {
	// subdomainCount: super domain (see superDomain) → number of hosts.
	subdomainCount map[string]int
	// templateCount: HTML structure fingerprint → number of sites.
	templateCount map[string]int
	// sameIPCount: IP prefix string (see ipPrefix) → number of sites.
	sameIPCount map[string]int
	// serverCount: sorted, comma-joined server types → number of sites.
	serverCount map[string]int
}
|
|
|
|
func (r *Runner) compute() error {
|
|
stats := r.collectStats()
|
|
|
|
// Phase 1: HTTPS sites
|
|
d1 := r.aggregate(func(info *storage.SiteInfo) bool {
|
|
return info.HTTPSAvailable != nil && *info.HTTPSAvailable
|
|
}, stats, "https_backlink")
|
|
|
|
// Phase 1a: second pass (echo) using d1 scores
|
|
d1a := r.aggregateWithScores(d1, stats, "echo")
|
|
|
|
// Phase 2: HTTP-only sites
|
|
d2 := r.aggregate(func(info *storage.SiteInfo) bool {
|
|
return info.HTTPSAvailable == nil || !*info.HTTPSAvailable
|
|
}, stats, "http_backlink")
|
|
|
|
// Merge
|
|
merged := make(map[string]float64)
|
|
for k := range union(d1, d2, d1a) {
|
|
v := d1[k] + d1a[k] + math.Min(d1[k]*0.5+d2[k]*0.1, d2[k])
|
|
if v > 0.16 {
|
|
merged[k] = v
|
|
}
|
|
}
|
|
|
|
// Save
|
|
path := filepath.Join(r.storagePath, "prosper.json")
|
|
if err := writeJSON(path, merged); err != nil {
|
|
return err
|
|
}
|
|
log.Printf("[backlink] wrote %d entries to %s", len(merged), path)
|
|
return nil
|
|
}
|
|
|
|
// collectStats builds statistics about the site graph.
|
|
func (r *Runner) collectStats() *siteStats {
|
|
stats := &siteStats{
|
|
subdomainCount: make(map[string]int),
|
|
templateCount: make(map[string]int),
|
|
sameIPCount: make(map[string]int),
|
|
serverCount: make(map[string]int),
|
|
}
|
|
|
|
_ = r.db.ForEachSite(func(host string, info *storage.SiteInfo) error {
|
|
super := superDomain(host)
|
|
stats.subdomainCount[super]++
|
|
|
|
if info.HTMLStructure != "" {
|
|
stats.templateCount[info.HTMLStructure]++
|
|
}
|
|
if len(info.IPs) > 0 {
|
|
ipStr := ipPrefix(info.IPs)
|
|
stats.sameIPCount[ipStr]++
|
|
}
|
|
if len(info.ServerTypes) > 0 {
|
|
s := strings.Join(sortedStrings(info.ServerTypes), ",")
|
|
stats.serverCount[s]++
|
|
}
|
|
return nil
|
|
})
|
|
|
|
// Prune counts below threshold
|
|
for k, v := range stats.subdomainCount {
|
|
if v < 4 {
|
|
delete(stats.subdomainCount, k)
|
|
}
|
|
}
|
|
for k, v := range stats.templateCount {
|
|
if v < 4 {
|
|
delete(stats.templateCount, k)
|
|
}
|
|
}
|
|
for k, v := range stats.sameIPCount {
|
|
if v < 4 {
|
|
delete(stats.sameIPCount, k)
|
|
}
|
|
}
|
|
return stats
|
|
}
|
|
|
|
// aggregate computes a backlink score map for sites matching the filter.
|
|
func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats, desc string) map[string]float64 {
|
|
log.Printf("[backlink] aggregating: %s", desc)
|
|
d := make(map[string]float64)
|
|
ipSource := make(map[string]float64)
|
|
|
|
// Build server type index (top 63 most common)
|
|
serverTable := buildServerTable(stats.serverCount)
|
|
|
|
type vectorEntry struct {
|
|
domain string
|
|
vec []float32
|
|
}
|
|
vectors := make(map[string][]float32)
|
|
|
|
pruneThreshold := 0.02
|
|
i := 0
|
|
|
|
_ = r.db.ForEachSite(func(host string, info *storage.SiteInfo) error {
|
|
if filter != nil && !filter(info) {
|
|
return nil
|
|
}
|
|
mul := computeMul(host, info, stats)
|
|
if mul == 0 {
|
|
return nil
|
|
}
|
|
|
|
n := len(info.OutLinks)
|
|
if n == 0 {
|
|
return nil
|
|
}
|
|
|
|
w := 1.0 / math.Max(float64(n), 50)
|
|
xd := make(map[string]float64)
|
|
for _, link := range info.OutLinks {
|
|
for _, seg := range decomposeURL(link) {
|
|
if _, exists := xd[seg]; !exists {
|
|
xd[seg] = w
|
|
} else {
|
|
xd[seg] += w
|
|
}
|
|
}
|
|
}
|
|
|
|
ipStr := ipPrefix(info.IPs)
|
|
serverType := ""
|
|
if len(info.ServerTypes) > 0 {
|
|
serverType = info.ServerTypes[0]
|
|
}
|
|
serverID := serverTable[serverType]
|
|
|
|
for seg, segW := range xd {
|
|
fw := math.Min(segW, 0.15) * mul
|
|
prev := d[seg]
|
|
d[seg] = prev + fw
|
|
|
|
if prev > 0.2 {
|
|
if _, sameIP := stats.sameIPCount[ipStr]; ipStr != "" && sameIP {
|
|
key := seg + "-" + ipStr
|
|
if ipSource[key] > 0.4 {
|
|
continue
|
|
}
|
|
ipSource[key] += fw
|
|
}
|
|
}
|
|
|
|
if prev > 0.21 && !strings.Contains(seg, "/") && serverType != "" {
|
|
if vectors[seg] == nil {
|
|
vectors[seg] = make([]float32, 64)
|
|
}
|
|
vectors[seg][serverID] += float32(fw)
|
|
}
|
|
}
|
|
|
|
i++
|
|
if i%200000 == 0 {
|
|
// Prune low-score entries
|
|
for k, v := range d {
|
|
if v < pruneThreshold {
|
|
delete(d, k)
|
|
}
|
|
}
|
|
pruneThreshold *= 1.1
|
|
}
|
|
if i%400000 == 0 {
|
|
for k, v := range ipSource {
|
|
if v < 0.04 {
|
|
delete(ipSource, k)
|
|
}
|
|
}
|
|
}
|
|
return nil
|
|
})
|
|
|
|
// Vectorised cosine filtering
|
|
d = vectorFilter(d, vectors, desc)
|
|
|
|
// Prune
|
|
for k, v := range d {
|
|
if v <= 0.16 {
|
|
delete(d, k)
|
|
}
|
|
}
|
|
|
|
log.Printf("[backlink] %s: %d entries", desc, len(d))
|
|
return d
|
|
}
|
|
|
|
// aggregateWithScores does a second pass weighted by existing scores.
|
|
func (r *Runner) aggregateWithScores(scores map[string]float64, stats *siteStats, desc string) map[string]float64 {
|
|
log.Printf("[backlink] aggregating with scores: %s", desc)
|
|
d := make(map[string]float64)
|
|
serverTable := buildServerTable(stats.serverCount)
|
|
vectors := make(map[string][]float32)
|
|
|
|
_ = r.db.ForEachSite(func(host string, info *storage.SiteInfo) error {
|
|
score, ok := scores[host]
|
|
if !ok || strings.Contains(host, "/") {
|
|
return nil
|
|
}
|
|
mul := computeMul(host, info, stats)
|
|
if mul == 0 {
|
|
return nil
|
|
}
|
|
trueMul := math.Min(2, mul*math.Log2(2+score))
|
|
|
|
n := len(info.OutLinks)
|
|
if n == 0 {
|
|
return nil
|
|
}
|
|
w := 1.0 / math.Max(float64(n), 50)
|
|
xd := make(map[string]float64)
|
|
for _, link := range info.OutLinks {
|
|
for _, seg := range decomposeURL(link) {
|
|
xd[seg] += w
|
|
}
|
|
}
|
|
serverType := ""
|
|
if len(info.ServerTypes) > 0 {
|
|
serverType = info.ServerTypes[0]
|
|
}
|
|
serverID := serverTable[serverType]
|
|
|
|
for seg, segW := range xd {
|
|
fw := math.Min(segW, 0.15) * trueMul
|
|
d[seg] += fw
|
|
if d[seg] > 0.21 && !strings.Contains(seg, "/") && serverType != "" {
|
|
if vectors[seg] == nil {
|
|
vectors[seg] = make([]float32, 64)
|
|
}
|
|
vectors[seg][serverID] += float32(fw)
|
|
}
|
|
}
|
|
return nil
|
|
})
|
|
|
|
d = vectorFilter(d, vectors, desc)
|
|
for k, v := range d {
|
|
if v <= 0.16 {
|
|
delete(d, k)
|
|
}
|
|
}
|
|
return d
|
|
}
|
|
|
|
// ---- vector cosine filtering ----
|
|
|
|
func vectorFilter(d map[string]float64, vectors map[string][]float32, desc string) map[string]float64 {
|
|
// Compute core vector (sum of all)
|
|
core := make([]float64, 64)
|
|
for _, vec := range vectors {
|
|
for j, v := range vec {
|
|
core[j] += float64(v)
|
|
}
|
|
}
|
|
coreNorm := norm64(core)
|
|
if coreNorm == 0 {
|
|
return d
|
|
}
|
|
|
|
newD := make(map[string]float64, len(d))
|
|
for k, v := range d {
|
|
baseK := strings.Split(k, "/")[0]
|
|
if v > 0.21 && vectors[baseK] != nil {
|
|
vec := vectors[baseK]
|
|
vecNorm := float64(norm32(vec))
|
|
if vecNorm == 0 {
|
|
newD[k] = v
|
|
continue
|
|
}
|
|
cos := dot32_64(vec, core) / (vecNorm * coreNorm)
|
|
if cos > 1.01 {
|
|
cos = 1.01
|
|
}
|
|
newV := math.Max(v*(0.25+cos*0.75), 0.21)
|
|
newD[k] = newV
|
|
} else {
|
|
newD[k] = v
|
|
}
|
|
}
|
|
|
|
// Save cos map for diagnostics
|
|
cosMap := make(map[string]float64)
|
|
for k, vec := range vectors {
|
|
vn := float64(norm32(vec))
|
|
if vn > 0 {
|
|
cosMap[k] = dot32_64(vec, core) / (vn * coreNorm)
|
|
}
|
|
}
|
|
_ = writeJSON(desc+"_cos.json", cosMap)
|
|
|
|
return newD
|
|
}
|
|
|
|
// ---- helpers ----
|
|
|
|
func computeMul(host string, info *storage.SiteInfo, stats *siteStats) float64 {
|
|
if len(info.OutLinks) == 0 {
|
|
return 0
|
|
}
|
|
t := info.LastVisitTime
|
|
if t == 0 {
|
|
t = 1640000000
|
|
}
|
|
days := (time.Now().Unix() - t) / (3600 * 24)
|
|
if days > 180 {
|
|
return 0
|
|
}
|
|
timeMul := math.Pow(0.99, float64(days))
|
|
|
|
super := superDomain(host)
|
|
subCount := max(stats.subdomainCount[super], 1)
|
|
tplCount := 1
|
|
if info.HTMLStructure != "" {
|
|
tplCount = max(stats.templateCount[info.HTMLStructure], 1)
|
|
}
|
|
count := max(subCount, int(float64(tplCount)*1.5))
|
|
if count > 1000 {
|
|
if rand.Float64() > 1000.0/float64(count) {
|
|
return 0
|
|
}
|
|
count = 1000
|
|
}
|
|
domainMul := 1.0 / math.Pow(math.Max(float64(count), 5)/5, 0.6)
|
|
return timeMul * domainMul
|
|
}
|
|
|
|
// superDomain returns the last two dot-separated labels of host, e.g.
// "www.example.com" → "example.com". Hosts with fewer than two labels are
// returned unchanged. No public-suffix handling is done, so
// "www.example.co.uk" yields "co.uk".
func superDomain(host string) string {
	lastDot := strings.LastIndexByte(host, '.')
	if lastDot < 0 {
		return host
	}
	// The label before the last dot starts right after the previous dot,
	// or at the beginning of the host when there is none (index -1 + 1).
	prevDot := strings.LastIndexByte(host[:lastDot], '.')
	return host[prevDot+1:]
}
|
|
|
|
func ipPrefix(ips []string) string {
|
|
if len(ips) == 0 {
|
|
return ""
|
|
}
|
|
sorted := sortedStrings(ips)
|
|
parts := make([]string, len(sorted))
|
|
for i, ip := range sorted {
|
|
idx := strings.LastIndex(ip, ".")
|
|
if idx > 0 {
|
|
parts[i] = ip[:idx]
|
|
} else {
|
|
parts[i] = ip
|
|
}
|
|
}
|
|
return strings.Join(parts, ",")
|
|
}
|
|
|
|
// decomposeURL turns an absolute http(s) URL into the list of its host and
// every cumulative path prefix, all lower-cased and without the scheme:
//
//	"https://Example.com/a/b" → ["example.com", "example.com/a", "example.com/a/b"]
//
// "?" and "#" are treated as path separators and trailing slashes are
// dropped. nil is returned for other schemes, for an empty remainder, and
// when the remainder starts with "/" or "%".
func decomposeURL(rawURL string) []string {
	lowered := strings.ToLower(rawURL)

	var rest string
	switch {
	case strings.HasPrefix(lowered, "https://"):
		rest = lowered[len("https://"):]
	case strings.HasPrefix(lowered, "http://"):
		rest = lowered[len("http://"):]
	default:
		return nil
	}

	rest = strings.NewReplacer("?", "/", "#", "/").Replace(rest)
	rest = strings.TrimRight(rest, "/")
	if rest == "" {
		return nil
	}
	if first := rest[0]; first == '/' || first == '%' {
		return nil
	}

	// Every slash position ends one prefix; the full string is the last.
	var prefixes []string
	for i := 0; i < len(rest); i++ {
		if rest[i] == '/' {
			prefixes = append(prefixes, rest[:i])
		}
	}
	return append(prefixes, rest)
}
|
|
|
|
// buildServerTable assigns IDs 1..63 to the 63 most frequent keys of
// serverCount, most frequent first. Keys outside the top 63 are absent from
// the result, so a lookup yields 0 for them; callers use 0 as the slot for
// unknown server types.
//
// Ties in the count are broken by key in ascending order so that the
// assignment is deterministic despite Go's random map iteration order.
func buildServerTable(serverCount map[string]int) map[string]int {
	const maxIDs = 63

	names := make([]string, 0, len(serverCount))
	for name := range serverCount {
		names = append(names, name)
	}
	sort.Slice(names, func(i, j int) bool {
		ci, cj := serverCount[names[i]], serverCount[names[j]]
		if ci != cj {
			return ci > cj
		}
		return names[i] < names[j]
	})
	if len(names) > maxIDs {
		names = names[:maxIDs]
	}

	table := make(map[string]int, len(names))
	for i, name := range names {
		table[name] = i + 1
	}
	return table
}
|
|
|
|
// sortedStrings returns a copy of s sorted in ascending order. The input
// slice is left untouched; the result is always non-nil, even for nil input.
func sortedStrings(s []string) []string {
	cp := make([]string, len(s))
	copy(cp, s)
	sort.Strings(cp)
	return cp
}
|
|
|
|
// norm64 returns the Euclidean (L2) norm of v; an empty vector has norm 0.
func norm64(v []float64) float64 {
	var sumSq float64
	for i := 0; i < len(v); i++ {
		sumSq += v[i] * v[i]
	}
	return math.Sqrt(sumSq)
}
|
|
|
|
// norm32 returns the Euclidean (L2) norm of v. The squares are accumulated
// in float32; only the final square root is taken in float64.
func norm32(v []float32) float32 {
	var sumSq float32
	for i := 0; i < len(v); i++ {
		sumSq += v[i] * v[i]
	}
	root := math.Sqrt(float64(sumSq))
	return float32(root)
}
|
|
|
|
// dot32_64 returns the dot product of a and b, computed in float64.
// b must be at least as long as a; extra elements of b are ignored.
func dot32_64(a []float32, b []float64) float64 {
	var sum float64
	for i, x := range a {
		sum += float64(x) * b[i]
	}
	return sum
}
|
|
|
|
// union returns the set of keys that occur in at least one of the given
// maps. Nil maps are allowed and contribute nothing.
func union(maps ...map[string]float64) map[string]bool {
	keys := map[string]bool{}
	for i := range maps {
		for key := range maps[i] {
			keys[key] = true
		}
	}
	return keys
}
|
|
|
|
// writeJSON encodes data as indented JSON and writes it to path, creating
// the parent directory (mode 0755) if necessary. The file is written with
// mode 0644 and replaces any existing content.
//
// It returns the first error encountered while creating the directory,
// encoding data, or writing the file.
func writeJSON(path string, data interface{}) error {
	// Report a directory failure directly instead of letting it surface
	// later as a less specific write error.
	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
		return err
	}
	b, err := json.MarshalIndent(data, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(path, b, 0o644)
}
|
|
|
|
// max returns the larger of the two integers a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
|