Signed-off-by: 吴文峰 <kevin@lmve.net>
This commit is contained in:
@@ -0,0 +1,533 @@
|
||||
// Package backlink computes backlink (prosperity) scores for all known domains,
|
||||
// using a PageRank-like algorithm over the site-level link graph.
|
||||
//
|
||||
// It runs every 48 hours and writes savedata/prosper.json.
|
||||
package backlink
|
||||
|
||||
import (
	"encoding/json"
	"log"
	"math"
	"math/rand"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"time"

	"sese-engine/storage"
)
|
||||
|
||||
// Runner runs the backlink calculation loop.
|
||||
type Runner struct {
|
||||
db *storage.DB
|
||||
storagePath string
|
||||
}
|
||||
|
||||
// New creates a Runner.
|
||||
func New(db *storage.DB, storagePath string) *Runner {
|
||||
return &Runner{db: db, storagePath: storagePath}
|
||||
}
|
||||
|
||||
// Run loops forever, recalculating every 48 hours.
|
||||
func (r *Runner) Run() {
|
||||
for {
|
||||
// Sleep until next scheduled run (aligned to 2am)
|
||||
now := time.Now()
|
||||
target := time.Date(now.Year(), now.Month(), now.Day(), 2, 0, 0, 0, now.Location())
|
||||
if !target.After(now) {
|
||||
target = target.Add(48 * time.Hour)
|
||||
}
|
||||
sleep := target.Sub(now)
|
||||
log.Printf("[backlink] next run at %v (in %v)", target.Format(time.RFC3339), sleep.Round(time.Minute))
|
||||
time.Sleep(sleep)
|
||||
|
||||
log.Printf("[backlink] starting computation at %v", time.Now().Format(time.RFC3339))
|
||||
if err := r.compute(); err != nil {
|
||||
log.Printf("[backlink] error: %v", err)
|
||||
} else {
|
||||
log.Printf("[backlink] done")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// RunNow runs one computation cycle immediately (for testing / manual trigger).
|
||||
func (r *Runner) RunNow() error {
|
||||
return r.compute()
|
||||
}
|
||||
|
||||
// ---- computation ----
|
||||
|
||||
// siteStats holds per-group site counts gathered in one pass over the site
// database. Each map goes from a grouping key to the number of sites that
// share that key.
type siteStats struct {
	// subdomainCount is keyed by super domain.
	subdomainCount map[string]int

	// templateCount is keyed by HTML structure fingerprint.
	templateCount map[string]int

	// sameIPCount is keyed by IP prefix.
	sameIPCount map[string]int

	// serverCount is keyed by server type.
	serverCount map[string]int
}
|
||||
|
||||
func (r *Runner) compute() error {
|
||||
stats := r.collectStats()
|
||||
|
||||
// Phase 1: HTTPS sites
|
||||
d1 := r.aggregate(func(info *storage.SiteInfo) bool {
|
||||
return info.HTTPSAvailable != nil && *info.HTTPSAvailable
|
||||
}, stats, "https_backlink")
|
||||
|
||||
// Phase 1a: second pass (echo) using d1 scores
|
||||
d1a := r.aggregateWithScores(d1, stats, "echo")
|
||||
|
||||
// Phase 2: HTTP-only sites
|
||||
d2 := r.aggregate(func(info *storage.SiteInfo) bool {
|
||||
return info.HTTPSAvailable == nil || !*info.HTTPSAvailable
|
||||
}, stats, "http_backlink")
|
||||
|
||||
// Merge
|
||||
merged := make(map[string]float64)
|
||||
for k := range union(d1, d2, d1a) {
|
||||
v := d1[k] + d1a[k] + math.Min(d1[k]*0.5+d2[k]*0.1, d2[k])
|
||||
if v > 0.16 {
|
||||
merged[k] = v
|
||||
}
|
||||
}
|
||||
|
||||
// Save
|
||||
path := filepath.Join(r.storagePath, "prosper.json")
|
||||
if err := writeJSON(path, merged); err != nil {
|
||||
return err
|
||||
}
|
||||
log.Printf("[backlink] wrote %d entries to %s", len(merged), path)
|
||||
return nil
|
||||
}
|
||||
|
||||
// collectStats builds statistics about the site graph.
|
||||
func (r *Runner) collectStats() *siteStats {
|
||||
stats := &siteStats{
|
||||
subdomainCount: make(map[string]int),
|
||||
templateCount: make(map[string]int),
|
||||
sameIPCount: make(map[string]int),
|
||||
serverCount: make(map[string]int),
|
||||
}
|
||||
|
||||
_ = r.db.ForEachSite(func(host string, info *storage.SiteInfo) error {
|
||||
super := superDomain(host)
|
||||
stats.subdomainCount[super]++
|
||||
|
||||
if info.HTMLStructure != "" {
|
||||
stats.templateCount[info.HTMLStructure]++
|
||||
}
|
||||
if len(info.IPs) > 0 {
|
||||
ipStr := ipPrefix(info.IPs)
|
||||
stats.sameIPCount[ipStr]++
|
||||
}
|
||||
if len(info.ServerTypes) > 0 {
|
||||
s := strings.Join(sortedStrings(info.ServerTypes), ",")
|
||||
stats.serverCount[s]++
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
// Prune counts below threshold
|
||||
for k, v := range stats.subdomainCount {
|
||||
if v < 4 {
|
||||
delete(stats.subdomainCount, k)
|
||||
}
|
||||
}
|
||||
for k, v := range stats.templateCount {
|
||||
if v < 4 {
|
||||
delete(stats.templateCount, k)
|
||||
}
|
||||
}
|
||||
for k, v := range stats.sameIPCount {
|
||||
if v < 4 {
|
||||
delete(stats.sameIPCount, k)
|
||||
}
|
||||
}
|
||||
return stats
|
||||
}
|
||||
|
||||
// aggregate computes a backlink score map for sites matching the filter.
|
||||
func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats, desc string) map[string]float64 {
|
||||
log.Printf("[backlink] aggregating: %s", desc)
|
||||
d := make(map[string]float64)
|
||||
ipSource := make(map[string]float64)
|
||||
|
||||
// Build server type index (top 63 most common)
|
||||
serverTable := buildServerTable(stats.serverCount)
|
||||
|
||||
type vectorEntry struct {
|
||||
domain string
|
||||
vec []float32
|
||||
}
|
||||
vectors := make(map[string][]float32)
|
||||
|
||||
pruneThreshold := 0.02
|
||||
i := 0
|
||||
|
||||
_ = r.db.ForEachSite(func(host string, info *storage.SiteInfo) error {
|
||||
if filter != nil && !filter(info) {
|
||||
return nil
|
||||
}
|
||||
mul := computeMul(host, info, stats)
|
||||
if mul == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
n := len(info.OutLinks)
|
||||
if n == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
w := 1.0 / math.Max(float64(n), 50)
|
||||
xd := make(map[string]float64)
|
||||
for _, link := range info.OutLinks {
|
||||
for _, seg := range decomposeURL(link) {
|
||||
if _, exists := xd[seg]; !exists {
|
||||
xd[seg] = w
|
||||
} else {
|
||||
xd[seg] += w
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ipStr := ipPrefix(info.IPs)
|
||||
serverType := ""
|
||||
if len(info.ServerTypes) > 0 {
|
||||
serverType = info.ServerTypes[0]
|
||||
}
|
||||
serverID := serverTable[serverType]
|
||||
|
||||
for seg, segW := range xd {
|
||||
fw := math.Min(segW, 0.15) * mul
|
||||
prev := d[seg]
|
||||
d[seg] = prev + fw
|
||||
|
||||
if prev > 0.2 {
|
||||
if _, sameIP := stats.sameIPCount[ipStr]; ipStr != "" && sameIP {
|
||||
key := seg + "-" + ipStr
|
||||
if ipSource[key] > 0.4 {
|
||||
continue
|
||||
}
|
||||
ipSource[key] += fw
|
||||
}
|
||||
}
|
||||
|
||||
if prev > 0.21 && !strings.Contains(seg, "/") && serverType != "" {
|
||||
if vectors[seg] == nil {
|
||||
vectors[seg] = make([]float32, 64)
|
||||
}
|
||||
vectors[seg][serverID] += float32(fw)
|
||||
}
|
||||
}
|
||||
|
||||
i++
|
||||
if i%200000 == 0 {
|
||||
// Prune low-score entries
|
||||
for k, v := range d {
|
||||
if v < pruneThreshold {
|
||||
delete(d, k)
|
||||
}
|
||||
}
|
||||
pruneThreshold *= 1.1
|
||||
}
|
||||
if i%400000 == 0 {
|
||||
for k, v := range ipSource {
|
||||
if v < 0.04 {
|
||||
delete(ipSource, k)
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
// Vectorised cosine filtering
|
||||
d = vectorFilter(d, vectors, desc)
|
||||
|
||||
// Prune
|
||||
for k, v := range d {
|
||||
if v <= 0.16 {
|
||||
delete(d, k)
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("[backlink] %s: %d entries", desc, len(d))
|
||||
return d
|
||||
}
|
||||
|
||||
// aggregateWithScores does a second pass weighted by existing scores.
|
||||
func (r *Runner) aggregateWithScores(scores map[string]float64, stats *siteStats, desc string) map[string]float64 {
|
||||
log.Printf("[backlink] aggregating with scores: %s", desc)
|
||||
d := make(map[string]float64)
|
||||
serverTable := buildServerTable(stats.serverCount)
|
||||
vectors := make(map[string][]float32)
|
||||
|
||||
_ = r.db.ForEachSite(func(host string, info *storage.SiteInfo) error {
|
||||
score, ok := scores[host]
|
||||
if !ok || strings.Contains(host, "/") {
|
||||
return nil
|
||||
}
|
||||
mul := computeMul(host, info, stats)
|
||||
if mul == 0 {
|
||||
return nil
|
||||
}
|
||||
trueMul := math.Min(2, mul*math.Log2(2+score))
|
||||
|
||||
n := len(info.OutLinks)
|
||||
if n == 0 {
|
||||
return nil
|
||||
}
|
||||
w := 1.0 / math.Max(float64(n), 50)
|
||||
xd := make(map[string]float64)
|
||||
for _, link := range info.OutLinks {
|
||||
for _, seg := range decomposeURL(link) {
|
||||
xd[seg] += w
|
||||
}
|
||||
}
|
||||
serverType := ""
|
||||
if len(info.ServerTypes) > 0 {
|
||||
serverType = info.ServerTypes[0]
|
||||
}
|
||||
serverID := serverTable[serverType]
|
||||
|
||||
for seg, segW := range xd {
|
||||
fw := math.Min(segW, 0.15) * trueMul
|
||||
d[seg] += fw
|
||||
if d[seg] > 0.21 && !strings.Contains(seg, "/") && serverType != "" {
|
||||
if vectors[seg] == nil {
|
||||
vectors[seg] = make([]float32, 64)
|
||||
}
|
||||
vectors[seg][serverID] += float32(fw)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
d = vectorFilter(d, vectors, desc)
|
||||
for k, v := range d {
|
||||
if v <= 0.16 {
|
||||
delete(d, k)
|
||||
}
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
// ---- vector cosine filtering ----
|
||||
|
||||
func vectorFilter(d map[string]float64, vectors map[string][]float32, desc string) map[string]float64 {
|
||||
// Compute core vector (sum of all)
|
||||
core := make([]float64, 64)
|
||||
for _, vec := range vectors {
|
||||
for j, v := range vec {
|
||||
core[j] += float64(v)
|
||||
}
|
||||
}
|
||||
coreNorm := norm64(core)
|
||||
if coreNorm == 0 {
|
||||
return d
|
||||
}
|
||||
|
||||
newD := make(map[string]float64, len(d))
|
||||
for k, v := range d {
|
||||
baseK := strings.Split(k, "/")[0]
|
||||
if v > 0.21 && vectors[baseK] != nil {
|
||||
vec := vectors[baseK]
|
||||
vecNorm := float64(norm32(vec))
|
||||
if vecNorm == 0 {
|
||||
newD[k] = v
|
||||
continue
|
||||
}
|
||||
cos := dot32_64(vec, core) / (vecNorm * coreNorm)
|
||||
if cos > 1.01 {
|
||||
cos = 1.01
|
||||
}
|
||||
newV := math.Max(v*(0.25+cos*0.75), 0.21)
|
||||
newD[k] = newV
|
||||
} else {
|
||||
newD[k] = v
|
||||
}
|
||||
}
|
||||
|
||||
// Save cos map for diagnostics
|
||||
cosMap := make(map[string]float64)
|
||||
for k, vec := range vectors {
|
||||
vn := float64(norm32(vec))
|
||||
if vn > 0 {
|
||||
cosMap[k] = dot32_64(vec, core) / (vn * coreNorm)
|
||||
}
|
||||
}
|
||||
_ = writeJSON(desc+"_cos.json", cosMap)
|
||||
|
||||
return newD
|
||||
}
|
||||
|
||||
// ---- helpers ----
|
||||
|
||||
func computeMul(host string, info *storage.SiteInfo, stats *siteStats) float64 {
|
||||
if len(info.OutLinks) == 0 {
|
||||
return 0
|
||||
}
|
||||
t := info.LastVisitTime
|
||||
if t == 0 {
|
||||
t = 1640000000
|
||||
}
|
||||
days := (time.Now().Unix() - t) / (3600 * 24)
|
||||
if days > 180 {
|
||||
return 0
|
||||
}
|
||||
timeMul := math.Pow(0.99, float64(days))
|
||||
|
||||
super := superDomain(host)
|
||||
subCount := max(stats.subdomainCount[super], 1)
|
||||
tplCount := 1
|
||||
if info.HTMLStructure != "" {
|
||||
tplCount = max(stats.templateCount[info.HTMLStructure], 1)
|
||||
}
|
||||
count := max(subCount, int(float64(tplCount)*1.5))
|
||||
if count > 1000 {
|
||||
if rand.Float64() > 1000.0/float64(count) {
|
||||
return 0
|
||||
}
|
||||
count = 1000
|
||||
}
|
||||
domainMul := 1.0 / math.Pow(math.Max(float64(count), 5)/5, 0.6)
|
||||
return timeMul * domainMul
|
||||
}
|
||||
|
||||
// superDomain returns the last two dot-separated labels of host, e.g.
// "a.b.example.com" yields "example.com". A host with no dot is returned
// unchanged. No public-suffix handling is done, so "foo.co.uk" yields "co.uk".
func superDomain(host string) string {
	lastDot := strings.LastIndex(host, ".")
	if lastDot < 0 {
		return host
	}
	// When there is no earlier dot, LastIndex gives -1 and the slice below
	// starts at 0, returning the whole host.
	prevDot := strings.LastIndex(host[:lastDot], ".")
	return host[prevDot+1:]
}
|
||||
|
||||
func ipPrefix(ips []string) string {
|
||||
if len(ips) == 0 {
|
||||
return ""
|
||||
}
|
||||
sorted := sortedStrings(ips)
|
||||
parts := make([]string, len(sorted))
|
||||
for i, ip := range sorted {
|
||||
idx := strings.LastIndex(ip, ".")
|
||||
if idx > 0 {
|
||||
parts[i] = ip[:idx]
|
||||
} else {
|
||||
parts[i] = ip
|
||||
}
|
||||
}
|
||||
return strings.Join(parts, ",")
|
||||
}
|
||||
|
||||
// decomposeURL splits an http(s) URL into its cumulative path prefixes.
//
// The URL is lower-cased, the scheme is removed, "?" and "#" are treated as
// path separators and trailing slashes are dropped. The result lists the host
// followed by each successively longer prefix, e.g.
// "https://a.com/x/y?z=1" yields "a.com", "a.com/x", "a.com/x/y",
// "a.com/x/y/z=1". It returns nil for any other scheme, for an empty
// remainder, and when the remainder starts with "/" or "%".
func decomposeURL(rawURL string) []string {
	lowered := strings.ToLower(rawURL)

	var rest string
	switch {
	case strings.HasPrefix(lowered, "https://"):
		rest = lowered[len("https://"):]
	case strings.HasPrefix(lowered, "http://"):
		rest = lowered[len("http://"):]
	default:
		return nil
	}

	rest = strings.NewReplacer("?", "/", "#", "/").Replace(rest)
	rest = strings.TrimRight(rest, "/")
	if rest == "" || rest[0] == '/' || rest[0] == '%' {
		return nil
	}

	// Every "/" ends one prefix; the full string is the last one.
	segments := make([]string, 0, strings.Count(rest, "/")+1)
	for pos := 0; pos < len(rest); pos++ {
		if rest[pos] == '/' {
			segments = append(segments, rest[:pos])
		}
	}
	return append(segments, rest)
}
|
||||
|
||||
// buildServerTable assigns bucket IDs to the most common server types.
//
// The keys of serverCount are ranked by descending count, with ties broken by
// ascending key so the result does not depend on map iteration order. The top
// 63 receive IDs 1..63 in rank order. Keys outside the top 63 are absent from
// the returned map, so looking them up yields 0.
func buildServerTable(serverCount map[string]int) map[string]int {
	type entry struct {
		key   string
		count int
	}
	ranked := make([]entry, 0, len(serverCount))
	for k, v := range serverCount {
		ranked = append(ranked, entry{k, v})
	}
	sort.Slice(ranked, func(a, b int) bool {
		if ranked[a].count != ranked[b].count {
			return ranked[a].count > ranked[b].count
		}
		return ranked[a].key < ranked[b].key
	})

	const maxBuckets = 63
	if len(ranked) > maxBuckets {
		ranked = ranked[:maxBuckets]
	}
	table := make(map[string]int, len(ranked))
	for rank, e := range ranked {
		table[e.key] = rank + 1 // 0 is reserved for "not in table"
	}
	return table
}
|
||||
|
||||
// sortedStrings returns a new slice holding the elements of s in ascending
// order. The input slice is left untouched; the result is never nil.
func sortedStrings(s []string) []string {
	cp := make([]string, len(s))
	copy(cp, s)
	sort.Strings(cp)
	return cp
}
|
||||
|
||||
// norm64 returns the Euclidean (L2) norm of v; an empty or nil slice gives 0.
func norm64(v []float64) float64 {
	var sumSq float64
	for i := range v {
		sumSq += v[i] * v[i]
	}
	return math.Sqrt(sumSq)
}
|
||||
|
||||
// norm32 returns the Euclidean (L2) norm of v. The sum of squares is
// accumulated in float32; only the square root is taken in float64.
func norm32(v []float32) float32 {
	var sumSq float32
	for i := range v {
		sumSq += v[i] * v[i]
	}
	return float32(math.Sqrt(float64(sumSq)))
}
|
||||
|
||||
// dot32_64 returns the dot product of a and b, computed in float64. It
// iterates over the indices of a, so b must be at least as long as a.
func dot32_64(a []float32, b []float64) float64 {
	var sum float64
	for i, x := range a {
		sum += float64(x) * b[i]
	}
	return sum
}
|
||||
|
||||
// union returns the set of keys that occur in at least one of maps. Values
// are ignored; nil maps contribute nothing.
func union(maps ...map[string]float64) map[string]bool {
	keys := map[string]bool{}
	for i := 0; i < len(maps); i++ {
		for key := range maps[i] {
			keys[key] = true
		}
	}
	return keys
}
|
||||
|
||||
// writeJSON serialises data as indented JSON and stores it at path, creating
// missing parent directories (mode 0755) first.
//
// The content is written to "<path>.tmp" and then renamed over path, so a
// concurrent reader sees either the previous file or the complete new one,
// never a partially written file. It returns the first error encountered
// while creating the directory, encoding, writing or renaming.
func writeJSON(path string, data interface{}) error {
	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
		return err
	}
	b, err := json.MarshalIndent(data, "", " ")
	if err != nil {
		return err
	}

	tmp := path + ".tmp"
	if err := os.WriteFile(tmp, b, 0o644); err != nil {
		return err
	}
	if err := os.Rename(tmp, path); err != nil {
		// Best-effort cleanup; the rename failure is the error worth reporting.
		_ = os.Remove(tmp)
		return err
	}
	return nil
}
|
||||
|
||||
// max returns the larger of the two integers a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
|
||||
Reference in New Issue
Block a user