加上中文注释
This commit is contained in:
+68
-44
@@ -1,41 +1,43 @@
|
||||
// Package backlink computes backlink (prosperity) scores for all known domains,
|
||||
// using a PageRank-like algorithm over the site-level link graph.
|
||||
// backlink 包实现 PageRank 类似的反向链接评分算法,在网站级链接图上迭代计算繁荣分数。
|
||||
//
|
||||
// It runs every 48 hours and writes savedata/prosper.json.
|
||||
// 每 48 小时运行一次,将结果写入 savedata/prosper.json,供爬虫调度和搜索排序使用。
|
||||
package backlink
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log"
|
||||
"math"
|
||||
"math/rand"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
"encoding/json" // JSON 序列化(输出 prosper.json 和 cos map)
|
||||
"log" // 日志
|
||||
"math" // 数学运算(Log、开方、幂)
|
||||
"math/rand" // 随机数(对高频域名采样降权)
|
||||
"os" // 文件写入
|
||||
"path/filepath" // 路径拼接
|
||||
"strings" // 字符串操作
|
||||
"time" // 时间计算(下次运行时间、睡眠)
|
||||
|
||||
"sese-engine/storage"
|
||||
"sese-engine/storage" // 持久化存储
|
||||
)
|
||||
|
||||
// Runner runs the backlink calculation loop.
|
||||
// Runner manages the backlink calculation loop; it holds the storage handle and the storage root directory.
|
||||
type Runner struct {
|
||||
db *storage.DB // storage handle supplied to New
|
||||
storagePath string
|
||||
storagePath string // storage root directory (prosper.json is written under it)
|
||||
}
|
||||
|
||||
// New creates a Runner.
|
||||
// New creates a Runner from the given storage handle and storage root directory; it only builds the struct and starts no work.
|
||||
func New(db *storage.DB, storagePath string) *Runner {
|
||||
return &Runner{db: db, storagePath: storagePath}
|
||||
}
|
||||
|
||||
// Run loops forever, recalculating every 48 hours.
|
||||
// Run 无限循环,每 48 小时执行一次反向链接计算。
|
||||
// 每次运行对齐到凌晨 2:00(便于在低峰期执行重计算)。
|
||||
func (r *Runner) Run() {
|
||||
for {
|
||||
// Sleep until next scheduled run (aligned to 2am)
|
||||
// 计算距离下次运行(凌晨 2:00)的睡眠时长
|
||||
now := time.Now()
|
||||
target := time.Date(now.Year(), now.Month(), now.Day(), 2, 0, 0, 0, now.Location())
|
||||
if !target.After(now) {
|
||||
target = target.Add(48 * time.Hour)
|
||||
target = target.Add(48 * time.Hour) // today's 02:00 has already passed, so the next run is 48 hours after it (not the next day)
|
||||
}
|
||||
sleep := target.Sub(now)
|
||||
log.Printf("[backlink] next run at %v (in %v)", target.Format(time.RFC3339), sleep.Round(time.Minute))
|
||||
@@ -50,46 +52,50 @@ func (r *Runner) Run() {
|
||||
}
|
||||
}
|
||||
|
||||
// RunNow runs one computation cycle immediately (for testing / manual trigger).
|
||||
// RunNow performs a single computation right away, without waiting for the schedule, and returns the error from compute, if any.
|
||||
func (r *Runner) RunNow() error {
|
||||
return r.compute()
|
||||
}
|
||||
|
||||
// ---- computation ----
|
||||
// ---- 计算核心 ----
|
||||
|
||||
// siteStats holds statistics about the site graph, used for multi-dimensional filtering and weighting.
|
||||
type siteStats struct {
|
||||
subdomainCount map[string]int // superDomain → count
|
||||
templateCount map[string]int // htmlStructure → count
|
||||
sameIPCount map[string]int // ipPrefix → count
|
||||
serverCount map[string]int // serverType → count
|
||||
subdomainCount map[string]int // parent domain (superDomain) → subdomain count (spots many sub-sites of one organisation)
|
||||
templateCount map[string]int // HTML structure signature → occurrence count (spots sibling / mirror sites)
|
||||
sameIPCount map[string]int // IP prefix → site count (spots many sites on the same IP)
|
||||
serverCount map[string]int // server type combination → occurrence count (spots sites deployed on the same kind of server)
|
||||
}
|
||||
|
||||
// compute 执行完整的反向链接计算流程。
|
||||
// 包含:统计收集 → HTTPS/HTTP 分别迭代 → 合并 → 写入文件。
|
||||
func (r *Runner) compute() error {
|
||||
stats := r.collectStats()
|
||||
|
||||
// Phase 1: HTTPS sites
|
||||
// 阶段一:HTTPS 网站的 PageRank 迭代
|
||||
d1 := r.aggregate(func(info *storage.SiteInfo) bool {
|
||||
return info.HTTPSAvailable != nil && *info.HTTPSAvailable
|
||||
}, stats, "https_backlink")
|
||||
|
||||
// Phase 1a: second pass (echo) using d1 scores
|
||||
// 阶段一增强(Echo):用 d1 结果加权再做一轮迭代,放大已有繁荣值的域名
|
||||
d1a := r.aggregateWithScores(d1, stats, "echo")
|
||||
|
||||
// Phase 2: HTTP-only sites
|
||||
// 阶段二:HTTP only 网站的迭代(独立计算,不混入 HTTPS 分数)
|
||||
d2 := r.aggregate(func(info *storage.SiteInfo) bool {
|
||||
return info.HTTPSAvailable == nil || !*info.HTTPSAvailable
|
||||
}, stats, "http_backlink")
|
||||
|
||||
// Merge
|
||||
// 三路合并:HTTPS 分数主导,Echo 辅助,HTTP 补充
|
||||
merged := make(map[string]float64)
|
||||
for k := range union(d1, d2, d1a) {
|
||||
// 混合公式:HTTPS × 1 + Echo × 1 + min(HTTPS×0.5 + HTTP×0.1, HTTP)
|
||||
v := d1[k] + d1a[k] + math.Min(d1[k]*0.5+d2[k]*0.1, d2[k])
|
||||
if v > 0.16 {
|
||||
merged[k] = v
|
||||
}
|
||||
}
|
||||
|
||||
// Save
|
||||
// 写入文件
|
||||
path := filepath.Join(r.storagePath, "prosper.json")
|
||||
if err := writeJSON(path, merged); err != nil {
|
||||
return err
|
||||
@@ -98,7 +104,8 @@ func (r *Runner) compute() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// collectStats builds statistics about the site graph.
|
||||
// collectStats 遍历所有网站元信息,统计子域名、HTML 模板、IP、Server 类型分布。
|
||||
// 低于阈值(4)的统计项被剔除,以减少噪声影响。
|
||||
func (r *Runner) collectStats() *siteStats {
|
||||
stats := &siteStats{
|
||||
subdomainCount: make(map[string]int),
|
||||
@@ -125,7 +132,7 @@ func (r *Runner) collectStats() *siteStats {
|
||||
return nil
|
||||
})
|
||||
|
||||
// Prune counts below threshold
|
||||
// 剔除低频统计项
|
||||
for k, v := range stats.subdomainCount {
|
||||
if v < 4 {
|
||||
delete(stats.subdomainCount, k)
|
||||
@@ -144,13 +151,14 @@ func (r *Runner) collectStats() *siteStats {
|
||||
return stats
|
||||
}
|
||||
|
||||
// aggregate computes a backlink score map for sites matching the filter.
|
||||
// aggregate 执行一轮 PageRank 风格的链接权重迭代。
|
||||
// filter 筛选纳入计算的目标网站集合;desc 为日志标识。
|
||||
func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats, desc string) map[string]float64 {
|
||||
log.Printf("[backlink] aggregating: %s", desc)
|
||||
d := make(map[string]float64)
|
||||
ipSource := make(map[string]float64)
|
||||
|
||||
// Build server type index (top 63 most common)
|
||||
// 建立 Server 类型的 ID 映射表(最多 63 种,用于构建向量)
|
||||
serverTable := buildServerTable(stats.serverCount)
|
||||
|
||||
type vectorEntry struct {
|
||||
@@ -166,7 +174,7 @@ func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats
|
||||
if filter != nil && !filter(info) {
|
||||
return nil
|
||||
}
|
||||
mul := computeMul(host, info, stats)
|
||||
mul := computeMul(host, info, stats) // 计算域名综合乘数(时间衰减 + 子域名降权)
|
||||
if mul == 0 {
|
||||
return nil
|
||||
}
|
||||
@@ -176,6 +184,7 @@ func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats
|
||||
return nil
|
||||
}
|
||||
|
||||
// 每条出站链接的初始权重:1/max(n, 50),出站越多每条分得越少
|
||||
w := 1.0 / math.Max(float64(n), 50)
|
||||
xd := make(map[string]float64)
|
||||
for _, link := range info.OutLinks {
|
||||
@@ -196,10 +205,11 @@ func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats
|
||||
serverID := serverTable[serverType]
|
||||
|
||||
for seg, segW := range xd {
|
||||
fw := math.Min(segW, 0.15) * mul
|
||||
fw := math.Min(segW, 0.15) * mul // 截断上限 0.15,防止单链接权重过高
|
||||
prev := d[seg]
|
||||
d[seg] = prev + fw
|
||||
|
||||
// IP 来源去重:来自同一 IP 段的高权重链接在超过 0.4 后跳过,防止 IP 污染
|
||||
if prev > 0.2 {
|
||||
if _, sameIP := stats.sameIPCount[ipStr]; ipStr != "" && sameIP {
|
||||
key := seg + "-" + ipStr
|
||||
@@ -210,6 +220,7 @@ func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats
|
||||
}
|
||||
}
|
||||
|
||||
// 构建向量:域名 → Server 类型向量(用于余弦相似度过滤)
|
||||
if prev > 0.21 && !strings.Contains(seg, "/") && serverType != "" {
|
||||
if vectors[seg] == nil {
|
||||
vectors[seg] = make([]float32, 64)
|
||||
@@ -219,8 +230,8 @@ func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats
|
||||
}
|
||||
|
||||
i++
|
||||
// 每 20 万条遍历后清理低分条目,防止内存膨胀
|
||||
if i%200000 == 0 {
|
||||
// Prune low-score entries
|
||||
for k, v := range d {
|
||||
if v < pruneThreshold {
|
||||
delete(d, k)
|
||||
@@ -238,10 +249,10 @@ func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats
|
||||
return nil
|
||||
})
|
||||
|
||||
// Vectorised cosine filtering
|
||||
// 向量余弦过滤:去除 Server 类型特征偏离核心向量的域名(可能是噪音/作弊)
|
||||
d = vectorFilter(d, vectors, desc)
|
||||
|
||||
// Prune
|
||||
// 最终清理:分数 ≤ 0.16 的域名不写入(低于此阈值认为不繁荣)
|
||||
for k, v := range d {
|
||||
if v <= 0.16 {
|
||||
delete(d, k)
|
||||
@@ -252,7 +263,8 @@ func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats
|
||||
return d
|
||||
}
|
||||
|
||||
// aggregateWithScores does a second pass weighted by existing scores.
|
||||
// aggregateWithScores 在已有繁荣分数的基础上加权再做一轮迭代(Echo 阶段)。
|
||||
// 对已有分数的域名给予更高权重(乘以 log2(2+score)),使强者更强。
|
||||
func (r *Runner) aggregateWithScores(scores map[string]float64, stats *siteStats, desc string) map[string]float64 {
|
||||
log.Printf("[backlink] aggregating with scores: %s", desc)
|
||||
d := make(map[string]float64)
|
||||
@@ -268,6 +280,7 @@ func (r *Runner) aggregateWithScores(scores map[string]float64, stats *siteStats
|
||||
if mul == 0 {
|
||||
return nil
|
||||
}
|
||||
// 已有分数的域名获得加权乘数(上限 2×)
|
||||
trueMul := math.Min(2, mul*math.Log2(2+score))
|
||||
|
||||
n := len(info.OutLinks)
|
||||
@@ -309,10 +322,12 @@ func (r *Runner) aggregateWithScores(scores map[string]float64, stats *siteStats
|
||||
return d
|
||||
}
|
||||
|
||||
// ---- vector cosine filtering ----
|
||||
// ---- 向量余弦过滤 ----
|
||||
|
||||
// vectorFilter 使用余弦相似度过滤域名分数:保留与核心 Server 类型向量相似的域名。
|
||||
// 与核心方向偏离的域名可能是噪音(如作弊农场、链接买卖)。
|
||||
func vectorFilter(d map[string]float64, vectors map[string][]float32, desc string) map[string]float64 {
|
||||
// Compute core vector (sum of all)
|
||||
// 计算全网站的 Server 类型核心向量(所有向量求和)
|
||||
core := make([]float64, 64)
|
||||
for _, vec := range vectors {
|
||||
for j, v := range vec {
|
||||
@@ -334,10 +349,12 @@ func vectorFilter(d map[string]float64, vectors map[string][]float32, desc strin
|
||||
newD[k] = v
|
||||
continue
|
||||
}
|
||||
// 余弦相似度:范围 [-1, 1]
|
||||
cos := dot32_64(vec, core) / (vecNorm * coreNorm)
|
||||
if cos > 1.01 {
|
||||
cos = 1.01
|
||||
}
|
||||
// cos × 0.75 + 0.25:确保最低也有 0.25 的权重,不完全剔除
|
||||
newV := math.Max(v*(0.25+cos*0.75), 0.21)
|
||||
newD[k] = newV
|
||||
} else {
|
||||
@@ -345,7 +362,7 @@ func vectorFilter(d map[string]float64, vectors map[string][]float32, desc strin
|
||||
}
|
||||
}
|
||||
|
||||
// Save cos map for diagnostics
|
||||
// 保存 cos map 用于诊断
|
||||
cosMap := make(map[string]float64)
|
||||
for k, vec := range vectors {
|
||||
vn := float64(norm32(vec))
|
||||
@@ -358,8 +375,10 @@ func vectorFilter(d map[string]float64, vectors map[string][]float32, desc strin
|
||||
return newD
|
||||
}
|
||||
|
||||
// ---- helpers ----
|
||||
// ---- 辅助函数 ----
|
||||
|
||||
// computeMul 计算某网站在繁荣值计算中的综合乘数。
|
||||
// 综合考虑:最后访问时间(超过 180 天排除)、子域名数量(越多平均分越低)。
|
||||
func computeMul(host string, info *storage.SiteInfo, stats *siteStats) float64 {
|
||||
if len(info.OutLinks) == 0 {
|
||||
return 0
|
||||
@@ -370,7 +389,7 @@ func computeMul(host string, info *storage.SiteInfo, stats *siteStats) float64 {
|
||||
}
|
||||
days := (time.Now().Unix() - t) / (3600 * 24)
|
||||
if days > 180 {
|
||||
return 0
|
||||
return 0 // 半年未更新,排除
|
||||
}
|
||||
timeMul := math.Pow(0.99, float64(days))
|
||||
|
||||
@@ -381,6 +400,7 @@ func computeMul(host string, info *storage.SiteInfo, stats *siteStats) float64 {
|
||||
tplCount = max(stats.templateCount[info.HTMLStructure], 1)
|
||||
}
|
||||
count := max(subCount, int(float64(tplCount)*1.5))
|
||||
// 高频域名随机丢弃:保持最多 1000 个域名参与计算(减少重复镜像的投票)
|
||||
if count > 1000 {
|
||||
if rand.Float64() > 1000.0/float64(count) {
|
||||
return 0
|
||||
@@ -391,6 +411,7 @@ func computeMul(host string, info *storage.SiteInfo, stats *siteStats) float64 {
|
||||
return timeMul * domainMul
|
||||
}
|
||||
|
||||
// superDomain 提取顶级域名(去除子域名)。
|
||||
func superDomain(host string) string {
|
||||
parts := strings.Split(host, ".")
|
||||
if len(parts) >= 2 {
|
||||
@@ -399,6 +420,7 @@ func superDomain(host string) string {
|
||||
return host
|
||||
}
|
||||
|
||||
// ipPrefix 将 IP 列表去重排序后返回逗号拼接的 /24 前缀(用于识别同 C 段主机)。
|
||||
func ipPrefix(ips []string) string {
|
||||
if len(ips) == 0 {
|
||||
return ""
|
||||
@@ -408,7 +430,7 @@ func ipPrefix(ips []string) string {
|
||||
for i, ip := range sorted {
|
||||
idx := strings.LastIndex(ip, ".")
|
||||
if idx > 0 {
|
||||
parts[i] = ip[:idx]
|
||||
parts[i] = ip[:idx] // 取 /24 前缀
|
||||
} else {
|
||||
parts[i] = ip
|
||||
}
|
||||
@@ -416,6 +438,7 @@ func ipPrefix(ips []string) string {
|
||||
return strings.Join(parts, ",")
|
||||
}
|
||||
|
||||
// decomposeURL 将 URL 分解为递增路径段(同 info 包)。
|
||||
func decomposeURL(rawURL string) []string {
|
||||
u := strings.ToLower(rawURL)
|
||||
if strings.HasPrefix(u, "https://") {
|
||||
@@ -442,6 +465,7 @@ func decomposeURL(rawURL string) []string {
|
||||
return out
|
||||
}
|
||||
|
||||
// buildServerTable 将 Server 类型按频率降序排列,取前 63 种分配 ID(0 不用)。
|
||||
func buildServerTable(serverCount map[string]int) map[string]int {
|
||||
type kv struct {
|
||||
k string
|
||||
@@ -449,7 +473,7 @@ func buildServerTable(serverCount map[string]int) map[string]int {
|
||||
}
|
||||
var sorted []kv
|
||||
for k, v := range serverCount {
|
||||
sorted = append(sorted, kv{k, v})
|
||||
sorted = append(sorted, kv{k: k, v: v})
|
||||
}
|
||||
for i := 0; i < len(sorted)-1; i++ {
|
||||
for j := i + 1; j < len(sorted); j++ {
|
||||
|
||||
Reference in New Issue
Block a user