加上中文注释

2026-04-08 17:48:05 +08:00
parent 6c2f5ad978
commit c154abf410
11 changed files with 830 additions and 560 deletions
@@ -1,33 +1,34 @@
 // Package info loads and serves auxiliary data: backlink scores, adjustment
 // table, and blocked query words.
+// info 包负责加载和管理辅助数据：繁荣表（反向链接分数）、调整表（人工权重调整）和屏蔽词表。
 package info

 import (
-	"encoding/json"
-	"math"
-	"os"
-	"path/filepath"
-	"strings"
-	"sync"
+	"encoding/json"  // JSON 反序列化
+	"math"          // 对数运算（Log2）
+	"os"            // 文件读取
+	"path/filepath" // 路径拼接
+	"strings"       // 字符串操作
+	"sync"          // 读写锁
 )

-// Service loads the prosperity map, adjustment table, and blocked words.
+// Service 管理繁荣表、调整表和屏蔽词表，并提供只读快照。
 type Service struct {
-	mu          sync.RWMutex
-	prosperMap  map[string]float64 // normalised backlink scores
-	adjustTable map[string]float64 // per-domain manual weight adjustments
-	blockedWords map[string]bool
-	storagePath  string
+	mu           sync.RWMutex
+	prosperMap   map[string]float64   // 繁荣表：域名 → 归一化反向链接分数
+	adjustTable  map[string]float64   // 调整表：主机名 → 人工权重倍数（默认 1.0）
+	blockedWords map[string]bool       // 屏蔽词集合：搜索时直接过滤
+	storagePath  string               // 存储根目录路径
 }

-// New creates and loads the info service from storagePath.
+// New 创建并加载 info Service，从 storagePath 目录读取数据文件。
 func New(storagePath string) *Service {
 	s := &Service{storagePath: storagePath}
 	s.Reload()
 	return s
 }

-// Reload re-reads all data files from disk.
+// Reload 从磁盘重新加载所有数据文件（支持热更新配置）。
 func (s *Service) Reload() {
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -36,14 +37,16 @@ func (s *Service) Reload() {
 	s.blockedWords = loadBlockedWords()
 }

-// Prosper returns the backlink score for a URL (sum of its path components).
+// Prosper 返回指定 URL 的繁荣分数（对其所有路径段累计计算）。
+// 分数越高表示该域名越"有价值"（反向链接越多）。
 func (s *Service) Prosper(rawURL string) float64 {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 	return prosperFor(rawURL, s.prosperMap)
 }

-// ProsperMap returns the full prosperity map (read-only snapshot).
+// ProsperMap 返回繁荣表的完整只读快照（深拷贝）。
+// 供爬虫调度算法使用。
 func (s *Service) ProsperMap() map[string]float64 {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
@@ -54,7 +57,8 @@ func (s *Service) ProsperMap() map[string]float64 {
 	return out
 }

-// Adjust returns the manual weight multiplier for a hostname (default 1.0).
+// Adjust 返回指定主机名的人工权重倍数（默认 1.0）。
+// 允许管理员通过调整表提升或降低某些域名的爬取/搜索优先级。
 func (s *Service) Adjust(host string) float64 {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
@@ -64,17 +68,19 @@ func (s *Service) Adjust(host string) float64 {
 	return 1.0
 }

-// IsBlocked returns true if the word is in the blocked list.
+// IsBlocked 判断某词是否在屏蔽词列表中（搜索时不返回含该词的结果）。
 func (s *Service) IsBlocked(word string) bool {
 	s.mu.RLock()
 	defer s.mu.RUnlock()
 	return s.blockedWords[word]
 }

-// ---- loaders ----
+// ---- 数据加载函数 ----

+// backlinkBaseline 是繁荣表归一化的目标总和：所有不含路径的域名条目（键中不含 "/"）的分数之和会被缩放到该值。
 const backlinkBaseline = 200000.0

+// loadProsperMap 从 storagePath 目录下的 prosper.json 加载繁荣表，并进行归一化和域名树传播。
 func loadProsperMap(storagePath string) map[string]float64 {
 	path := filepath.Join(storagePath, "prosper.json")
 	f, err := os.Open(path)
@@ -89,7 +95,11 @@ func loadProsperMap(storagePath string) map[string]float64 {
 	return normalise(raw)
 }

+// normalise 对繁荣表进行归一化，并执行域名树传播。
+// 归一化：将所有不含路径的域名条目（键中不含 "/"）的分数总和缩放到 backlinkBaseline。
+// 传播：子域名分数向上传播到父域名（父域名分数不低于任何子域名）。
 func normalise(d map[string]float64) map[string]float64 {
+	// 计算不含路径的域名条目（键中不含 "/"）的分数总和
 	total := 0.0
 	for k, v := range d {
 		if !strings.Contains(k, "/") {
@@ -99,12 +109,13 @@ func normalise(d map[string]float64) map[string]float64 {
 	if total == 0 {
 		return d
 	}
+	// 按总和归一化
 	factor := backlinkBaseline / total
 	out := make(map[string]float64, len(d))
 	for k, v := range d {
 		out[k] = v * factor
 	}
-	// Propagate max score up the domain tree
+	// 域名树传播：把子域名的分数向上传播，使父域名分数 ≥ 子域名分数
 	for k, v := range out {
 		now := k
 		for {
@@ -112,9 +123,9 @@ func normalise(d map[string]float64) map[string]float64 {
 			if idx < 0 {
 				break
 			}
-			now = now[idx+1:]
+			now = now[idx+1:] // 上移一级
 			if cur, ok := out[now]; ok && cur < v {
-				out[now] = v
+				out[now] = v // 父域名分数不低于子域名
 			} else if !ok {
 				break
 			}
@@ -123,8 +134,9 @@ func normalise(d map[string]float64) map[string]float64 {
 	return out
 }

+// loadAdjustTable 从 data/adjust.json 加载人工调整表（主机名 → 权重倍数）。
+// 文件不存在时返回空 map（所有域名权重为默认 1.0）。
 func loadAdjustTable() map[string]float64 {
-	// Try loading from data/adjust.json — fallback if absent
 	f, err := os.Open(filepath.Join("data", "adjust.json"))
 	if err != nil {
 		return map[string]float64{}
@@ -135,6 +147,8 @@ func loadAdjustTable() map[string]float64 {
 	return m
 }

+// loadBlockedWords 从 data/blocked_words.json 加载屏蔽词列表。
+// 文件不存在时返回空集合。
 func loadBlockedWords() map[string]bool {
 	f, err := os.Open(filepath.Join("data", "blocked_words.json"))
 	if err != nil {
@@ -150,7 +164,8 @@ func loadBlockedWords() map[string]bool {
 	return m
 }

-// prosperFor computes the prosperity score for a URL by decomposing it.
+// prosperFor 对 URL 按路径段分解查询繁荣表，计算综合繁荣分数。
+// 分数计算：对每段取 Log2 变换后累加，返回值范围约 [0.1, +∞)；累计分数不大于 0 时返回 0。
 func prosperFor(rawURL string, pm map[string]float64) float64 {
 	segments := decomposeURL(rawURL)
 	s := 0.0
@@ -161,7 +176,7 @@ func prosperFor(rawURL string, pm map[string]float64) float64 {
 		}
 		l := 0.0
 		if t > 0 {
-			l = math.Log2(2+t*2) - 1
+			l = math.Log2(2+t*2) - 1 // Log2(2+2t)-1，t=0 时为 0，随 t 增大而增大
 		}
 		if s == 0 {
 			if l == 0 {
@@ -169,7 +184,7 @@ func prosperFor(rawURL string, pm map[string]float64) float64 {
 			}
 			s = l
 		} else {
-			s = l + math.Log((s-l)/2+1)
+			s = l + math.Log((s-l)/2+1) // 累加并衰减
 		}
 	}
 	if s > 0 {
@@ -178,7 +193,9 @@ func prosperFor(rawURL string, pm map[string]float64) float64 {
 	return 0
 }

-// decomposeURL yields "domain.tld", "domain.tld/path", "domain.tld/path/sub", ...
+// decomposeURL 将 URL 分解为递增的路径段。
+// 例如："https://zh.wikipedia.org/wiki/Go" → ["zh.wikipedia.org", "zh.wikipedia.org/wiki", "zh.wikipedia.org/wiki/Go"]。
+// 用于按从泛到精的顺序查繁荣表。
 func decomposeURL(rawURL string) []string {
 	u := strings.ToLower(rawURL)
 	if strings.HasPrefix(u, "https://") {
@@ -188,18 +205,19 @@ func decomposeURL(rawURL string) []string {
 	} else {
 		return nil
 	}
-	u = strings.ReplaceAll(u, "?", "/")
-	u = strings.ReplaceAll(u, "#", "/")
+	u = strings.ReplaceAll(u, "?", "/")  // 查询参数转路径
+	u = strings.ReplaceAll(u, "#", "/")  // 锚点转路径
 	u = strings.TrimRight(u, "/")
+	// 过滤无效格式
 	if u == "" || u[0] == '/' || u[0] == '%' || u[0] == ' ' {
 		return nil
 	}
 	parts := strings.Split(u, "/")
 	var out []string
 	current := parts[0]
-	out = append(out, current)
+	out = append(out, current) // 第一段：主机名（不含路径）
 	for _, p := range parts[1:] {
-		current = current + "/" + p
+		current = current + "/" + p // 逐步拼接路径段
 		out = append(out, current)
 	}
 	return out