加上中文注释
This commit is contained in:
+92
-66
@@ -1,43 +1,45 @@
|
|||||||
// Package analyzer provides keyword extraction and language detection.
|
// Package analyzer provides keyword extraction and language detection.
|
||||||
|
// analyzer 包提供文本分词、关键词提取和语种检测功能。
|
||||||
//
|
//
|
||||||
// Keyword extraction uses gojieba for Chinese segmentation and simple token
|
// 分词策略:中文使用 gojieba(结巴分词 C++)进行精确/搜索模式分词;
|
||||||
// splitting for ASCII words. Language detection uses lingua-go (pure Go, no CGo).
|
// 纯 ASCII 英文按空格切分。语言检测使用 lingua-go(纯 Go,无外部模型文件)。
|
||||||
package analyzer
|
package analyzer
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json" // JSON 反序列化(加载屏蔽词列表)
|
||||||
"math"
|
"math" // 数学运算(最小值、开方)
|
||||||
"os"
|
"os" // 文件系统操作(读取屏蔽词文件)
|
||||||
"strings"
|
"strings" // 字符串操作
|
||||||
"sync"
|
"sync" // 互斥锁(保护 jieba 的非线程安全调用)
|
||||||
"unicode"
|
"unicode" // Unicode 字符判断
|
||||||
|
|
||||||
"github.com/pemistahl/lingua-go"
|
"github.com/pemistahl/lingua-go" // 纯 Go 语言检测库(支持 75 种语言)
|
||||||
"github.com/yanyiwu/gojieba"
|
"github.com/yanyiwu/gojieba" // Gojieba:C++ 结巴分词的 Go 封装
|
||||||
)
|
)
|
||||||
|
|
||||||
// Keyword holds a (word, weight) pair.
|
// Keyword 表示一个关键词及其权重。
|
||||||
type Keyword struct {
|
type Keyword struct {
|
||||||
Word string `json:"word"`
|
Word string `json:"word"` // 分词后的单词/词组
|
||||||
Weight float32 `json:"weight"`
|
Weight float32 `json:"weight"` // TF(词频)权重,反映该词在页面中的重要程度
|
||||||
}
|
}
|
||||||
|
|
||||||
// Analyzer wraps jieba and lingua into a thread-safe analysis pipeline.
|
// Analyzer 封装结巴分词和语言检测器,提供线程安全的分析流水线。
|
||||||
type Analyzer struct {
|
type Analyzer struct {
|
||||||
jieba *gojieba.Jieba
|
jieba *gojieba.Jieba // 结巴分词句柄(C++ 实现,非线程安全)
|
||||||
detector lingua.LanguageDetector
|
detector lingua.LanguageDetector // 语言检测器(lingua-go)
|
||||||
stopWords map[string]bool
|
stopWords map[string]bool // 屏蔽词集合(标点符号、停用词等)
|
||||||
mu sync.Mutex // gojieba is not goroutine-safe
|
mu sync.Mutex // 保护 jieba 调用的互斥锁
|
||||||
}
|
}
|
||||||
|
|
||||||
// New creates an Analyzer.
|
// New 创建一个 Analyzer 实例。
|
||||||
// stopWordsPath is the JSON file with punctuation/stop words (may be empty string).
|
// modelPath:语言模型路径(已废弃,lingua-go 使用内置数据,无需外部文件);
|
||||||
// modelPath is ignored (kept for API compatibility; lingua-go uses built-in data).
|
// stopWordsPath:屏蔽词 JSON 文件路径(不含文件时传入空字符串)。
|
||||||
func New(modelPath, stopWordsPath string) (*Analyzer, error) {
|
func New(modelPath, stopWordsPath string) (*Analyzer, error) {
|
||||||
|
// 初始化结巴分词(加载词典,需调用 Free 释放)
|
||||||
j := gojieba.NewJieba()
|
j := gojieba.NewJieba()
|
||||||
|
|
||||||
// Build a lingua detector that covers the languages we care about.
|
// 构建 lingua 语言检测器,覆盖所有 75 种语言(含中日韩英等)
|
||||||
// AllLanguages() covers 75 languages including Chinese, Japanese, Korean, etc.
|
// MinimumRelativeDistance=0.15:降低检测阈值,提高短文本召回率
|
||||||
detector := lingua.NewLanguageDetectorBuilder().
|
detector := lingua.NewLanguageDetectorBuilder().
|
||||||
FromAllLanguages().
|
FromAllLanguages().
|
||||||
WithMinimumRelativeDistance(0.15).
|
WithMinimumRelativeDistance(0.15).
|
||||||
@@ -52,12 +54,14 @@ func New(modelPath, stopWordsPath string) (*Analyzer, error) {
|
|||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close releases resources held by the analyzer.
|
// Close 释放 Analyzer 持有的资源(主要是结巴分词的 C++ 内存)。
|
||||||
func (a *Analyzer) Close() {
|
func (a *Analyzer) Close() {
|
||||||
a.jieba.Free()
|
a.jieba.Free()
|
||||||
}
|
}
|
||||||
|
|
||||||
// loadStopWords reads a JSON array of stop-word strings.
|
// loadStopWords 从 JSON 文件加载屏蔽词列表到 map 中(O(1) 查找)。
|
||||||
|
// JSON 格式:字符串数组,如 [",", "。", "的", "了"]。
|
||||||
|
// 文件不存在或格式错误时返回空 map。
|
||||||
func loadStopWords(path string) map[string]bool {
|
func loadStopWords(path string) map[string]bool {
|
||||||
if path == "" {
|
if path == "" {
|
||||||
return map[string]bool{}
|
return map[string]bool{}
|
||||||
@@ -73,23 +77,29 @@ func loadStopWords(path string) map[string]bool {
|
|||||||
}
|
}
|
||||||
m := make(map[string]bool, len(words))
|
m := make(map[string]bool, len(words))
|
||||||
for _, w := range words {
|
for _, w := range words {
|
||||||
m[strings.ToLower(w)] = true
|
m[strings.ToLower(w)] = true // 转为小写存储,大小写不敏感
|
||||||
}
|
}
|
||||||
return m
|
return m
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tokenize segments a string into tokens using jieba for CJK and space-split for ASCII.
|
// Tokenize 将字符串分词为词列表。
|
||||||
|
// searchMode=true:搜索模式分词(更细粒度,适合搜索查询);
|
||||||
|
// searchMode=false:精确模式分词(适合页面内容分析)。
|
||||||
|
// 策略:纯 ASCII 字母数字按空格切分;含中文/其他字符的片段交给结巴处理。
|
||||||
func (a *Analyzer) Tokenize(s string, searchMode bool) []string {
|
func (a *Analyzer) Tokenize(s string, searchMode bool) []string {
|
||||||
|
// 超长文本截断(jieba 对极长文本处理效率下降)
|
||||||
if len(s) > 10000 {
|
if len(s) > 10000 {
|
||||||
s = s[:10000]
|
s = s[:10000]
|
||||||
}
|
}
|
||||||
// Sanitize: replace invalid UTF-8 sequences so gojieba (C++) never sees decode errors.
|
// 清洗非 UTF-8 字节,防止 gojieba 的 C++ 层报 "decode failed" 错误
|
||||||
s = strings.ToValidUTF8(s, "")
|
s = strings.ToValidUTF8(s, "")
|
||||||
var result []string
|
var result []string
|
||||||
for _, part := range strings.Fields(s) {
|
for _, part := range strings.Fields(s) {
|
||||||
if isASCIIAlnum(part) {
|
if isASCIIAlnum(part) {
|
||||||
|
// 纯 ASCII 片段直接保留,不走 jieba
|
||||||
result = append(result, part)
|
result = append(result, part)
|
||||||
} else {
|
} else {
|
||||||
|
// 非 ASCII(含中文/日文等):加锁后调用 jieba 分词
|
||||||
a.mu.Lock()
|
a.mu.Lock()
|
||||||
var tokens []string
|
var tokens []string
|
||||||
if searchMode {
|
if searchMode {
|
||||||
@@ -104,71 +114,84 @@ func (a *Analyzer) Tokenize(s string, searchMode bool) []string {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
// Normalize strips non-alphanumeric, non-CJK characters and lowercases.
|
// Normalize 标准化字符串:去除非字母数字非 CJK 字符,并转为小写。
|
||||||
|
// 用于分词后清洗,使不同大小写/全角格式的同一词归一为统一形式。
|
||||||
func Normalize(s string) string {
|
func Normalize(s string) string {
|
||||||
var b strings.Builder
|
var b strings.Builder
|
||||||
for _, r := range s {
|
for _, r := range s {
|
||||||
|
// 保留:英文字母(a-zA-Z0-9)和 CJK 统一汉字(0x4e00-0x9fa5)
|
||||||
if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || (r >= 0x4e00 && r <= 0x9fa5) {
|
if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || (r >= 0x4e00 && r <= 0x9fa5) {
|
||||||
if r >= 'A' && r <= 'Z' {
|
if r >= 'A' && r <= 'Z' {
|
||||||
b.WriteRune(unicode.ToLower(r))
|
b.WriteRune(unicode.ToLower(r)) // 大写转小写
|
||||||
} else {
|
} else {
|
||||||
b.WriteRune(r)
|
b.WriteRune(r) // 直接写入
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return b.String()
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
// weightedTokens builds a map of token→weight from a text with an optional weight multiplier.
|
// weightedTokens 对一段文本计算每个分词的 TF(词频)权重,返回 token→权重 map。
|
||||||
|
// w 为权重倍数(标题权重 1.0,描述权重 0.5,正文权重 1.0)。
|
||||||
func (a *Analyzer) weightedTokens(text string, w float32) map[string]float32 {
|
func (a *Analyzer) weightedTokens(text string, w float32) map[string]float32 {
|
||||||
tokens := a.Tokenize(text, false)
|
tokens := a.Tokenize(text, false)
|
||||||
d := make(map[string]float32)
|
d := make(map[string]float32)
|
||||||
|
// 归一化分母:至少 8,防止只有 1-2 个词时权重过大
|
||||||
n := math.Max(8, float64(len(tokens)))
|
n := math.Max(8, float64(len(tokens)))
|
||||||
counts := make(map[string]int)
|
counts := make(map[string]int)
|
||||||
for _, t := range tokens {
|
for _, t := range tokens {
|
||||||
t = Normalize(t)
|
t = Normalize(t)
|
||||||
|
// 跳过空词、屏蔽词、超长词(超过 32 字符)
|
||||||
if t == "" || a.stopWords[t] || len(t) > 32 {
|
if t == "" || a.stopWords[t] || len(t) > 32 {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
counts[t]++
|
counts[t]++
|
||||||
}
|
}
|
||||||
|
// 权重计算:min(0.2, 词频/总词数) × 权重倍数
|
||||||
|
// 即单词权重上限 0.2,避免某个词过度主导
|
||||||
for k, v := range counts {
|
for k, v := range counts {
|
||||||
d[k] = float32(math.Min(0.2, float64(v)/n)) * w
|
d[k] = float32(math.Min(0.2, float64(v)/n)) * w
|
||||||
}
|
}
|
||||||
return d
|
return d
|
||||||
}
|
}
|
||||||
|
|
||||||
// Analyze extracts weighted keywords from title, description, and body text.
|
// Analyze 从标题、描述、正文三段文本中提取关键词并计算综合权重。
|
||||||
// Returns a slice sorted by weight descending.
|
// 标题权重 1.0,描述权重 0.5,正文权重 1.0,三者权重相加后排序返回。
|
||||||
|
// 返回按权重降序排列的关键词切片。
|
||||||
func (a *Analyzer) Analyze(title, description, text string) []Keyword {
|
func (a *Analyzer) Analyze(title, description, text string) []Keyword {
|
||||||
|
// 分别计算三段的词权重
|
||||||
maps := []map[string]float32{
|
maps := []map[string]float32{
|
||||||
a.weightedTokens(title, 1.0),
|
a.weightedTokens(title, 1.0), // 标题权重最高
|
||||||
a.weightedTokens(description, 0.5),
|
a.weightedTokens(description, 0.5), // 描述权重中等
|
||||||
a.weightedTokens(text, 1.0),
|
a.weightedTokens(text, 1.0), // 正文权重同标题
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 合并三段权重:先去重建立 key 集合
|
||||||
combined := make(map[string]float32)
|
combined := make(map[string]float32)
|
||||||
for _, m := range maps {
|
for _, m := range maps {
|
||||||
for k := range m {
|
for k := range m {
|
||||||
combined[k] = 0
|
combined[k] = 0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// 再累加各段权重
|
||||||
for k := range combined {
|
for k := range combined {
|
||||||
for _, m := range maps {
|
for _, m := range maps {
|
||||||
combined[k] += m[k]
|
combined[k] += m[k]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 转换为结果切片
|
||||||
result := make([]Keyword, 0, len(combined))
|
result := make([]Keyword, 0, len(combined))
|
||||||
for k, v := range combined {
|
for k, v := range combined {
|
||||||
result = append(result, Keyword{Word: k, Weight: v})
|
result = append(result, Keyword{Word: k, Weight: v})
|
||||||
}
|
}
|
||||||
|
|
||||||
sortKeywords(result)
|
sortKeywords(result) // 按权重降序
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
// Segment returns search-mode tokens for a query string.
|
// Segment 对查询字符串进行搜索模式分词并标准化(用于搜索场景)。
|
||||||
|
// 去除屏蔽词和超长词,返回有效分词列表。
|
||||||
func (a *Analyzer) Segment(query string, searchMode bool) []string {
|
func (a *Analyzer) Segment(query string, searchMode bool) []string {
|
||||||
tokens := a.Tokenize(query, searchMode)
|
tokens := a.Tokenize(query, searchMode)
|
||||||
var result []string
|
var result []string
|
||||||
@@ -182,34 +205,35 @@ func (a *Analyzer) Segment(query string, searchMode bool) []string {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
// linguaToISO639 maps lingua.Language to the ISO 639-1 code used by the rest of the engine.
|
// linguaToISO639 将 lingua 的语言枚举映射为 ISO 639-1 两字母代码。
|
||||||
// Returns "" for unknown or unsupported languages.
|
// 只包含引擎关心的主要语种,未知语种返回空字符串。
|
||||||
var linguaToISO639 = map[lingua.Language]string{
|
var linguaToISO639 = map[lingua.Language]string{
|
||||||
lingua.Chinese: "zh",
|
lingua.Chinese: "zh", // 中文
|
||||||
lingua.English: "en",
|
lingua.English: "en", // 英语
|
||||||
lingua.Japanese: "ja",
|
lingua.Japanese: "ja", // 日语
|
||||||
lingua.Korean: "ko",
|
lingua.Korean: "ko", // 韩语
|
||||||
lingua.French: "fr",
|
lingua.French: "fr", // 法语
|
||||||
lingua.German: "de",
|
lingua.German: "de", // 德语
|
||||||
lingua.Spanish: "es",
|
lingua.Spanish: "es", // 西班牙语
|
||||||
lingua.Portuguese: "pt",
|
lingua.Portuguese: "pt", // 葡萄牙语
|
||||||
lingua.Italian: "it",
|
lingua.Italian: "it", // 意大利语
|
||||||
lingua.Russian: "ru",
|
lingua.Russian: "ru", // 俄语
|
||||||
lingua.Arabic: "ar",
|
lingua.Arabic: "ar", // 阿拉伯语
|
||||||
lingua.Hindi: "hi",
|
lingua.Hindi: "hi", // 印地语
|
||||||
lingua.Dutch: "nl",
|
lingua.Dutch: "nl", // 荷兰语
|
||||||
lingua.Polish: "pl",
|
lingua.Polish: "pl", // 波兰语
|
||||||
lingua.Swedish: "sv",
|
lingua.Swedish: "sv", // 瑞典语
|
||||||
lingua.Turkish: "tr",
|
lingua.Turkish: "tr", // 土耳其语
|
||||||
lingua.Vietnamese: "vi",
|
lingua.Vietnamese: "vi", // 越南语
|
||||||
lingua.Thai: "th",
|
lingua.Thai: "th", // 泰语
|
||||||
lingua.Indonesian: "id",
|
lingua.Indonesian: "id", // 印尼语
|
||||||
lingua.Malay: "ms",
|
lingua.Malay: "ms", // 马来语
|
||||||
}
|
}
|
||||||
|
|
||||||
// DetectLanguage returns the ISO 639-1 language code for the text, or "".
|
// DetectLanguage 检测文本语种,返回 ISO 639-1 两字母代码,或空字符串表示无法判断。
|
||||||
|
// 截断到 2000 字符以提升检测速度(lingua 对长文本处理较慢)。
|
||||||
func (a *Analyzer) DetectLanguage(text string) string {
|
func (a *Analyzer) DetectLanguage(text string) string {
|
||||||
text = strings.ReplaceAll(text, "\n", " ")
|
text = strings.ReplaceAll(text, "\n", " ") // 换行替换为空格
|
||||||
if len(text) > 2000 {
|
if len(text) > 2000 {
|
||||||
text = text[:2000]
|
text = text[:2000]
|
||||||
}
|
}
|
||||||
@@ -218,16 +242,17 @@ func (a *Analyzer) DetectLanguage(text string) string {
|
|||||||
}
|
}
|
||||||
lang, exists := a.detector.DetectLanguageOf(text)
|
lang, exists := a.detector.DetectLanguageOf(text)
|
||||||
if !exists {
|
if !exists {
|
||||||
return ""
|
return "" // 检测失败
|
||||||
}
|
}
|
||||||
if code, ok := linguaToISO639[lang]; ok {
|
if code, ok := linguaToISO639[lang]; ok {
|
||||||
return code
|
return code // 映射为 ISO 代码
|
||||||
}
|
}
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- sorting ----
|
// ---- 排序算法 ----
|
||||||
|
|
||||||
|
// sortKeywords 对关键词切片按权重降序排列(插入排序,适合小规模数据)。
|
||||||
func sortKeywords(kws []Keyword) {
|
func sortKeywords(kws []Keyword) {
|
||||||
for i := 1; i < len(kws); i++ {
|
for i := 1; i < len(kws); i++ {
|
||||||
key := kws[i]
|
key := kws[i]
|
||||||
@@ -240,6 +265,7 @@ func sortKeywords(kws []Keyword) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isASCIIAlnum 判断字符串是否全部由 ASCII 字母或数字组成。
|
||||||
func isASCIIAlnum(s string) bool {
|
func isASCIIAlnum(s string) bool {
|
||||||
for _, r := range s {
|
for _, r := range s {
|
||||||
if !((r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9')) {
|
if !((r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9')) {
|
||||||
|
|||||||
+68
-44
@@ -1,41 +1,43 @@
|
|||||||
// Package backlink computes backlink (prosperity) scores for all known domains,
|
// Package backlink computes backlink (prosperity) scores for all known domains,
|
||||||
// using a PageRank-like algorithm over the site-level link graph.
|
// using a PageRank-like algorithm over the site-level link graph.
|
||||||
|
// backlink 包实现 PageRank 类似的反向链接评分算法,在网站级链接图上迭代计算繁荣分数。
|
||||||
//
|
//
|
||||||
// It runs every 48 hours and writes savedata/prosper.json.
|
// 每 48 小时运行一次,将结果写入 savedata/prosper.json,供爬虫调度和搜索排序使用。
|
||||||
package backlink
|
package backlink
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json" // JSON 序列化(输出 prosper.json 和 cos map)
|
||||||
"log"
|
"log" // 日志
|
||||||
"math"
|
"math" // 数学运算(Log、开方、幂)
|
||||||
"math/rand"
|
"math/rand" // 随机数(对高频域名采样降权)
|
||||||
"os"
|
"os" // 文件写入
|
||||||
"path/filepath"
|
"path/filepath" // 路径拼接
|
||||||
"strings"
|
"strings" // 字符串操作
|
||||||
"time"
|
"time" // 时间计算(下次运行时间、睡眠)
|
||||||
|
|
||||||
"sese-engine/storage"
|
"sese-engine/storage" // 持久化存储
|
||||||
)
|
)
|
||||||
|
|
||||||
// Runner runs the backlink calculation loop.
|
// Runner 管理反向链接计算循环。
|
||||||
type Runner struct {
|
type Runner struct {
|
||||||
db *storage.DB
|
db *storage.DB
|
||||||
storagePath string
|
storagePath string // 存储根目录(用于写入 prosper.json)
|
||||||
}
|
}
|
||||||
|
|
||||||
// New creates a Runner.
|
// New 创建一个 Runner 实例。
|
||||||
func New(db *storage.DB, storagePath string) *Runner {
|
func New(db *storage.DB, storagePath string) *Runner {
|
||||||
return &Runner{db: db, storagePath: storagePath}
|
return &Runner{db: db, storagePath: storagePath}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run loops forever, recalculating every 48 hours.
|
// Run 无限循环,每 48 小时执行一次反向链接计算。
|
||||||
|
// 每次运行对齐到凌晨 2:00(便于在低峰期执行重计算)。
|
||||||
func (r *Runner) Run() {
|
func (r *Runner) Run() {
|
||||||
for {
|
for {
|
||||||
// Sleep until next scheduled run (aligned to 2am)
|
// 计算距离下次运行(凌晨 2:00)的睡眠时长
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
target := time.Date(now.Year(), now.Month(), now.Day(), 2, 0, 0, 0, now.Location())
|
target := time.Date(now.Year(), now.Month(), now.Day(), 2, 0, 0, 0, now.Location())
|
||||||
if !target.After(now) {
|
if !target.After(now) {
|
||||||
target = target.Add(48 * time.Hour)
|
target = target.Add(48 * time.Hour) // 已过凌晨 2 点,则等明天的 2 点
|
||||||
}
|
}
|
||||||
sleep := target.Sub(now)
|
sleep := target.Sub(now)
|
||||||
log.Printf("[backlink] next run at %v (in %v)", target.Format(time.RFC3339), sleep.Round(time.Minute))
|
log.Printf("[backlink] next run at %v (in %v)", target.Format(time.RFC3339), sleep.Round(time.Minute))
|
||||||
@@ -50,46 +52,50 @@ func (r *Runner) Run() {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// RunNow runs one computation cycle immediately (for testing / manual trigger).
|
// RunNow 立即执行一次计算(用于手动触发或测试)。
|
||||||
func (r *Runner) RunNow() error {
|
func (r *Runner) RunNow() error {
|
||||||
return r.compute()
|
return r.compute()
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- computation ----
|
// ---- 计算核心 ----
|
||||||
|
|
||||||
|
// siteStats 存放网站图的统计信息,用于多维度过滤和加权。
|
||||||
type siteStats struct {
|
type siteStats struct {
|
||||||
subdomainCount map[string]int // superDomain → count
|
subdomainCount map[string]int // 顶级域名 → 子域名数量(识别同一组织的多个子站)
|
||||||
templateCount map[string]int // htmlStructure → count
|
templateCount map[string]int // HTML 结构特征 → 出现次数(识别姊妹站点/镜像)
|
||||||
sameIPCount map[string]int // ipPrefix → count
|
sameIPCount map[string]int // IP 前缀 → 网站数量(识别同 IP 上的多个网站)
|
||||||
serverCount map[string]int // serverType → count
|
serverCount map[string]int // Server 类型组合 → 出现次数(识别同服务器部署的网站)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// compute 执行完整的反向链接计算流程。
|
||||||
|
// 包含:统计收集 → HTTPS/HTTP 分别迭代 → 合并 → 写入文件。
|
||||||
func (r *Runner) compute() error {
|
func (r *Runner) compute() error {
|
||||||
stats := r.collectStats()
|
stats := r.collectStats()
|
||||||
|
|
||||||
// Phase 1: HTTPS sites
|
// 阶段一:HTTPS 网站的 PageRank 迭代
|
||||||
d1 := r.aggregate(func(info *storage.SiteInfo) bool {
|
d1 := r.aggregate(func(info *storage.SiteInfo) bool {
|
||||||
return info.HTTPSAvailable != nil && *info.HTTPSAvailable
|
return info.HTTPSAvailable != nil && *info.HTTPSAvailable
|
||||||
}, stats, "https_backlink")
|
}, stats, "https_backlink")
|
||||||
|
|
||||||
// Phase 1a: second pass (echo) using d1 scores
|
// 阶段一增强(Echo):用 d1 结果加权再做一轮迭代,放大已有繁荣值的域名
|
||||||
d1a := r.aggregateWithScores(d1, stats, "echo")
|
d1a := r.aggregateWithScores(d1, stats, "echo")
|
||||||
|
|
||||||
// Phase 2: HTTP-only sites
|
// 阶段二:HTTP only 网站的迭代(独立计算,不混入 HTTPS 分数)
|
||||||
d2 := r.aggregate(func(info *storage.SiteInfo) bool {
|
d2 := r.aggregate(func(info *storage.SiteInfo) bool {
|
||||||
return info.HTTPSAvailable == nil || !*info.HTTPSAvailable
|
return info.HTTPSAvailable == nil || !*info.HTTPSAvailable
|
||||||
}, stats, "http_backlink")
|
}, stats, "http_backlink")
|
||||||
|
|
||||||
// Merge
|
// 三路合并:HTTPS 分数主导,Echo 辅助,HTTP 补充
|
||||||
merged := make(map[string]float64)
|
merged := make(map[string]float64)
|
||||||
for k := range union(d1, d2, d1a) {
|
for k := range union(d1, d2, d1a) {
|
||||||
|
// 混合公式:HTTPS × 1 + Echo × 1 + min(HTTPS×0.5 + HTTP×0.1, HTTP)
|
||||||
v := d1[k] + d1a[k] + math.Min(d1[k]*0.5+d2[k]*0.1, d2[k])
|
v := d1[k] + d1a[k] + math.Min(d1[k]*0.5+d2[k]*0.1, d2[k])
|
||||||
if v > 0.16 {
|
if v > 0.16 {
|
||||||
merged[k] = v
|
merged[k] = v
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save
|
// 写入文件
|
||||||
path := filepath.Join(r.storagePath, "prosper.json")
|
path := filepath.Join(r.storagePath, "prosper.json")
|
||||||
if err := writeJSON(path, merged); err != nil {
|
if err := writeJSON(path, merged); err != nil {
|
||||||
return err
|
return err
|
||||||
@@ -98,7 +104,8 @@ func (r *Runner) compute() error {
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// collectStats builds statistics about the site graph.
|
// collectStats 遍历所有网站元信息,统计子域名、HTML 模板、IP、Server 类型分布。
|
||||||
|
// 低于阈值(4)的统计项被剔除,以减少噪声影响。
|
||||||
func (r *Runner) collectStats() *siteStats {
|
func (r *Runner) collectStats() *siteStats {
|
||||||
stats := &siteStats{
|
stats := &siteStats{
|
||||||
subdomainCount: make(map[string]int),
|
subdomainCount: make(map[string]int),
|
||||||
@@ -125,7 +132,7 @@ func (r *Runner) collectStats() *siteStats {
|
|||||||
return nil
|
return nil
|
||||||
})
|
})
|
||||||
|
|
||||||
// Prune counts below threshold
|
// 剔除低频统计项
|
||||||
for k, v := range stats.subdomainCount {
|
for k, v := range stats.subdomainCount {
|
||||||
if v < 4 {
|
if v < 4 {
|
||||||
delete(stats.subdomainCount, k)
|
delete(stats.subdomainCount, k)
|
||||||
@@ -144,13 +151,14 @@ func (r *Runner) collectStats() *siteStats {
|
|||||||
return stats
|
return stats
|
||||||
}
|
}
|
||||||
|
|
||||||
// aggregate computes a backlink score map for sites matching the filter.
|
// aggregate 执行一轮 PageRank 风格的链接权重迭代。
|
||||||
|
// filter 筛选纳入计算的目标网站集合;desc 为日志标识。
|
||||||
func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats, desc string) map[string]float64 {
|
func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats, desc string) map[string]float64 {
|
||||||
log.Printf("[backlink] aggregating: %s", desc)
|
log.Printf("[backlink] aggregating: %s", desc)
|
||||||
d := make(map[string]float64)
|
d := make(map[string]float64)
|
||||||
ipSource := make(map[string]float64)
|
ipSource := make(map[string]float64)
|
||||||
|
|
||||||
// Build server type index (top 63 most common)
|
// 建立 Server 类型的 ID 映射表(最多 63 种,用于构建向量)
|
||||||
serverTable := buildServerTable(stats.serverCount)
|
serverTable := buildServerTable(stats.serverCount)
|
||||||
|
|
||||||
type vectorEntry struct {
|
type vectorEntry struct {
|
||||||
@@ -166,7 +174,7 @@ func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats
|
|||||||
if filter != nil && !filter(info) {
|
if filter != nil && !filter(info) {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
mul := computeMul(host, info, stats)
|
mul := computeMul(host, info, stats) // 计算域名综合乘数(时间衰减 + 子域名降权)
|
||||||
if mul == 0 {
|
if mul == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@@ -176,6 +184,7 @@ func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats
|
|||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 每条出站链接的初始权重:1/max(n, 50),出站越多每条分得越少
|
||||||
w := 1.0 / math.Max(float64(n), 50)
|
w := 1.0 / math.Max(float64(n), 50)
|
||||||
xd := make(map[string]float64)
|
xd := make(map[string]float64)
|
||||||
for _, link := range info.OutLinks {
|
for _, link := range info.OutLinks {
|
||||||
@@ -196,10 +205,11 @@ func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats
|
|||||||
serverID := serverTable[serverType]
|
serverID := serverTable[serverType]
|
||||||
|
|
||||||
for seg, segW := range xd {
|
for seg, segW := range xd {
|
||||||
fw := math.Min(segW, 0.15) * mul
|
fw := math.Min(segW, 0.15) * mul // 截断上限 0.15,防止单链接权重过高
|
||||||
prev := d[seg]
|
prev := d[seg]
|
||||||
d[seg] = prev + fw
|
d[seg] = prev + fw
|
||||||
|
|
||||||
|
// IP 来源去重:来自同一 IP 段的高权重链接在超过 0.4 后跳过,防止 IP 污染
|
||||||
if prev > 0.2 {
|
if prev > 0.2 {
|
||||||
if _, sameIP := stats.sameIPCount[ipStr]; ipStr != "" && sameIP {
|
if _, sameIP := stats.sameIPCount[ipStr]; ipStr != "" && sameIP {
|
||||||
key := seg + "-" + ipStr
|
key := seg + "-" + ipStr
|
||||||
@@ -210,6 +220,7 @@ func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 构建向量:域名 → Server 类型向量(用于余弦相似度过滤)
|
||||||
if prev > 0.21 && !strings.Contains(seg, "/") && serverType != "" {
|
if prev > 0.21 && !strings.Contains(seg, "/") && serverType != "" {
|
||||||
if vectors[seg] == nil {
|
if vectors[seg] == nil {
|
||||||
vectors[seg] = make([]float32, 64)
|
vectors[seg] = make([]float32, 64)
|
||||||
@@ -219,8 +230,8 @@ func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats
|
|||||||
}
|
}
|
||||||
|
|
||||||
i++
|
i++
|
||||||
|
// 每 20 万条遍历后清理低分条目,防止内存膨胀
|
||||||
if i%200000 == 0 {
|
if i%200000 == 0 {
|
||||||
// Prune low-score entries
|
|
||||||
for k, v := range d {
|
for k, v := range d {
|
||||||
if v < pruneThreshold {
|
if v < pruneThreshold {
|
||||||
delete(d, k)
|
delete(d, k)
|
||||||
@@ -238,10 +249,10 @@ func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats
|
|||||||
return nil
|
return nil
|
||||||
})
|
})
|
||||||
|
|
||||||
// Vectorised cosine filtering
|
// 向量余弦过滤:去除 Server 类型特征偏离核心向量的域名(可能是噪音/作弊)
|
||||||
d = vectorFilter(d, vectors, desc)
|
d = vectorFilter(d, vectors, desc)
|
||||||
|
|
||||||
// Prune
|
// 最终清理:分数 ≤ 0.16 的域名不写入(低于此阈值认为不繁荣)
|
||||||
for k, v := range d {
|
for k, v := range d {
|
||||||
if v <= 0.16 {
|
if v <= 0.16 {
|
||||||
delete(d, k)
|
delete(d, k)
|
||||||
@@ -252,7 +263,8 @@ func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats
|
|||||||
return d
|
return d
|
||||||
}
|
}
|
||||||
|
|
||||||
// aggregateWithScores does a second pass weighted by existing scores.
|
// aggregateWithScores 在已有繁荣分数的基础上加权再做一轮迭代(Echo 阶段)。
|
||||||
|
// 对已有分数的域名给予更高权重(乘以 log2(2+score)),使强者更强。
|
||||||
func (r *Runner) aggregateWithScores(scores map[string]float64, stats *siteStats, desc string) map[string]float64 {
|
func (r *Runner) aggregateWithScores(scores map[string]float64, stats *siteStats, desc string) map[string]float64 {
|
||||||
log.Printf("[backlink] aggregating with scores: %s", desc)
|
log.Printf("[backlink] aggregating with scores: %s", desc)
|
||||||
d := make(map[string]float64)
|
d := make(map[string]float64)
|
||||||
@@ -268,6 +280,7 @@ func (r *Runner) aggregateWithScores(scores map[string]float64, stats *siteStats
|
|||||||
if mul == 0 {
|
if mul == 0 {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
// 已有分数的域名获得加权乘数(上限 2×)
|
||||||
trueMul := math.Min(2, mul*math.Log2(2+score))
|
trueMul := math.Min(2, mul*math.Log2(2+score))
|
||||||
|
|
||||||
n := len(info.OutLinks)
|
n := len(info.OutLinks)
|
||||||
@@ -309,10 +322,12 @@ func (r *Runner) aggregateWithScores(scores map[string]float64, stats *siteStats
|
|||||||
return d
|
return d
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- vector cosine filtering ----
|
// ---- 向量余弦过滤 ----
|
||||||
|
|
||||||
|
// vectorFilter 使用余弦相似度过滤域名分数:保留与核心 Server 类型向量相似的域名。
|
||||||
|
// 与核心方向偏离的域名可能是噪音(如作弊农场、链接买卖)。
|
||||||
func vectorFilter(d map[string]float64, vectors map[string][]float32, desc string) map[string]float64 {
|
func vectorFilter(d map[string]float64, vectors map[string][]float32, desc string) map[string]float64 {
|
||||||
// Compute core vector (sum of all)
|
// 计算全网站的 Server 类型核心向量(所有向量求和)
|
||||||
core := make([]float64, 64)
|
core := make([]float64, 64)
|
||||||
for _, vec := range vectors {
|
for _, vec := range vectors {
|
||||||
for j, v := range vec {
|
for j, v := range vec {
|
||||||
@@ -334,10 +349,12 @@ func vectorFilter(d map[string]float64, vectors map[string][]float32, desc strin
|
|||||||
newD[k] = v
|
newD[k] = v
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
// 余弦相似度:范围 [-1, 1]
|
||||||
cos := dot32_64(vec, core) / (vecNorm * coreNorm)
|
cos := dot32_64(vec, core) / (vecNorm * coreNorm)
|
||||||
if cos > 1.01 {
|
if cos > 1.01 {
|
||||||
cos = 1.01
|
cos = 1.01
|
||||||
}
|
}
|
||||||
|
// cos × 0.75 + 0.25:确保最低也有 0.25 的权重,不完全剔除
|
||||||
newV := math.Max(v*(0.25+cos*0.75), 0.21)
|
newV := math.Max(v*(0.25+cos*0.75), 0.21)
|
||||||
newD[k] = newV
|
newD[k] = newV
|
||||||
} else {
|
} else {
|
||||||
@@ -345,7 +362,7 @@ func vectorFilter(d map[string]float64, vectors map[string][]float32, desc strin
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Save cos map for diagnostics
|
// 保存 cos map 用于诊断
|
||||||
cosMap := make(map[string]float64)
|
cosMap := make(map[string]float64)
|
||||||
for k, vec := range vectors {
|
for k, vec := range vectors {
|
||||||
vn := float64(norm32(vec))
|
vn := float64(norm32(vec))
|
||||||
@@ -358,8 +375,10 @@ func vectorFilter(d map[string]float64, vectors map[string][]float32, desc strin
|
|||||||
return newD
|
return newD
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- helpers ----
|
// ---- 辅助函数 ----
|
||||||
|
|
||||||
|
// computeMul 计算某网站在繁荣值计算中的综合乘数。
|
||||||
|
// 综合考虑:最后访问时间(超过 180 天排除)、子域名数量(越多平均分越低)。
|
||||||
func computeMul(host string, info *storage.SiteInfo, stats *siteStats) float64 {
|
func computeMul(host string, info *storage.SiteInfo, stats *siteStats) float64 {
|
||||||
if len(info.OutLinks) == 0 {
|
if len(info.OutLinks) == 0 {
|
||||||
return 0
|
return 0
|
||||||
@@ -370,7 +389,7 @@ func computeMul(host string, info *storage.SiteInfo, stats *siteStats) float64 {
|
|||||||
}
|
}
|
||||||
days := (time.Now().Unix() - t) / (3600 * 24)
|
days := (time.Now().Unix() - t) / (3600 * 24)
|
||||||
if days > 180 {
|
if days > 180 {
|
||||||
return 0
|
return 0 // 半年未更新,排除
|
||||||
}
|
}
|
||||||
timeMul := math.Pow(0.99, float64(days))
|
timeMul := math.Pow(0.99, float64(days))
|
||||||
|
|
||||||
@@ -381,6 +400,7 @@ func computeMul(host string, info *storage.SiteInfo, stats *siteStats) float64 {
|
|||||||
tplCount = max(stats.templateCount[info.HTMLStructure], 1)
|
tplCount = max(stats.templateCount[info.HTMLStructure], 1)
|
||||||
}
|
}
|
||||||
count := max(subCount, int(float64(tplCount)*1.5))
|
count := max(subCount, int(float64(tplCount)*1.5))
|
||||||
|
// 高频域名随机丢弃:保持最多 1000 个域名参与计算(减少重复镜像的投票)
|
||||||
if count > 1000 {
|
if count > 1000 {
|
||||||
if rand.Float64() > 1000.0/float64(count) {
|
if rand.Float64() > 1000.0/float64(count) {
|
||||||
return 0
|
return 0
|
||||||
@@ -391,6 +411,7 @@ func computeMul(host string, info *storage.SiteInfo, stats *siteStats) float64 {
|
|||||||
return timeMul * domainMul
|
return timeMul * domainMul
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// superDomain 提取顶级域名(去除子域名)。
|
||||||
func superDomain(host string) string {
|
func superDomain(host string) string {
|
||||||
parts := strings.Split(host, ".")
|
parts := strings.Split(host, ".")
|
||||||
if len(parts) >= 2 {
|
if len(parts) >= 2 {
|
||||||
@@ -399,6 +420,7 @@ func superDomain(host string) string {
|
|||||||
return host
|
return host
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ipPrefix 将 IP 列表去重排序后返回逗号拼接的 /24 前缀(用于识别同 C 段主机)。
|
||||||
func ipPrefix(ips []string) string {
|
func ipPrefix(ips []string) string {
|
||||||
if len(ips) == 0 {
|
if len(ips) == 0 {
|
||||||
return ""
|
return ""
|
||||||
@@ -408,7 +430,7 @@ func ipPrefix(ips []string) string {
|
|||||||
for i, ip := range sorted {
|
for i, ip := range sorted {
|
||||||
idx := strings.LastIndex(ip, ".")
|
idx := strings.LastIndex(ip, ".")
|
||||||
if idx > 0 {
|
if idx > 0 {
|
||||||
parts[i] = ip[:idx]
|
parts[i] = ip[:idx] // 取 /24 前缀
|
||||||
} else {
|
} else {
|
||||||
parts[i] = ip
|
parts[i] = ip
|
||||||
}
|
}
|
||||||
@@ -416,6 +438,7 @@ func ipPrefix(ips []string) string {
|
|||||||
return strings.Join(parts, ",")
|
return strings.Join(parts, ",")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// decomposeURL 将 URL 分解为递增路径段(同 info 包)。
|
||||||
func decomposeURL(rawURL string) []string {
|
func decomposeURL(rawURL string) []string {
|
||||||
u := strings.ToLower(rawURL)
|
u := strings.ToLower(rawURL)
|
||||||
if strings.HasPrefix(u, "https://") {
|
if strings.HasPrefix(u, "https://") {
|
||||||
@@ -442,6 +465,7 @@ func decomposeURL(rawURL string) []string {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// buildServerTable 将 Server 类型按频率降序排列,取前 63 种分配 ID(0 不用)。
|
||||||
func buildServerTable(serverCount map[string]int) map[string]int {
|
func buildServerTable(serverCount map[string]int) map[string]int {
|
||||||
type kv struct {
|
type kv struct {
|
||||||
k string
|
k string
|
||||||
@@ -449,7 +473,7 @@ func buildServerTable(serverCount map[string]int) map[string]int {
|
|||||||
}
|
}
|
||||||
var sorted []kv
|
var sorted []kv
|
||||||
for k, v := range serverCount {
|
for k, v := range serverCount {
|
||||||
sorted = append(sorted, kv{k, v})
|
sorted = append(sorted, kv{k: k, v: v})
|
||||||
}
|
}
|
||||||
for i := 0; i < len(sorted)-1; i++ {
|
for i := 0; i < len(sorted)-1; i++ {
|
||||||
for j := i + 1; j < len(sorted); j++ {
|
for j := i + 1; j < len(sorted); j++ {
|
||||||
|
|||||||
+33
-25
@@ -1,53 +1,61 @@
|
|||||||
// Package config holds all global configuration parameters for sese-engine.
|
// Package config holds all global configuration parameters for sese-engine.
|
||||||
|
// config 包存放 sese-engine 的所有全局配置参数。
|
||||||
package config
|
package config
|
||||||
|
|
||||||
// Index / storage limits
|
// Index / storage limits
|
||||||
|
// 索引 / 存储相关限制常量
|
||||||
const (
|
const (
|
||||||
MaxURLsPerKey = 11000 // max URLs stored per index key
|
MaxURLsPerKey = 11000 // 每个索引词最多保存的 URL 数量上限
|
||||||
MaxSameDomainPerKey = 20 // max URLs from the same domain per key
|
MaxSameDomainPerKey = 20 // 同一域名在每个索引词下最多出现的次数
|
||||||
BigCleanThreshold = 10000000 // flush in-memory index after this many rows
|
BigCleanThreshold = 10000000 // 内存中累计多少条索引后触发一次刷盘清理
|
||||||
MaxNewURLsPerKey = 10000 // cap on new URLs added per key per flush
|
MaxNewURLsPerKey = 10000 // 每次刷盘时,每个索引词最多写入的新 URL 数量上限
|
||||||
MinURLsForNewKey = 3 // discard new keys with fewer than this many URLs
|
MinURLsForNewKey = 3 // 新索引词如果 URL 数少于该值则丢弃,不写入磁盘
|
||||||
)
|
)
|
||||||
|
|
||||||
// Crawler settings
|
// Crawler settings
|
||||||
|
// 爬虫行为相关配置
|
||||||
const (
|
const (
|
||||||
SpiderName = "loli_spider"
|
SpiderName = "loli_spider" // HTTP 请求的 User-Agent 标识
|
||||||
CrawlerCooldown = 3 // seconds between requests to the same host
|
CrawlerCooldown = 3 // 同一主机相邻两次请求的最小间隔(秒),用于遵守 robots.txt 和避免被封
|
||||||
CrawlerWorkers = 22 // goroutine pool size for crawling
|
CrawlerWorkers = 22 // 爬虫并发 goroutine 数量
|
||||||
CrawlFocus = 0.7 // concentration factor — higher = more focused on single domain
|
CrawlFocus = 0.7 // 域名集中度因子,越大越倾向在少量域名内深挖,越小越分散
|
||||||
MaxKeywordsPerPage = 250
|
MaxKeywordsPerPage = 250 // 单个页面最多提取的关键词数量
|
||||||
MaxEpoch = 100
|
MaxEpoch = 100 // BFS 爬取的最大轮次上限
|
||||||
ExpectedProsperRatio = 0.6 // fraction of queue that should be "prosperous" (high backlink) domains
|
ExpectedProsperRatio = 0.6 // 队列中预期"繁荣"域名(高反向链接)的占比,用于调度决策
|
||||||
EntryURL = "https://zh.wikipedia.org/"
|
EntryURL = "https://zh.wikipedia.org/" // BFS 爬取的起始入口 URL
|
||||||
)
|
)
|
||||||
|
|
||||||
// Search / ranking weights
|
// Search / ranking weights
|
||||||
|
// 搜索结果排序权重配置
|
||||||
const (
|
const (
|
||||||
UseOnlineSnippet = true
|
UseOnlineSnippet = true // 是否在线抓取摘要(搜索时实时抓取页面补充摘要)
|
||||||
OnlineSnippetTimeout = 3 // seconds
|
OnlineSnippetTimeout = 3 // 在线抓取摘要的超时时间(秒)
|
||||||
WeightDailyDecay = 0.996
|
WeightDailyDecay = 0.996 // 页面年龄的时间衰减因子(每天乘以此系数)
|
||||||
LanguageWeight = 0.5
|
LanguageWeight = 0.5 // 语种匹配权重:与查询语种一致时加分
|
||||||
ConsecutiveKeyWeight = 1.3
|
ConsecutiveKeyWeight = 1.3 // 连续关键词命中权重:多词连续出现时加分
|
||||||
BacklinkWeight = 1.0
|
BacklinkWeight = 1.0 // 反向链接权重:指向该 URL 的链接越多得分越高
|
||||||
SearchServerPort = 80
|
SearchServerPort = 80 // 搜索服务的 HTTP 监听端口
|
||||||
)
|
)
|
||||||
|
|
||||||
// Backlink computation
|
// Backlink computation
|
||||||
|
// 反向链接(PageRank 类)计算相关常量
|
||||||
const (
|
const (
|
||||||
BacklinkBaseline = 200000 // normalization divisor for backlink scores
|
BacklinkBaseline = 200000 // 反向链接得分归一化的除数(用于将原始链接数映射到 [0,1] 区间)
|
||||||
)
|
)
|
||||||
|
|
||||||
// Storage path (relative to process working directory)
|
// Storage path (relative to process working directory)
|
||||||
|
// 存储根目录路径,相对于进程启动时的工作目录
|
||||||
const StoragePath = "./savedata"
|
const StoragePath = "./savedata"
|
||||||
|
|
||||||
// Prometheus ports
|
// Prometheus ports
|
||||||
|
// 各模块 Prometheus 监控指标的 HTTP 端口
|
||||||
const (
|
const (
|
||||||
PromPortCrawler = 14950
|
PromPortCrawler = 14950 // 爬虫模块的 metrics 端口
|
||||||
PromPortHarvester = 14951
|
PromPortHarvester = 14951 // 收获服务器模块的 metrics 端口
|
||||||
PromPortBacklink = 14952
|
PromPortBacklink = 14952 // 反向链接计算模块的 metrics 端口
|
||||||
PromPortSearch = 14953
|
PromPortSearch = 14953 // 搜索服务模块的 metrics 端口
|
||||||
)
|
)
|
||||||
|
|
||||||
// Harvester HTTP endpoint
|
// Harvester HTTP endpoint
|
||||||
|
// 爬虫向收获服务器发送索引数据的 HTTP 端点地址
|
||||||
const HarvesterAddr = "http://127.0.0.1:5000"
|
const HarvesterAddr = "http://127.0.0.1:5000"
|
||||||
|
|||||||
+124
-69
@@ -1,43 +1,44 @@
|
|||||||
// crawler.go — BFS crawl loop, URL scheduling, and site-info updating.
|
// crawler.go — BFS crawl loop, URL scheduling, and site-info updating.
|
||||||
|
// crawler 包的主逻辑:BFS 爬取循环、URL 调度算法、网站元信息更新。
|
||||||
package crawler
|
package crawler
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"bytes"
|
"bytes" // 字节缓冲(构造 HTTP POST 请求体)
|
||||||
"encoding/json"
|
"encoding/json" // JSON 序列化(发送关键词数据到 harvester)
|
||||||
"log"
|
"log" // 日志输出
|
||||||
"math"
|
"math" // 数学运算(指数衰减、质量评分)
|
||||||
"math/rand"
|
"math/rand" // 随机数(加权采样、队列打乱)
|
||||||
"net/http"
|
"net/http" // HTTP 客户端(POST 数据到 harvester)
|
||||||
"net/url"
|
"net/url" // URL 解析
|
||||||
"strings"
|
"strings" // 字符串操作
|
||||||
"sync"
|
"sync" // 互斥锁(保护并发收集结果)
|
||||||
"sync/atomic"
|
"sync/atomic" // 原子操作(计数器,无锁并发更新)
|
||||||
"time"
|
"time" // 时间戳
|
||||||
|
|
||||||
"sese-engine/analyzer"
|
"sese-engine/analyzer" // 文本分析和关键词提取
|
||||||
"sese-engine/config"
|
"sese-engine/config" // 全局配置常量
|
||||||
"sese-engine/parser"
|
"sese-engine/parser" // HTML 解析(提取标题、正文、链接)
|
||||||
"sese-engine/storage"
|
"sese-engine/storage" // 持久化存储
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// Stats 存放爬虫实时统计计数器(使用 atomic 原子读取)。
|
||||||
// Stats holds real-time crawl counters (read with atomic).
|
|
||||||
type Stats struct {
|
type Stats struct {
|
||||||
VisitedURLs int64
|
VisitedURLs int64 // 已访问的 URL 总数(含失败)
|
||||||
SuccessURLs int64
|
SuccessURLs int64 // 成功抓取(HTTP 200)的 URL 数
|
||||||
KeywordsFetched int64
|
KeywordsFetched int64 // 累计提取的关键词总数
|
||||||
}
|
}
|
||||||
|
|
||||||
// Crawler orchestrates the BFS crawl.
|
// Crawler 编排整个 BFS 爬取流程。
|
||||||
type Crawler struct {
|
type Crawler struct {
|
||||||
fetcher *Fetcher
|
fetcher *Fetcher // HTTP 抓取器(含 robots.txt 和限流)
|
||||||
db *storage.DB
|
db *storage.DB // 持久化数据库
|
||||||
analyzer *analyzer.Analyzer
|
analyzer *analyzer.Analyzer // 分词和关键词分析
|
||||||
prosperMap map[string]float64 // domain → backlink score (loaded from info)
|
prosperMap map[string]float64 // 域名 → 反向链接繁荣值(来自 info 模块,越大越"有价值")
|
||||||
stats Stats
|
stats Stats // 原子计数器
|
||||||
}
|
}
|
||||||
|
|
||||||
// New creates a Crawler.
|
// New 创建一个 Crawler 实例。
|
||||||
|
// prosperMap 由 info 模块加载,传入域名繁荣值用于调度优先级计算。
|
||||||
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||||
return &Crawler{
|
return &Crawler{
|
||||||
fetcher: NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second),
|
fetcher: NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second),
|
||||||
@@ -47,40 +48,46 @@ func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *C
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// URLWeight pairs a URL with its discovery weight.
|
// URLWeight 将 URL 和发现权重打包在一起,用于调度决策。
|
||||||
type URLWeight struct {
|
type URLWeight struct {
|
||||||
URL string
|
URL string // 待访问的 URL
|
||||||
Weight float64
|
Weight float64 // 发现权重(从父页面分得的"关注度",页面链接越多则每个分得越少)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Run starts the BFS crawl from entryURL, running for maxEpoch rounds.
|
// Run 启动 BFS 爬取,从 entryURL 开始,执行最多 maxEpoch 轮。
|
||||||
// It blocks until completion.
|
// 各轮之间是串行的,每轮内并发抓取,按调度算法选择下一轮 URL。
|
||||||
func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
||||||
visited := make(map[string]bool)
|
visited := make(map[string]bool) // 已访问 URL 集合(防止重复抓取)
|
||||||
queue := []string{entryURL}
|
queue := []string{entryURL} // 当前轮次的待抓取队列
|
||||||
|
|
||||||
for ep := 0; ep < maxEpoch; ep++ {
|
for ep := 0; ep < maxEpoch; ep++ {
|
||||||
log.Printf("[crawler] epoch %d/%d queue=%d", ep+1, maxEpoch, len(queue))
|
log.Printf("[crawler] epoch %d/%d queue=%d", ep+1, maxEpoch, len(queue))
|
||||||
|
// 将本轮所有 URL 标记为已访问(防止下一轮重复入队)
|
||||||
for _, u := range queue {
|
for _, u := range queue {
|
||||||
visited[u] = true
|
visited[u] = true
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 并发抓取本轮所有 URL
|
||||||
var (
|
var (
|
||||||
newLinks []URLWeight
|
newLinks []URLWeight // 收集下一轮候选 URL
|
||||||
mu sync.Mutex
|
mu sync.Mutex // 保护 newLinks 的并发写入
|
||||||
wg sync.WaitGroup
|
wg sync.WaitGroup
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// 信号量:限制同时并发数不超过配置的工作线程数
|
||||||
sem := make(chan struct{}, config.CrawlerWorkers)
|
sem := make(chan struct{}, config.CrawlerWorkers)
|
||||||
for _, u := range queue {
|
for _, u := range queue {
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
sem <- struct{}{}
|
sem <- struct{}{} // 获取一个令牌(阻塞直到有空闲槽位)
|
||||||
go func(rawURL string) {
|
go func(rawURL string) {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
defer func() { <-sem }()
|
defer func() { <-sem }() // 释放令牌
|
||||||
|
|
||||||
|
// 抓取单个 URL,返回发现的子链接
|
||||||
hrefs := c.visitURL(rawURL)
|
hrefs := c.visitURL(rawURL)
|
||||||
n := len(hrefs)
|
n := len(hrefs)
|
||||||
if n > 0 {
|
if n > 0 {
|
||||||
|
// 每个子链接分得 1/n 的父页面权重
|
||||||
w := 1.0 / float64(n)
|
w := 1.0 / float64(n)
|
||||||
mu.Lock()
|
mu.Lock()
|
||||||
for _, h := range hrefs {
|
for _, h := range hrefs {
|
||||||
@@ -94,30 +101,34 @@ func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
|||||||
}
|
}
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
|
|
||||||
|
// 本轮没有发现新链接,爬取结束
|
||||||
if len(newLinks) == 0 {
|
if len(newLinks) == 0 {
|
||||||
log.Println("[crawler] empty queue — stopping")
|
log.Println("[crawler] empty queue — stopping")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 调度算法:从候选 URL 中选出下一轮要抓取的队列
|
||||||
queue = c.schedule(newLinks)
|
queue = c.schedule(newLinks)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// visitURL fetches a URL, stores keywords, updates site info, returns discovered hrefs.
|
// visitURL 抓取一个 URL,提取关键词、缓存摘要、更新网站元信息,返回页面中发现的子链接。
|
||||||
func (c *Crawler) visitURL(rawURL string) []string {
|
func (c *Crawler) visitURL(rawURL string) []string {
|
||||||
atomic.AddInt64(&c.stats.VisitedURLs, 1)
|
atomic.AddInt64(&c.stats.VisitedURLs, 1) // 计数器 +1
|
||||||
|
|
||||||
|
// 礼貌模式抓取(遵守 robots.txt + 限流),超时 10 秒,不限制大小
|
||||||
res, err := c.fetcher.fetchWithHistory(rawURL, true, 10*time.Second, 0)
|
res, err := c.fetcher.fetchWithHistory(rawURL, true, 10*time.Second, 0)
|
||||||
if err != nil || res == nil {
|
if err != nil || res == nil {
|
||||||
c.updateSiteFailure(rawURL)
|
c.updateSiteFailure(rawURL) // 记录失败,更新该网站成功率
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
atomic.AddInt64(&c.stats.SuccessURLs, 1)
|
atomic.AddInt64(&c.stats.SuccessURLs, 1) // 成功计数器 +1
|
||||||
|
|
||||||
|
// 解析 HTML:提取标题、描述、正文和所有超链接
|
||||||
title, desc, text, hrefs := parser.ParseHTML(res.Body, res.FinalURL)
|
title, desc, text, hrefs := parser.ParseHTML(res.Body, res.FinalURL)
|
||||||
|
|
||||||
// Cache snippet
|
// 缓存 URL 摘要(仅对短 URL 缓存,防止超长 URL 浪费空间)
|
||||||
if len(res.FinalURL) < 250 {
|
if len(res.FinalURL) < 250 {
|
||||||
_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
|
_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
|
||||||
Title: title,
|
Title: title,
|
||||||
@@ -127,21 +138,23 @@ func (c *Crawler) visitURL(rawURL string) []string {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// Keyword extraction → send to harvester
|
// 关键词提取:将标题/描述/正文交给 analyzer 计算关键词权重
|
||||||
kws := c.analyzer.Analyze(title, desc, text)
|
kws := c.analyzer.Analyze(title, desc, text)
|
||||||
if len(kws) > 0 {
|
if len(kws) > 0 {
|
||||||
|
// 限制每个页面最多发送的关键词数量
|
||||||
if len(kws) > config.MaxKeywordsPerPage {
|
if len(kws) > config.MaxKeywordsPerPage {
|
||||||
kws = kws[:config.MaxKeywordsPerPage]
|
kws = kws[:config.MaxKeywordsPerPage]
|
||||||
}
|
}
|
||||||
atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
|
atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
|
||||||
|
// 异步发送到收获服务器写入倒排索引(不阻塞爬取流程)
|
||||||
go c.sendToHarvester(res.FinalURL, kws)
|
go c.sendToHarvester(res.FinalURL, kws)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update site info
|
// 更新网站元信息(成功访问)
|
||||||
host := netloc(res.FinalURL)
|
host := netloc(res.FinalURL)
|
||||||
c.updateSiteSuccess(host, res, title, desc, text, hrefs)
|
c.updateSiteSuccess(host, res, title, desc, text, hrefs)
|
||||||
|
|
||||||
// Handle permanent redirects in site info
|
// 处理永久重定向:更新源主机名下的重定向映射
|
||||||
for from, to := range res.Redirects {
|
for from, to := range res.Redirects {
|
||||||
fromHost := netloc(from)
|
fromHost := netloc(from)
|
||||||
if fromHost == "" {
|
if fromHost == "" {
|
||||||
@@ -152,20 +165,21 @@ func (c *Crawler) visitURL(rawURL string) []string {
|
|||||||
info.Redirects = make(map[string]string)
|
info.Redirects = make(map[string]string)
|
||||||
}
|
}
|
||||||
info.Redirects[from] = to
|
info.Redirects[from] = to
|
||||||
|
// 重定向映射过多时裁剪到 40 条
|
||||||
if len(info.Redirects) > 50 {
|
if len(info.Redirects) > 50 {
|
||||||
// keep most important (just truncate randomly for now)
|
|
||||||
info.Redirects = truncateMap(info.Redirects, 40)
|
info.Redirects = truncateMap(info.Redirects, 40)
|
||||||
}
|
}
|
||||||
_ = c.db.SetSiteInfo(fromHost, info)
|
_ = c.db.SetSiteInfo(fromHost, info)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Trim hrefs
|
// 限制返回的链接数,防止下一轮队列爆炸
|
||||||
if len(hrefs) > 100 {
|
if len(hrefs) > 100 {
|
||||||
hrefs = sampleStrings(hrefs, 100)
|
hrefs = sampleStrings(hrefs, 100)
|
||||||
}
|
}
|
||||||
return hrefs
|
return hrefs
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// updateSiteFailure 当某 URL 抓取失败时,更新该网站的访问成功率(指数衰减)。
|
||||||
func (c *Crawler) updateSiteFailure(rawURL string) {
|
func (c *Crawler) updateSiteFailure(rawURL string) {
|
||||||
host := netloc(rawURL)
|
host := netloc(rawURL)
|
||||||
if host == "" {
|
if host == "" {
|
||||||
@@ -176,27 +190,33 @@ func (c *Crawler) updateSiteFailure(rawURL string) {
|
|||||||
zero := 0.0
|
zero := 0.0
|
||||||
info.SuccessRate = &zero
|
info.SuccessRate = &zero
|
||||||
}
|
}
|
||||||
|
// 成功率每次失败乘以 0.99(无限趋近 0)
|
||||||
*info.SuccessRate *= 0.99
|
*info.SuccessRate *= 0.99
|
||||||
_ = c.db.SetSiteInfo(host, info)
|
_ = c.db.SetSiteInfo(host, info)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// updateSiteSuccess 当某 URL 抓取成功时,更新网站的完整元信息。
|
||||||
func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc, text string, hrefs []string) {
|
func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc, text string, hrefs []string) {
|
||||||
info, _ := c.db.GetSiteInfo(host)
|
info, _ := c.db.GetSiteInfo(host)
|
||||||
|
|
||||||
|
// 访问计数 +1,更新最后访问时间
|
||||||
info.VisitCount++
|
info.VisitCount++
|
||||||
info.LastVisitTime = time.Now().Unix()
|
info.LastVisitTime = time.Now().Unix()
|
||||||
|
|
||||||
|
// 成功率更新:EWM(指数加权移动)平滑,每次 +0.01
|
||||||
one := 1.0
|
one := 1.0
|
||||||
if info.SuccessRate == nil {
|
if info.SuccessRate == nil {
|
||||||
info.SuccessRate = &one
|
info.SuccessRate = &one
|
||||||
}
|
}
|
||||||
*info.SuccessRate = *info.SuccessRate*0.99 + 0.01
|
*info.SuccessRate = *info.SuccessRate*0.99 + 0.01
|
||||||
|
|
||||||
|
// 记录是否支持 HTTPS
|
||||||
if strings.HasPrefix(res.FinalURL, "https://") {
|
if strings.HasPrefix(res.FinalURL, "https://") {
|
||||||
t := true
|
t := true
|
||||||
info.HTTPSAvailable = &t
|
info.HTTPSAvailable = &t
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 记录 HTTP Server 类型(去重,保留最近 5 个)
|
||||||
if res.ServerType != "" {
|
if res.ServerType != "" {
|
||||||
found := false
|
found := false
|
||||||
for _, s := range info.ServerTypes {
|
for _, s := range info.ServerTypes {
|
||||||
@@ -213,20 +233,22 @@ func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Language detection — sample 10% or first 10 visits
|
// 语言检测和出站链接收集(仅在前 10 次访问或 10% 概率下触发,减少开销)
|
||||||
if info.VisitCount < 10 || rand.Float64() < 0.1 {
|
if info.VisitCount < 10 || rand.Float64() < 0.1 {
|
||||||
lang := c.analyzer.DetectLanguage(title + " " + desc + " " + text)
|
lang := c.analyzer.DetectLanguage(title + " " + desc + " " + text)
|
||||||
if lang != "" {
|
if lang != "" {
|
||||||
if info.Languages == nil {
|
if info.Languages == nil {
|
||||||
info.Languages = make(map[string]float64)
|
info.Languages = make(map[string]float64)
|
||||||
}
|
}
|
||||||
|
// 首次访问强度高,随访问次数增加强度衰减
|
||||||
intensity := math.Min(0.2, 1/math.Sqrt(float64(info.VisitCount+1)))
|
intensity := math.Min(0.2, 1/math.Sqrt(float64(info.VisitCount+1)))
|
||||||
for k := range info.Languages {
|
for k := range info.Languages {
|
||||||
info.Languages[k] *= (1 - intensity)
|
info.Languages[k] *= (1 - intensity) // 旧语种按 intensity 衰减
|
||||||
}
|
}
|
||||||
info.Languages[lang] += intensity
|
info.Languages[lang] += intensity // 新语种增加
|
||||||
}
|
}
|
||||||
// Collect external links
|
|
||||||
|
// 收集外链(跨顶级域名的链接)
|
||||||
superHost := superNetloc(res.FinalURL)
|
superHost := superNetloc(res.FinalURL)
|
||||||
var external []string
|
var external []string
|
||||||
for _, h := range hrefs {
|
for _, h := range hrefs {
|
||||||
@@ -234,8 +256,10 @@ func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc,
|
|||||||
external = append(external, h)
|
external = append(external, h)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// 最多保留 10 条外链
|
||||||
sampled := sampleStrings(external, 10)
|
sampled := sampleStrings(external, 10)
|
||||||
info.OutLinks = append(info.OutLinks, sampled...)
|
info.OutLinks = append(info.OutLinks, sampled...)
|
||||||
|
// 外链超过 250 条时采样到 200 条
|
||||||
if len(info.OutLinks) > 250 {
|
if len(info.OutLinks) > 250 {
|
||||||
info.OutLinks = sampleStrings(info.OutLinks, 200)
|
info.OutLinks = sampleStrings(info.OutLinks, 200)
|
||||||
}
|
}
|
||||||
@@ -244,7 +268,7 @@ func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc,
|
|||||||
_ = c.db.SetSiteInfo(host, info)
|
_ = c.db.SetSiteInfo(host, info)
|
||||||
}
|
}
|
||||||
|
|
||||||
// sendToHarvester POSTs keyword data to the harvester service.
|
// sendToHarvester 将关键词索引数据通过 HTTP POST 发送到收获服务器(:5000/l 端点)。
|
||||||
func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
|
func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
|
||||||
type payload struct {
|
type payload struct {
|
||||||
URL string `json:"url"`
|
URL string `json:"url"`
|
||||||
@@ -263,13 +287,15 @@ func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
|
|||||||
resp.Body.Close()
|
resp.Body.Close()
|
||||||
}
|
}
|
||||||
|
|
||||||
// schedule selects and prioritises the next BFS queue from raw discovered links.
|
// schedule 从候选 URL 集合中选出下一轮 BFS 队列。
|
||||||
|
// 包含:域名集中度过滤、HTTP/HTTPS 比例控制、繁荣 URL 占比控制、加权随机采样。
|
||||||
func (c *Crawler) schedule(links []URLWeight) []string {
|
func (c *Crawler) schedule(links []URLWeight) []string {
|
||||||
|
// 候选过多时先随机采样到 10 万条,防止内存爆炸
|
||||||
if len(links) > 100000 {
|
if len(links) > 100000 {
|
||||||
links = sampleURLWeights(links, 100000)
|
links = sampleURLWeights(links, 100000)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Pre-fetch site info for all involved domains
|
// 预加载所有涉及的网站信息(加速后续评分计算)
|
||||||
domains := make(map[string]bool)
|
domains := make(map[string]bool)
|
||||||
for _, lw := range links {
|
for _, lw := range links {
|
||||||
if h := netloc(lw.URL); h != "" {
|
if h := netloc(lw.URL); h != "" {
|
||||||
@@ -294,20 +320,20 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
|||||||
}
|
}
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
|
|
||||||
// Score each URL
|
// 对所有候选 URL 逐一计算调度优先级分数
|
||||||
scored_list := make([]scoredURL, len(links))
|
scored_list := make([]scoredURL, len(links))
|
||||||
for i, lw := range links {
|
for i, lw := range links {
|
||||||
scored_list[i] = scoredURL{url: lw.URL, score: c.scoreURL(lw, siteCache)}
|
scored_list[i] = scoredURL{url: lw.URL, score: c.scoreURL(lw, siteCache)}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Weighted random sample (45000 or 1/3+250 whichever smaller)
|
// 加权随机采样:从高分到低分按权重概率抽取最多 k 条
|
||||||
k := min(45000, len(scored_list)/3+250)
|
k := min(45000, len(scored_list)/3+250)
|
||||||
selected := weightedSample(scored_list, k)
|
selected := weightedSample(scored_list, k)
|
||||||
|
|
||||||
// Domain concentration filtering
|
// 域名集中度过滤:限制每个域名被选中的数量,防止被少数网站垄断
|
||||||
selected = concentrationFilter(selected, config.CrawlFocus)
|
selected = concentrationFilter(selected, config.CrawlFocus)
|
||||||
|
|
||||||
// Separate https/http, cap http at 1/4 of https count
|
// 分离 HTTPS 和 HTTP 链接,HTTP 最多占 HTTPS 的 1/4
|
||||||
var httpsURLs, httpURLs []string
|
var httpsURLs, httpURLs []string
|
||||||
for _, s := range selected {
|
for _, s := range selected {
|
||||||
if strings.HasPrefix(s, "https://") {
|
if strings.HasPrefix(s, "https://") {
|
||||||
@@ -321,7 +347,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
|||||||
httpURLs = sampleStrings(httpURLs, maxHTTP)
|
httpURLs = sampleStrings(httpURLs, maxHTTP)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Separate prosperous / non-prosperous
|
// 分离繁荣(高反向链接)域名和普通域名,按比例控制繁荣 URL 占比
|
||||||
var prosperURLs, otherURLs []string
|
var prosperURLs, otherURLs []string
|
||||||
for _, u := range append(httpsURLs, httpURLs...) {
|
for _, u := range append(httpsURLs, httpURLs...) {
|
||||||
if c.prosperMap[netloc(u)] > 0 {
|
if c.prosperMap[netloc(u)] > 0 {
|
||||||
@@ -330,6 +356,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
|||||||
otherURLs = append(otherURLs, u)
|
otherURLs = append(otherURLs, u)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// 根据目标繁荣占比计算普通 URL 应保留数量
|
||||||
n := int(float64(len(prosperURLs)) * (1-config.ExpectedProsperRatio) / config.ExpectedProsperRatio)
|
n := int(float64(len(prosperURLs)) * (1-config.ExpectedProsperRatio) / config.ExpectedProsperRatio)
|
||||||
if len(otherURLs) > n {
|
if len(otherURLs) > n {
|
||||||
keep := max(len(otherURLs)-len(selected)/10, n)
|
keep := max(len(otherURLs)-len(selected)/10, n)
|
||||||
@@ -338,12 +365,14 @@ func (c *Crawler) schedule(links []URLWeight) []string {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 合并并随机打乱(使繁荣 URL 和普通 URL 混合)
|
||||||
result := append(prosperURLs, otherURLs...)
|
result := append(prosperURLs, otherURLs...)
|
||||||
rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] })
|
rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] })
|
||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
// scoreURL computes the scheduling priority for a URL.
|
// scoreURL 计算单个 URL 的调度优先级分数。
|
||||||
|
// 综合考虑:中文语种权重、域名访问历史衰减、网站质量评分、繁荣值、URL 本身质量。
|
||||||
func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo) float64 {
|
func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo) float64 {
|
||||||
host := netloc(lw.URL)
|
host := netloc(lw.URL)
|
||||||
super := superNetloc(lw.URL)
|
super := superNetloc(lw.URL)
|
||||||
@@ -353,7 +382,7 @@ func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo)
|
|||||||
info = &storage.SiteInfo{}
|
info = &storage.SiteInfo{}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Chinese-ness
|
// 中文倾向性:该网站中文内容占比
|
||||||
var chineseness float64 = 0.5
|
var chineseness float64 = 0.5
|
||||||
if len(info.Languages) > 0 {
|
if len(info.Languages) > 0 {
|
||||||
total := 0.0
|
total := 0.0
|
||||||
@@ -365,12 +394,13 @@ func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Interest decay based on visit count
|
// 兴趣衰减:基于访问次数的指数衰减,繁荣域名可访问更多次
|
||||||
prosper := math.Min(62, c.prosperMap[host])
|
prosper := math.Min(62, c.prosperMap[host])
|
||||||
limit := prosper*500 + 50
|
limit := prosper*500 + 50
|
||||||
b := math.Pow(0.1, 1/limit)
|
b := math.Pow(0.1, 1/limit)
|
||||||
interest := math.Pow(b, float64(info.VisitCount))
|
interest := math.Pow(b, float64(info.VisitCount))
|
||||||
|
|
||||||
|
// 同理对顶级域名计算衰减(二级域名不够用时看顶级域名)
|
||||||
var interest2 float64 = 1.0
|
var interest2 float64 = 1.0
|
||||||
if super != host {
|
if super != host {
|
||||||
superInfo := siteCache[super]
|
superInfo := siteCache[super]
|
||||||
@@ -381,23 +411,28 @@ func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo)
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 网站质量评分
|
||||||
quality := 1.0
|
quality := 1.0
|
||||||
if info.Quality != nil {
|
if info.Quality != nil {
|
||||||
quality = *info.Quality
|
quality = *info.Quality
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 繁荣值加分(log 变换平滑)
|
||||||
prosperity := prosper
|
prosperity := prosper
|
||||||
if prosperity > 0 {
|
if prosperity > 0 {
|
||||||
prosperity += 0.5
|
prosperity += 0.5
|
||||||
}
|
}
|
||||||
prosperity = math.Log2(2+prosperity) + 1
|
prosperity = math.Log2(2+prosperity) + 1
|
||||||
|
|
||||||
|
// URL 本身的质量惩罚(过长、路径过深、使用 .php/.htm 等)
|
||||||
bad := badURL(lw.URL)
|
bad := badURL(lw.URL)
|
||||||
return (0.1 + chineseness) * math.Min(0.05+interest, 0.05+interest2) * quality * (1 - bad) * lw.Weight * prosperity
|
return (0.1 + chineseness) * math.Min(0.05+interest, 0.05+interest2) * quality * (1 - bad) * lw.Weight * prosperity
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- helper functions ----
|
// ---- 辅助函数 ----
|
||||||
|
|
||||||
|
// netloc 从原始 URL 字符串提取主机名(不含路径)。
|
||||||
|
// 支持 http:// 和 https:// 前缀,自动处理 URL 解析异常。
|
||||||
func netloc(rawURL string) string {
|
func netloc(rawURL string) string {
|
||||||
parts := strings.SplitN(rawURL, "/", 4)
|
parts := strings.SplitN(rawURL, "/", 4)
|
||||||
if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" {
|
if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" {
|
||||||
@@ -410,7 +445,8 @@ func netloc(rawURL string) string {
|
|||||||
return u.Host
|
return u.Host
|
||||||
}
|
}
|
||||||
|
|
||||||
// superNetloc returns "domain.tld" (strips subdomains).
|
// superNetloc 返回顶级域名(去除子域名),例如 "www.example.com" → "example.com"。
|
||||||
|
// 用于识别跨子域名但同主站的情况。
|
||||||
func superNetloc(rawURL string) string {
|
func superNetloc(rawURL string) string {
|
||||||
host := netloc(rawURL)
|
host := netloc(rawURL)
|
||||||
parts := strings.Split(host, ".")
|
parts := strings.Split(host, ".")
|
||||||
@@ -420,20 +456,26 @@ func superNetloc(rawURL string) string {
|
|||||||
return host
|
return host
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// badURL 返回 URL 的"劣质"评分(0~0.9),基于长度、路径深度、文件扩展名等特征。
|
||||||
func badURL(u string) float64 {
|
func badURL(u string) float64 {
|
||||||
|
// URL 过长惩罚
|
||||||
s := math.Max(0, float64(len(u)-30)/200.0)
|
s := math.Max(0, float64(len(u)-30)/200.0)
|
||||||
|
// 使用 .htm/.php 等动态页面惩罚
|
||||||
if strings.Contains(u, ".htm") || strings.Contains(u, ".php") {
|
if strings.Contains(u, ".htm") || strings.Contains(u, ".php") {
|
||||||
s += (1 - s) * 0.3
|
s += (1 - s) * 0.3
|
||||||
}
|
}
|
||||||
|
// 路径层级过深惩罚(超过 2 层斜杠)
|
||||||
if strings.Count(strings.TrimRight(u, "/"), "/") > 2 {
|
if strings.Count(strings.TrimRight(u, "/"), "/") > 2 {
|
||||||
s += (1 - s) * 0.1
|
s += (1 - s) * 0.1
|
||||||
}
|
}
|
||||||
|
// 极短 URL 或协议后冒号(如 ftp:)惩罚
|
||||||
if len(u) < 5 || u[4] == ':' {
|
if len(u) < 5 || u[4] == ':' {
|
||||||
s += (1 - s) * 0.3
|
s += (1 - s) * 0.3
|
||||||
}
|
}
|
||||||
return math.Min(s, 0.9)
|
return math.Min(s, 0.9)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// truncate 将字符串截断到最多 n 个字符。
|
||||||
func truncate(s string, n int) string {
|
func truncate(s string, n int) string {
|
||||||
if len(s) <= n {
|
if len(s) <= n {
|
||||||
return s
|
return s
|
||||||
@@ -441,6 +483,7 @@ func truncate(s string, n int) string {
|
|||||||
return s[:n]
|
return s[:n]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sampleStrings 从字符串切片中随机不重复抽取 n 条。
|
||||||
func sampleStrings(s []string, n int) []string {
|
func sampleStrings(s []string, n int) []string {
|
||||||
if len(s) <= n {
|
if len(s) <= n {
|
||||||
return s
|
return s
|
||||||
@@ -453,6 +496,7 @@ func sampleStrings(s []string, n int) []string {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// sampleURLWeights 与 sampleStrings 相同,但处理 URLWeight 切片。
|
||||||
func sampleURLWeights(s []URLWeight, n int) []URLWeight {
|
func sampleURLWeights(s []URLWeight, n int) []URLWeight {
|
||||||
if len(s) <= n {
|
if len(s) <= n {
|
||||||
return s
|
return s
|
||||||
@@ -465,11 +509,14 @@ func sampleURLWeights(s []URLWeight, n int) []URLWeight {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// scoredURL 内部用结构体,存储 URL 和对应调度分数。
|
||||||
type scoredURL struct {
|
type scoredURL struct {
|
||||||
url string
|
url string
|
||||||
score float64
|
score float64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// weightedSample 加权随机采样(不放回):从 scoredURL 列表中按权重概率抽取最多 k 条。
|
||||||
|
// 使用累积概率法近似 alias method(适合中等规模数据)。
|
||||||
func weightedSample(items []scoredURL, k int) []string {
|
func weightedSample(items []scoredURL, k int) []string {
|
||||||
if k >= len(items) {
|
if k >= len(items) {
|
||||||
out := make([]string, len(items))
|
out := make([]string, len(items))
|
||||||
@@ -478,7 +525,6 @@ func weightedSample(items []scoredURL, k int) []string {
|
|||||||
}
|
}
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
// Simple weighted sampling without replacement using alias method approximation
|
|
||||||
totalWeight := 0.0
|
totalWeight := 0.0
|
||||||
for _, s := range items {
|
for _, s := range items {
|
||||||
totalWeight += s.score
|
totalWeight += s.score
|
||||||
@@ -486,6 +532,7 @@ func weightedSample(items []scoredURL, k int) []string {
|
|||||||
selected := make(map[int]bool)
|
selected := make(map[int]bool)
|
||||||
out := make([]string, 0, k)
|
out := make([]string, 0, k)
|
||||||
for len(out) < k && len(selected) < len(items) {
|
for len(out) < k && len(selected) < len(items) {
|
||||||
|
// 随机取 [0, totalWeight) 区间的一个点
|
||||||
r := rand.Float64() * totalWeight
|
r := rand.Float64() * totalWeight
|
||||||
cum := 0.0
|
cum := 0.0
|
||||||
for i, s := range items {
|
for i, s := range items {
|
||||||
@@ -496,7 +543,7 @@ func weightedSample(items []scoredURL, k int) []string {
|
|||||||
if cum >= r {
|
if cum >= r {
|
||||||
selected[i] = true
|
selected[i] = true
|
||||||
out = append(out, s.url)
|
out = append(out, s.url)
|
||||||
totalWeight -= s.score
|
totalWeight -= s.score // 被选中后从总权重中移除(不放回)
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -504,7 +551,10 @@ func weightedSample(items []scoredURL, k int) []string {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// concentrationFilter 域名集中度过滤。
|
||||||
|
// 按 CrawlFocus 因子限制每个顶级域名被选中的 URL 数量,防止爬取过于集中在少数网站。
|
||||||
func concentrationFilter(urls []string, k float64) []string {
|
func concentrationFilter(urls []string, k float64) []string {
|
||||||
|
// 按顶级域名分组
|
||||||
domainGroups := make(map[string][]string)
|
domainGroups := make(map[string][]string)
|
||||||
shuffled := make([]string, len(urls))
|
shuffled := make([]string, len(urls))
|
||||||
copy(shuffled, urls)
|
copy(shuffled, urls)
|
||||||
@@ -515,13 +565,14 @@ func concentrationFilter(urls []string, k float64) []string {
|
|||||||
domainGroups[d] = append(domainGroups[d], u)
|
domainGroups[d] = append(domainGroups[d], u)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 计算每组保留上限:域名规模越大允许越多,但按 k 次幂压制
|
||||||
limit := 10
|
limit := 10
|
||||||
if len(domainGroups) > 1 {
|
if len(domainGroups) > 1 {
|
||||||
sizes := make([]int, 0, len(domainGroups))
|
sizes := make([]int, 0, len(domainGroups))
|
||||||
for _, g := range domainGroups {
|
for _, g := range domainGroups {
|
||||||
sizes = append(sizes, int(math.Pow(float64(len(g)), k)))
|
sizes = append(sizes, int(math.Pow(float64(len(g)), k)))
|
||||||
}
|
}
|
||||||
// sort sizes ascending, drop last (largest)
|
// 升序排列,去除最大一项,用其余项总和的 60% 作为全局上限
|
||||||
for i := 0; i < len(sizes)-1; i++ {
|
for i := 0; i < len(sizes)-1; i++ {
|
||||||
for j := i + 1; j < len(sizes)-1; j++ {
|
for j := i + 1; j < len(sizes)-1; j++ {
|
||||||
if sizes[j] < sizes[i] {
|
if sizes[j] < sizes[i] {
|
||||||
@@ -536,6 +587,7 @@ func concentrationFilter(urls []string, k float64) []string {
|
|||||||
limit = max(10, int(float64(total)*0.6))
|
limit = max(10, int(float64(total)*0.6))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 从每组中按计算的上限采样
|
||||||
var result []string
|
var result []string
|
||||||
for _, g := range domainGroups {
|
for _, g := range domainGroups {
|
||||||
sn := 1 + min(limit, int(math.Pow(float64(len(g)), k)))
|
sn := 1 + min(limit, int(math.Pow(float64(len(g)), k)))
|
||||||
@@ -548,6 +600,7 @@ func concentrationFilter(urls []string, k float64) []string {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// truncateMap 将 map 裁剪到最多 n 条(取前 n 条,无特定顺序)。
|
||||||
func truncateMap(m map[string]string, n int) map[string]string {
|
func truncateMap(m map[string]string, n int) map[string]string {
|
||||||
if len(m) <= n {
|
if len(m) <= n {
|
||||||
return m
|
return m
|
||||||
@@ -564,6 +617,7 @@ func truncateMap(m map[string]string, n int) map[string]string {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// min 返回两个整数中的较小值。
|
||||||
func min(a, b int) int {
|
func min(a, b int) int {
|
||||||
if a < b {
|
if a < b {
|
||||||
return a
|
return a
|
||||||
@@ -571,6 +625,7 @@ func min(a, b int) int {
|
|||||||
return b
|
return b
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// max 返回两个整数中的较大值。
|
||||||
func max(a, b int) int {
|
func max(a, b int) int {
|
||||||
if a > b {
|
if a > b {
|
||||||
return a
|
return a
|
||||||
@@ -578,7 +633,7 @@ func max(a, b int) int {
|
|||||||
return b
|
return b
|
||||||
}
|
}
|
||||||
|
|
||||||
// Expose Stats for monitoring.
|
// GetStats 返回当前爬虫统计快照(用于监控)。
|
||||||
func (c *Crawler) GetStats() Stats {
|
func (c *Crawler) GetStats() Stats {
|
||||||
return Stats{
|
return Stats{
|
||||||
VisitedURLs: atomic.LoadInt64(&c.stats.VisitedURLs),
|
VisitedURLs: atomic.LoadInt64(&c.stats.VisitedURLs),
|
||||||
|
|||||||
+85
-52
@@ -1,64 +1,71 @@
|
|||||||
// Package crawler implements the HTTP fetching layer with robots.txt compliance,
|
// Package crawler implements the HTTP fetching layer with robots.txt compliance,
|
||||||
// per-host rate limiting, redirect tracking, and encoding detection.
|
// per-host rate limiting, redirect tracking, and encoding detection.
|
||||||
|
// crawler 包负责 HTTP 请求层:遵守 robots.txt、主机限流、追踪重定向、自动检测字符集。
|
||||||
package crawler
|
package crawler
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"fmt"
|
"fmt" // 字符串格式化(构建 robots.txt URL、错误信息)
|
||||||
"io"
|
"io" // IO 接口(读取响应体)
|
||||||
"net/http"
|
"net/http" // HTTP 客户端
|
||||||
"net/url"
|
"net/url" // URL 解析
|
||||||
"strings"
|
"strings" // 字符串操作
|
||||||
"sync"
|
"sync" // 互斥锁(保护限流表和 robots.txt 缓存)
|
||||||
"time"
|
"time" // 时间(限流间隔计算、robots.txt 缓存过期)
|
||||||
|
|
||||||
"golang.org/x/net/html/charset"
|
"golang.org/x/net/html/charset" // HTML 字符集自动检测(将各种编码转为 UTF-8)
|
||||||
)
|
)
|
||||||
|
|
||||||
// ErrCrawl is returned for expected crawl failures (404, disallowed, wrong content type…).
|
// ErrCrawl 表示爬取过程中的预期错误(404、被 robots.txt 禁止、非 HTML 类型等)。
|
||||||
|
// 此类错误由 FetchSafe 静默丢弃(返回 nil, nil)。
|
||||||
type ErrCrawl struct {
|
type ErrCrawl struct {
|
||||||
Msg string
|
Msg string // 错误描述文本
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Error 实现 error 接口,返回错误描述。
|
||||||
func (e *ErrCrawl) Error() string { return e.Msg }
|
func (e *ErrCrawl) Error() string { return e.Msg }
|
||||||
|
|
||||||
// FetchResult bundles the result of a successful fetch.
|
// FetchResult 封装一次成功抓取的完整结果。
|
||||||
type FetchResult struct {
|
type FetchResult struct {
|
||||||
Body string // decoded HTML body
|
Body string // 解码后的 HTML 正文(UTF-8)
|
||||||
FinalURL string // URL after redirects
|
FinalURL string // 经过所有重定向后的最终 URL
|
||||||
Redirects map[string]string // permanent redirects: from → to
|
Redirects map[string]string // 永久重定向(301/308)映射:原始 URL → 最终 URL
|
||||||
ServerType string
|
ServerType string // HTTP Server 响应头(如 "nginx/1.18")
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fetcher is a reusable HTTP client with robots.txt awareness and rate limiting.
|
// Fetcher 是一个可复用的 HTTP 客户端,内置 robots.txt 合规检查和按主机限流。
|
||||||
type Fetcher struct {
|
type Fetcher struct {
|
||||||
client *http.Client
|
client *http.Client // HTTP 客户端(包含重定向和超时控制)
|
||||||
userAgent string
|
userAgent string // HTTP 请求的 User-Agent 头
|
||||||
cooldown time.Duration
|
cooldown time.Duration // 同一主机相邻两次请求的最小间隔
|
||||||
|
|
||||||
rateMu sync.Mutex
|
rateMu sync.Mutex // 保护 lastHit 限流表的互斥锁
|
||||||
lastHit map[string]time.Time // host → last request time
|
lastHit map[string]time.Time // 主机名 → 上次请求时间(用于计算限流等待)
|
||||||
|
|
||||||
robotsMu sync.Mutex
|
robotsMu sync.Mutex // 保护 robots 缓存的互斥锁
|
||||||
robots map[string]*robotsEntry // host → parsed robots
|
robots map[string]*robotsEntry // 主机名 → 该主机的 robots.txt 解析结果(含缓存时间)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// robotsEntry 缓存单台主机的 robots.txt 解析结果。
|
||||||
type robotsEntry struct {
|
type robotsEntry struct {
|
||||||
rules []robotsRule
|
rules []robotsRule // 解析后的规则列表
|
||||||
fetchedAt time.Time
|
fetchedAt time.Time // 缓存时间(用于判断是否过期,24h 后重新抓取)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// robotsRule 一条 robots.txt 规则,对应一个 User-Agent 块。
|
||||||
type robotsRule struct {
|
type robotsRule struct {
|
||||||
userAgent string
|
userAgent string // 适用的爬虫名称("*" 表示全部)
|
||||||
disallow []string
|
disallow []string // Disallow 路径列表
|
||||||
allow []string
|
allow []string // Allow 路径列表(优先于 disallow)
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewFetcher creates a Fetcher with the given user-agent and per-host cooldown.
|
// NewFetcher 创建一个新的 Fetcher 实例。
|
||||||
|
// userAgent:发出的 HTTP 请求的 User-Agent;cooldown:同一主机相邻请求的最小间隔。
|
||||||
func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
|
func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
|
||||||
return &Fetcher{
|
return &Fetcher{
|
||||||
client: &http.Client{
|
client: &http.Client{
|
||||||
Timeout: 30 * time.Second,
|
Timeout: 30 * time.Second, // 默认单次请求超时 30 秒
|
||||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||||
|
// 跟随重定向最多 10 次,防止重定向循环
|
||||||
if len(via) >= 10 {
|
if len(via) >= 10 {
|
||||||
return fmt.Errorf("too many redirects")
|
return fmt.Errorf("too many redirects")
|
||||||
}
|
}
|
||||||
@@ -67,34 +74,37 @@ func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
|
|||||||
},
|
},
|
||||||
userAgent: userAgent,
|
userAgent: userAgent,
|
||||||
cooldown: cooldown,
|
cooldown: cooldown,
|
||||||
lastHit: make(map[string]time.Time),
|
lastHit: make(map[string]time.Time), // 限流表初始化
|
||||||
robots: make(map[string]*robotsEntry),
|
robots: make(map[string]*robotsEntry), // robots.txt 缓存初始化
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Fetch fetches url, respecting robots.txt and rate limits.
|
// Fetch 抓取指定 URL,遵守 robots.txt 和主机限流。
|
||||||
// polite=false skips both checks (used by search server snippet fetcher).
|
// polite=false 时跳过 robots.txt 检查和限流(用于搜索服务在线抓摘要)。
|
||||||
func (f *Fetcher) Fetch(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
|
func (f *Fetcher) Fetch(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
|
||||||
return f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
|
return f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
|
||||||
}
|
}
|
||||||
|
|
||||||
// FetchSafe wraps Fetch and returns (nil, nil) on expected errors.
|
// FetchSafe 封装 Fetch,在遇到预期爬取错误(404/disallowed/非 HTML)时返回 (nil, nil)。
|
||||||
|
// 调用方无需区分错误类型,直接跳过即可。
|
||||||
func (f *Fetcher) FetchSafe(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
|
func (f *Fetcher) FetchSafe(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
|
||||||
res, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
|
res, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
|
||||||
if _, ok := err.(*ErrCrawl); ok {
|
if _, ok := err.(*ErrCrawl); ok {
|
||||||
return nil, nil
|
return nil, nil // 预期错误,静默丢弃
|
||||||
}
|
}
|
||||||
return res, err
|
return res, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// fetchWithHistory does the actual request and populates redirect history.
|
// fetchWithHistory 执行实际 HTTP 请求,追踪永久重定向。
|
||||||
func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
|
func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
|
||||||
|
// 解析 URL 提取主机名
|
||||||
parsed, err := url.Parse(rawURL)
|
parsed, err := url.Parse(rawURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
|
return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
|
||||||
}
|
}
|
||||||
host := parsed.Host
|
host := parsed.Host
|
||||||
|
|
||||||
|
// polite 模式:先限流,再检查 robots.txt
|
||||||
if polite {
|
if polite {
|
||||||
f.rateLimit(host)
|
f.rateLimit(host)
|
||||||
if !f.robotsAllowed(rawURL, host) {
|
if !f.robotsAllowed(rawURL, host) {
|
||||||
@@ -102,6 +112,7 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 追踪永久重定向(301/308)
|
||||||
redirects := make(map[string]string)
|
redirects := make(map[string]string)
|
||||||
client := &http.Client{
|
client := &http.Client{
|
||||||
Timeout: timeout,
|
Timeout: timeout,
|
||||||
@@ -109,6 +120,7 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura
|
|||||||
if len(via) >= 10 {
|
if len(via) >= 10 {
|
||||||
return fmt.Errorf("too many redirects")
|
return fmt.Errorf("too many redirects")
|
||||||
}
|
}
|
||||||
|
// 记录永久重定向
|
||||||
if req.Response != nil && (req.Response.StatusCode == 301 || req.Response.StatusCode == 308) {
|
if req.Response != nil && (req.Response.StatusCode == 301 || req.Response.StatusCode == 308) {
|
||||||
from := via[len(via)-1].URL.String()
|
from := via[len(via)-1].URL.String()
|
||||||
to := req.URL.String()
|
to := req.URL.String()
|
||||||
@@ -118,26 +130,32 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura
|
|||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 构造 GET 请求
|
||||||
req, _ := http.NewRequest("GET", rawURL, nil)
|
req, _ := http.NewRequest("GET", rawURL, nil)
|
||||||
req.Header.Set("User-Agent", f.userAgent)
|
req.Header.Set("User-Agent", f.userAgent)
|
||||||
|
|
||||||
|
// 发送请求
|
||||||
resp, err := client.Do(req)
|
resp, err := client.Do(req)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close() // 读取完毕后关闭响应体
|
||||||
|
|
||||||
|
// 检查 HTTP 状态码
|
||||||
if resp.StatusCode == 404 {
|
if resp.StatusCode == 404 {
|
||||||
return nil, &ErrCrawl{Msg: "404 not found"}
|
return nil, &ErrCrawl{Msg: "404 not found"}
|
||||||
}
|
}
|
||||||
if resp.StatusCode >= 400 {
|
if resp.StatusCode >= 400 {
|
||||||
return nil, &ErrCrawl{Msg: fmt.Sprintf("HTTP %d", resp.StatusCode)}
|
return nil, &ErrCrawl{Msg: fmt.Sprintf("HTTP %d", resp.StatusCode)}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 检查 Content-Type,必须是 HTML 才继续
|
||||||
ct := resp.Header.Get("Content-Type")
|
ct := resp.Header.Get("Content-Type")
|
||||||
if !strings.Contains(ct, "text/html") {
|
if !strings.Contains(ct, "text/html") {
|
||||||
return nil, &ErrCrawl{Msg: "not html: " + ct}
|
return nil, &ErrCrawl{Msg: "not html: " + ct}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 解码响应体(自动检测字符集转为 UTF-8)
|
||||||
body, err := decodeBody(resp.Body, ct, sizeLimit)
|
body, err := decodeBody(resp.Body, ct, sizeLimit)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
@@ -145,19 +163,20 @@ func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Dura
|
|||||||
|
|
||||||
return &FetchResult{
|
return &FetchResult{
|
||||||
Body: body,
|
Body: body,
|
||||||
FinalURL: resp.Request.URL.String(),
|
FinalURL: resp.Request.URL.String(), // 重定向后的最终 URL
|
||||||
Redirects: redirects,
|
Redirects: redirects,
|
||||||
ServerType: resp.Header.Get("Server"),
|
ServerType: resp.Header.Get("Server"),
|
||||||
}, nil
|
}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// rateLimit sleeps if the last request to host was too recent.
|
// rateLimit 检查并强制执行主机限流:若距上次请求不足 cooldown 秒则 sleep 等待。
|
||||||
func (f *Fetcher) rateLimit(host string) {
|
func (f *Fetcher) rateLimit(host string) {
|
||||||
f.rateMu.Lock()
|
f.rateMu.Lock()
|
||||||
last, ok := f.lastHit[host]
|
last, ok := f.lastHit[host]
|
||||||
now := time.Now()
|
now := time.Now()
|
||||||
f.lastHit[host] = now
|
f.lastHit[host] = now
|
||||||
// Periodically prune the map
|
|
||||||
|
// 限流表超过 10000 条时清理两倍 cooldown 时间之前的过期项,防止内存泄漏
|
||||||
if len(f.lastHit) > 10000 {
|
if len(f.lastHit) > 10000 {
|
||||||
cutoff := now.Add(-f.cooldown * 2)
|
cutoff := now.Add(-f.cooldown * 2)
|
||||||
for k, v := range f.lastHit {
|
for k, v := range f.lastHit {
|
||||||
@@ -168,6 +187,7 @@ func (f *Fetcher) rateLimit(host string) {
|
|||||||
}
|
}
|
||||||
f.rateMu.Unlock()
|
f.rateMu.Unlock()
|
||||||
|
|
||||||
|
// 计算需要等待的时间
|
||||||
if ok {
|
if ok {
|
||||||
elapsed := now.Sub(last)
|
elapsed := now.Sub(last)
|
||||||
if elapsed < f.cooldown {
|
if elapsed < f.cooldown {
|
||||||
@@ -176,12 +196,14 @@ func (f *Fetcher) rateLimit(host string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// robotsAllowed returns true if rawURL is crawlable.
|
// robotsAllowed 根据 robots.txt 规则判断某 URL 是否允许爬取。
|
||||||
func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
|
func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
|
||||||
|
// 尝试从缓存读取(加锁保护)
|
||||||
f.robotsMu.Lock()
|
f.robotsMu.Lock()
|
||||||
entry, ok := f.robots[host]
|
entry, ok := f.robots[host]
|
||||||
f.robotsMu.Unlock()
|
f.robotsMu.Unlock()
|
||||||
|
|
||||||
|
// 缓存不存在或已过期(超过 24 小时)则重新抓取并解析
|
||||||
if !ok || time.Since(entry.fetchedAt) > 24*time.Hour {
|
if !ok || time.Since(entry.fetchedAt) > 24*time.Hour {
|
||||||
entry = f.fetchRobots(host, rawURL)
|
entry = f.fetchRobots(host, rawURL)
|
||||||
f.robotsMu.Lock()
|
f.robotsMu.Lock()
|
||||||
@@ -189,6 +211,7 @@ func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
|
|||||||
f.robotsMu.Unlock()
|
f.robotsMu.Unlock()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 解析 URL 路径
|
||||||
parsed, err := url.Parse(rawURL)
|
parsed, err := url.Parse(rawURL)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return false
|
return false
|
||||||
@@ -198,43 +221,47 @@ func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
|
|||||||
path = "/"
|
path = "/"
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 遍历所有规则,找到适用的 User-Agent
|
||||||
for _, rule := range entry.rules {
|
for _, rule := range entry.rules {
|
||||||
if rule.userAgent != "*" && !strings.EqualFold(rule.userAgent, f.userAgent) {
|
if rule.userAgent != "*" && !strings.EqualFold(rule.userAgent, f.userAgent) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
// Check allow first (higher priority)
|
// Allow 优先检查(更高优先级)
|
||||||
for _, a := range rule.allow {
|
for _, a := range rule.allow {
|
||||||
if strings.HasPrefix(path, a) {
|
if strings.HasPrefix(path, a) {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// 再检查 Disallow
|
||||||
for _, dis := range rule.disallow {
|
for _, dis := range rule.disallow {
|
||||||
if dis != "" && strings.HasPrefix(path, dis) {
|
if dis != "" && strings.HasPrefix(path, dis) {
|
||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return true
|
return true // 默认允许
|
||||||
}
|
}
|
||||||
|
|
||||||
// fetchRobots downloads and parses robots.txt for a host.
|
// fetchRobots 抓取并解析某主机的 robots.txt 文件。
|
||||||
func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
|
func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
|
||||||
entry := &robotsEntry{fetchedAt: time.Now()}
|
entry := &robotsEntry{fetchedAt: time.Now()} // 初始化空条目(抓取失败时默认允许全部)
|
||||||
scheme := "https"
|
scheme := "https"
|
||||||
if strings.HasPrefix(exampleURL, "http://") {
|
if strings.HasPrefix(exampleURL, "http://") {
|
||||||
scheme = "http"
|
scheme = "http"
|
||||||
}
|
}
|
||||||
robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)
|
robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)
|
||||||
|
|
||||||
|
// robots.txt 单独请求,超时 5 秒
|
||||||
client := &http.Client{Timeout: 5 * time.Second}
|
client := &http.Client{Timeout: 5 * time.Second}
|
||||||
req, _ := http.NewRequest("GET", robotsURL, nil)
|
req, _ := http.NewRequest("GET", robotsURL, nil)
|
||||||
req.Header.Set("User-Agent", f.userAgent)
|
req.Header.Set("User-Agent", f.userAgent)
|
||||||
resp, err := client.Do(req)
|
resp, err := client.Do(req)
|
||||||
if err != nil || resp.StatusCode != 200 {
|
if err != nil || resp.StatusCode != 200 {
|
||||||
return entry // allow all if robots.txt unavailable
|
return entry // robots.txt 不可用时默认允许爬取
|
||||||
}
|
}
|
||||||
defer resp.Body.Close()
|
defer resp.Body.Close()
|
||||||
|
|
||||||
|
// 最多读取 256KB(大部分 robots.txt 远小于此大小)
|
||||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
|
body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return entry
|
return entry
|
||||||
@@ -243,16 +270,19 @@ func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
|
|||||||
return entry
|
return entry
|
||||||
}
|
}
|
||||||
|
|
||||||
// parseRobots is a minimal robots.txt parser.
|
// parseRobots 最小化 robots.txt 解析器。
|
||||||
|
// 支持 User-agent、Disallow、Allow 三种指令,忽略注释和空行。
|
||||||
func parseRobots(content string) []robotsRule {
|
func parseRobots(content string) []robotsRule {
|
||||||
var rules []robotsRule
|
var rules []robotsRule
|
||||||
var current *robotsRule
|
var current *robotsRule
|
||||||
for _, line := range strings.Split(content, "\n") {
|
for _, line := range strings.Split(content, "\n") {
|
||||||
line = strings.TrimSpace(line)
|
line = strings.TrimSpace(line)
|
||||||
|
// 去除行内注释
|
||||||
if idx := strings.Index(line, "#"); idx >= 0 {
|
if idx := strings.Index(line, "#"); idx >= 0 {
|
||||||
line = line[:idx]
|
line = line[:idx]
|
||||||
}
|
}
|
||||||
if line == "" {
|
if line == "" {
|
||||||
|
// 空行结束当前块
|
||||||
if current != nil {
|
if current != nil {
|
||||||
rules = append(rules, *current)
|
rules = append(rules, *current)
|
||||||
current = nil
|
current = nil
|
||||||
@@ -267,6 +297,7 @@ func parseRobots(content string) []robotsRule {
|
|||||||
val := strings.TrimSpace(parts[1])
|
val := strings.TrimSpace(parts[1])
|
||||||
switch key {
|
switch key {
|
||||||
case "user-agent":
|
case "user-agent":
|
||||||
|
// 新建一个 User-Agent 块
|
||||||
if current == nil {
|
if current == nil {
|
||||||
current = &robotsRule{userAgent: val}
|
current = &robotsRule{userAgent: val}
|
||||||
} else {
|
} else {
|
||||||
@@ -282,23 +313,25 @@ func parseRobots(content string) []robotsRule {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// 最后一个块
|
||||||
if current != nil {
|
if current != nil {
|
||||||
rules = append(rules, *current)
|
rules = append(rules, *current)
|
||||||
}
|
}
|
||||||
return rules
|
return rules
|
||||||
}
|
}
|
||||||
|
|
||||||
// decodeBody reads at most sizeLimit bytes from r, auto-detecting charset.
|
// decodeBody 从响应体读取最多 sizeLimit 字节,自动检测字符集并转为 UTF-8 字符串。
|
||||||
|
// sizeLimit <= 0 时不限制大小。
|
||||||
func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error) {
|
func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error) {
|
||||||
var reader io.Reader = r
|
var reader io.Reader = r
|
||||||
if sizeLimit > 0 {
|
if sizeLimit > 0 {
|
||||||
reader = io.LimitReader(r, int64(sizeLimit))
|
reader = io.LimitReader(r, int64(sizeLimit)) // 限制读取字节数,防止大文件撑爆内存
|
||||||
}
|
}
|
||||||
|
|
||||||
// Use golang.org/x/net/html/charset for auto-detection
|
// 使用 golang.org/x/net/html/charset 自动检测 HTML 编码并转为 UTF-8
|
||||||
utf8Reader, err := charset.NewReader(reader, contentType)
|
utf8Reader, err := charset.NewReader(reader, contentType)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
// Fall back to reading raw and hoping for UTF-8
|
// 备选方案:直接以 UTF-8 读取(可能乱码但不崩溃)
|
||||||
data, readErr := io.ReadAll(reader)
|
data, readErr := io.ReadAll(reader)
|
||||||
if readErr != nil {
|
if readErr != nil {
|
||||||
return "", readErr
|
return "", readErr
|
||||||
|
|||||||
+68
-52
@@ -1,39 +1,39 @@
|
|||||||
// Package harvester implements the index-writing server (port 5000).
|
// Package harvester implements the index-writing server (port 5000).
|
||||||
|
// 收获服务器包:接收爬虫发送的关键词索引数据,批量写入 bbolt 持久化存储。
|
||||||
//
|
//
|
||||||
// It receives (url, keywords) payloads from the crawler, accumulates them in
|
// 工作流程:爬虫每抓取一个页面,将 (URL, 关键词列表) 通过 HTTP POST 发送到本服务;
|
||||||
// memory, then flushes to the persistent inverted index when the in-memory
|
// 本服务先将数据积累在内存中,当内存中索引条目数量超过阈值时,批量合并到磁盘索引。
|
||||||
// row count exceeds the configured threshold.
|
|
||||||
package harvester
|
package harvester
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json" // JSON 反序列化(解析爬虫请求)
|
||||||
"log"
|
"log" // 日志输出
|
||||||
"math/rand"
|
"math/rand" // 随机数(打乱合并顺序、触发概率性操作)
|
||||||
"net/http"
|
"net/http" // HTTP 服务端
|
||||||
"strings"
|
"strings" // 字符串操作(URL 清洗)
|
||||||
"sync"
|
"sync" // 互斥锁(保护内存索引、防止并发刷盘)
|
||||||
"sync/atomic"
|
"sync/atomic" // 原子操作(计数器)
|
||||||
|
|
||||||
"sese-engine/config"
|
"sese-engine/config" // 全局配置(刷盘阈值、URL 上限)
|
||||||
"sese-engine/info"
|
"sese-engine/info" // info 服务(查询繁荣分数用于裁剪)
|
||||||
"sese-engine/storage"
|
"sese-engine/storage" // 持久化存储
|
||||||
)
|
)
|
||||||
|
|
||||||
// Server is the harvester HTTP server.
|
// Server 是收获 HTTP 服务器,负责接收爬虫数据、内存聚合、批量写入。
|
||||||
type Server struct {
|
type Server struct {
|
||||||
db *storage.DB
|
db *storage.DB
|
||||||
|
|
||||||
// in-memory accumulator: keyword → [(weight, url)]
|
// 内存索引聚合器:关键词 → 该词关联的 [权重, URL] 条目列表
|
||||||
mem map[string][]storage.IndexEntry
|
mem map[string][]storage.IndexEntry
|
||||||
memMu sync.Mutex
|
memMu sync.Mutex // 保护内存索引的并发写入
|
||||||
|
|
||||||
rowCount int64 // approximate total in-memory rows
|
rowCount int64 // 内存中累计的索引条目总数(用于触发刷盘)
|
||||||
flushMu sync.Mutex // only one flush at a time
|
flushMu sync.Mutex // 确保同一时刻只有一个 flush 在执行
|
||||||
|
|
||||||
infoSvc *info.Service
|
infoSvc *info.Service // info 服务:用于查询繁荣分数来决定索引裁剪优先级
|
||||||
}
|
}
|
||||||
|
|
||||||
// New creates a harvester Server.
|
// New 创建一个 harvester Server 实例。
|
||||||
func New(db *storage.DB, infoSvc *info.Service) *Server {
|
func New(db *storage.DB, infoSvc *info.Service) *Server {
|
||||||
return &Server{
|
return &Server{
|
||||||
db: db,
|
db: db,
|
||||||
@@ -42,22 +42,23 @@ func New(db *storage.DB, infoSvc *info.Service) *Server {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ingestPayload is the JSON body sent by the crawler.
|
// ingestPayload 是爬虫发送的 JSON 请求体结构。
|
||||||
type ingestPayload struct {
|
type ingestPayload struct {
|
||||||
URL string `json:"url"`
|
URL string `json:"url"` // 被索引页面的最终 URL
|
||||||
Keywords []struct {
|
Keywords []struct {
|
||||||
Word string `json:"word"`
|
Word string `json:"word"` // 关键词
|
||||||
Weight float32 `json:"weight"`
|
Weight float32 `json:"weight"` // 该 URL 在该词下的权重
|
||||||
} `json:"keywords"`
|
} `json:"keywords"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handler returns the http.Handler for the harvester.
|
// Handler 返回 HTTP 路由处理器。
|
||||||
func (s *Server) Handler() http.Handler {
|
func (s *Server) Handler() http.Handler {
|
||||||
mux := http.NewServeMux()
|
mux := http.NewServeMux()
|
||||||
mux.HandleFunc("/l", s.handleIngest)
|
mux.HandleFunc("/l", s.handleIngest) // /l 端点:接收爬虫数据
|
||||||
return mux
|
return mux
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// handleIngest 处理爬虫发来的 POST 请求,将关键词数据写入内存索引。
|
||||||
func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
|
func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
|
||||||
if r.Method != http.MethodPost {
|
if r.Method != http.MethodPost {
|
||||||
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
||||||
@@ -69,7 +70,7 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
|
|||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
// Sanitise URL
|
// 清洗 URL:去除换行符(防止注入)
|
||||||
payload.URL = strings.ReplaceAll(payload.URL, "\n", "")
|
payload.URL = strings.ReplaceAll(payload.URL, "\n", "")
|
||||||
if payload.URL == "" {
|
if payload.URL == "" {
|
||||||
http.Error(w, "empty url", http.StatusBadRequest)
|
http.Error(w, "empty url", http.StatusBadRequest)
|
||||||
@@ -81,7 +82,7 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
|
|||||||
key := kw.Word
|
key := kw.Word
|
||||||
entries := s.mem[key]
|
entries := s.mem[key]
|
||||||
|
|
||||||
// Threshold-based early discard
|
// 阈值提前过滤:若该词已有大量条目,则只接受权重足够高的新条目
|
||||||
if len(entries) > 15 {
|
if len(entries) > 15 {
|
||||||
low := s.lowThreshold(key)
|
low := s.lowThreshold(key)
|
||||||
if float64(kw.Weight) < low {
|
if float64(kw.Weight) < low {
|
||||||
@@ -96,7 +97,7 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
s.memMu.Unlock()
|
s.memMu.Unlock()
|
||||||
|
|
||||||
// Check if we should flush
|
// 当内存条目数超过阈值时,异步触发刷盘
|
||||||
if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold) {
|
if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold) {
|
||||||
go s.flush()
|
go s.flush()
|
||||||
}
|
}
|
||||||
@@ -104,28 +105,32 @@ func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
|
|||||||
w.Write([]byte("ok"))
|
w.Write([]byte("ok"))
|
||||||
}
|
}
|
||||||
|
|
||||||
// lowThreshold returns the minimum weight needed to enter the index for key.
|
// lowThreshold 返回某关键词在已有大量条目时,新条目所需的最低权重阈值。
|
||||||
|
// 计算方式:找到磁盘上该词第 MaxURLsPerKey 高权重值,取其 5% 作为阈值。
|
||||||
func (s *Server) lowThreshold(key string) float64 {
|
func (s *Server) lowThreshold(key string) float64 {
|
||||||
existing, _ := s.db.GetIndex(key)
|
existing, _ := s.db.GetIndex(key)
|
||||||
if len(existing) < config.MaxURLsPerKey {
|
if len(existing) < config.MaxURLsPerKey {
|
||||||
return -1
|
return -1 // 未达上限,所有条目都接受
|
||||||
}
|
}
|
||||||
// Find the config.MaxURLsPerKey-th highest weight
|
// 收集所有权重值
|
||||||
weights := make([]float64, len(existing))
|
weights := make([]float64, len(existing))
|
||||||
for i, e := range existing {
|
for i, e := range existing {
|
||||||
weights[i] = float64(e.Weight)
|
weights[i] = float64(e.Weight)
|
||||||
}
|
}
|
||||||
// Partial sort: find threshold at position MaxURLsPerKey-1
|
// 找第 MaxURLsPerKey-1 大的值(即准入门槛)
|
||||||
return nthLargest(weights, config.MaxURLsPerKey-1) * 0.05
|
return nthLargest(weights, config.MaxURLsPerKey-1) * 0.05
|
||||||
}
|
}
|
||||||
|
|
||||||
// flush merges the in-memory accumulator into the persistent index.
|
// flush 将内存中的索引批量合并写入磁盘,然后清空内存。
|
||||||
|
// 整个过程:原子快照 → 并行合并 → 批量写入。
|
||||||
func (s *Server) flush() {
|
func (s *Server) flush() {
|
||||||
|
// TryLock:若已有其他 flush 在执行则直接退出
|
||||||
if !s.flushMu.TryLock() {
|
if !s.flushMu.TryLock() {
|
||||||
return // another flush is running
|
return
|
||||||
}
|
}
|
||||||
defer s.flushMu.Unlock()
|
defer s.flushMu.Unlock()
|
||||||
|
|
||||||
|
// 原子快照:取出当前内存数据并立即重置
|
||||||
s.memMu.Lock()
|
s.memMu.Lock()
|
||||||
snapshot := s.mem
|
snapshot := s.mem
|
||||||
s.mem = make(map[string][]storage.IndexEntry)
|
s.mem = make(map[string][]storage.IndexEntry)
|
||||||
@@ -134,6 +139,7 @@ func (s *Server) flush() {
|
|||||||
|
|
||||||
log.Printf("[harvester] flushing %d keys", len(snapshot))
|
log.Printf("[harvester] flushing %d keys", len(snapshot))
|
||||||
|
|
||||||
|
// 转换为切片便于处理,打乱顺序防止热点词优先处理导致堆积
|
||||||
items := make([]struct {
|
items := make([]struct {
|
||||||
key string
|
key string
|
||||||
entries []storage.IndexEntry
|
entries []storage.IndexEntry
|
||||||
@@ -146,13 +152,13 @@ func (s *Server) flush() {
|
|||||||
}
|
}
|
||||||
rand.Shuffle(len(items), func(i, j int) { items[i], items[j] = items[j], items[i] })
|
rand.Shuffle(len(items), func(i, j int) { items[i], items[j] = items[j], items[i] })
|
||||||
|
|
||||||
// Parallel merge
|
// 并行合并:每个关键词独立合并到磁盘
|
||||||
type result struct {
|
type result struct {
|
||||||
key string
|
key string
|
||||||
entries []storage.IndexEntry
|
entries []storage.IndexEntry
|
||||||
}
|
}
|
||||||
results := make(chan result, len(items))
|
results := make(chan result, len(items))
|
||||||
sem := make(chan struct{}, 8)
|
sem := make(chan struct{}, 8) // 最多 8 个并发合并协程
|
||||||
|
|
||||||
for _, item := range items {
|
for _, item := range items {
|
||||||
sem <- struct{}{}
|
sem <- struct{}{}
|
||||||
@@ -163,36 +169,39 @@ func (s *Server) flush() {
|
|||||||
}(item.key, item.entries)
|
}(item.key, item.entries)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Collect
|
// 收集所有合并结果
|
||||||
batch := make(map[string][]storage.IndexEntry, len(items))
|
batch := make(map[string][]storage.IndexEntry, len(items))
|
||||||
for range items {
|
for range items {
|
||||||
r := <-results
|
r := <-results
|
||||||
batch[r.key] = r.entries
|
batch[r.key] = r.entries
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 批量写入 bbolt(一次事务写入所有关键词)
|
||||||
if err := s.db.BatchSetIndex(batch); err != nil {
|
if err := s.db.BatchSetIndex(batch); err != nil {
|
||||||
log.Printf("[harvester] flush write error: %v", err)
|
log.Printf("[harvester] flush write error: %v", err)
|
||||||
}
|
}
|
||||||
log.Printf("[harvester] flush done, %d keys written", len(batch))
|
log.Printf("[harvester] flush done, %d keys written", len(batch))
|
||||||
}
|
}
|
||||||
|
|
||||||
// mergeKey merges new entries with existing index entries for a key.
|
// mergeKey 将新条目和磁盘已有条目合并后返回最终列表。
|
||||||
|
// 包含:去重 → 概率性 URL 归一化去重 → 超限时按繁荣分数裁剪。
|
||||||
func (s *Server) mergeKey(key string, newEntries []storage.IndexEntry) []storage.IndexEntry {
|
func (s *Server) mergeKey(key string, newEntries []storage.IndexEntry) []storage.IndexEntry {
|
||||||
existing, _ := s.db.GetIndex(key)
|
existing, _ := s.db.GetIndex(key)
|
||||||
|
|
||||||
// Discard new key if too few URLs
|
// 新关键词:如果条目数过少则丢弃(避免索引质量下降)
|
||||||
if len(existing) == 0 && len(newEntries) < config.MinURLsForNewKey {
|
if len(existing) == 0 && len(newEntries) < config.MinURLsForNewKey {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 合并新旧条目
|
||||||
merged := dedup(append(newEntries, existing...))
|
merged := dedup(append(newEntries, existing...))
|
||||||
|
|
||||||
// Occasional URL normalisation dedup
|
// 2% 概率执行 URL 归一化去重(去除 https/http 重复、尾部斜杠差异等)
|
||||||
if rand.Float64() < 0.02 {
|
if rand.Float64() < 0.02 {
|
||||||
merged = dedupNormalised(merged)
|
merged = dedupNormalised(merged)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Trim if over limit
|
// 超限或 2% 概率触发裁剪:按 (权重 × 繁荣分数) 排序后截断
|
||||||
if float64(len(merged)) > float64(config.MaxURLsPerKey)*1.1 || rand.Float64() < 0.02 {
|
if float64(len(merged)) > float64(config.MaxURLsPerKey)*1.1 || rand.Float64() < 0.02 {
|
||||||
merged = trim(merged, s.infoSvc, config.MaxURLsPerKey, config.MaxSameDomainPerKey)
|
merged = trim(merged, s.infoSvc, config.MaxURLsPerKey, config.MaxSameDomainPerKey)
|
||||||
}
|
}
|
||||||
@@ -200,8 +209,9 @@ func (s *Server) mergeKey(key string, newEntries []storage.IndexEntry) []storage
|
|||||||
return merged
|
return merged
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- helpers ----
|
// ---- 辅助函数 ----
|
||||||
|
|
||||||
|
// dedup 按 URL 完全匹配去重。
|
||||||
func dedup(entries []storage.IndexEntry) []storage.IndexEntry {
|
func dedup(entries []storage.IndexEntry) []storage.IndexEntry {
|
||||||
seen := make(map[string]bool, len(entries))
|
seen := make(map[string]bool, len(entries))
|
||||||
out := make([]storage.IndexEntry, 0, len(entries))
|
out := make([]storage.IndexEntry, 0, len(entries))
|
||||||
@@ -215,10 +225,12 @@ func dedup(entries []storage.IndexEntry) []storage.IndexEntry {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// dedupNormalised 按 URL 归一化去重(去除协议前缀和尾部斜杠后比较)。
|
||||||
|
// 按 URL 长度降序排序后处理:短 URL 优先保留(更可能是规范 URL)。
|
||||||
func dedupNormalised(entries []storage.IndexEntry) []storage.IndexEntry {
|
func dedupNormalised(entries []storage.IndexEntry) []storage.IndexEntry {
|
||||||
// Sort by URL length descending, then dedup by normalised URL (strip scheme, trailing slash)
|
|
||||||
sorted := make([]storage.IndexEntry, len(entries))
|
sorted := make([]storage.IndexEntry, len(entries))
|
||||||
copy(sorted, entries)
|
copy(sorted, entries)
|
||||||
|
// 降序排列(简单冒泡)
|
||||||
for i := 0; i < len(sorted)-1; i++ {
|
for i := 0; i < len(sorted)-1; i++ {
|
||||||
for j := i + 1; j < len(sorted); j++ {
|
for j := i + 1; j < len(sorted); j++ {
|
||||||
if len(sorted[j].URL) > len(sorted[i].URL) {
|
if len(sorted[j].URL) > len(sorted[i].URL) {
|
||||||
@@ -239,6 +251,7 @@ func dedupNormalised(entries []storage.IndexEntry) []storage.IndexEntry {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// normaliseURL 归一化 URL:去除协议前缀,尾部斜杠去除。
|
||||||
func normaliseURL(u string) string {
|
func normaliseURL(u string) string {
|
||||||
if strings.HasPrefix(u, "https://") {
|
if strings.HasPrefix(u, "https://") {
|
||||||
u = u[8:]
|
u = u[8:]
|
||||||
@@ -248,9 +261,10 @@ func normaliseURL(u string) string {
|
|||||||
return strings.TrimRight(u, "/")
|
return strings.TrimRight(u, "/")
|
||||||
}
|
}
|
||||||
|
|
||||||
// trim reduces entries to at most limit, keeping at most sameDomainLimit per domain.
|
// trim 将条目列表裁剪到指定上限,同时限制每个域名的最大条目数。
|
||||||
|
// 排序依据:(权重 × (1 + 繁荣分数)),使高权重且高繁荣的 URL 优先保留。
|
||||||
func trim(entries []storage.IndexEntry, infoSvc *info.Service, limit, sameDomainLimit int) []storage.IndexEntry {
|
func trim(entries []storage.IndexEntry, infoSvc *info.Service, limit, sameDomainLimit int) []storage.IndexEntry {
|
||||||
// Sort by effective score: weight * (1 + backlink)
|
// 按综合分数降序排列
|
||||||
scored := make([]storage.IndexEntry, len(entries))
|
scored := make([]storage.IndexEntry, len(entries))
|
||||||
copy(scored, entries)
|
copy(scored, entries)
|
||||||
for i := 0; i < len(scored)-1; i++ {
|
for i := 0; i < len(scored)-1; i++ {
|
||||||
@@ -263,7 +277,7 @@ func trim(entries []storage.IndexEntry, infoSvc *info.Service, limit, sameDomain
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Per-domain cap
|
// 按域名计数,每个域名最多保留 sameDomainLimit 条(首页 URL 不受限制)
|
||||||
domainCount := make(map[string]int)
|
domainCount := make(map[string]int)
|
||||||
out := make([]storage.IndexEntry, 0, limit)
|
out := make([]storage.IndexEntry, 0, limit)
|
||||||
for _, e := range scored {
|
for _, e := range scored {
|
||||||
@@ -272,8 +286,7 @@ func trim(entries []storage.IndexEntry, infoSvc *info.Service, limit, sameDomain
|
|||||||
host = e.URL
|
host = e.URL
|
||||||
}
|
}
|
||||||
host = strings.ToLower(host)
|
host = strings.ToLower(host)
|
||||||
// Allow homepage URLs regardless of limit
|
isHome := isHomepage(e.URL) // 首页 URL 不受域名数量限制
|
||||||
isHome := isHomepage(e.URL)
|
|
||||||
if !isHome && domainCount[host] >= sameDomainLimit {
|
if !isHome && domainCount[host] >= sameDomainLimit {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@@ -286,12 +299,14 @@ func trim(entries []storage.IndexEntry, infoSvc *info.Service, limit, sameDomain
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// isHomepage 判断 URL 是否为网站首页(不含路径层级)。
|
||||||
func isHomepage(u string) bool {
|
func isHomepage(u string) bool {
|
||||||
u = strings.TrimPrefix(u, "https://")
|
u = strings.TrimPrefix(u, "https://")
|
||||||
u = strings.TrimPrefix(u, "http://")
|
u = strings.TrimPrefix(u, "http://")
|
||||||
return strings.Count(strings.TrimRight(u, "/"), "/") == 0
|
return strings.Count(strings.TrimRight(u, "/"), "/") == 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// netloc 从 URL 提取主机名(简化版,不依赖 net/url)。
|
||||||
func netloc(rawURL string) string {
|
func netloc(rawURL string) string {
|
||||||
parts := strings.SplitN(rawURL, "/", 4)
|
parts := strings.SplitN(rawURL, "/", 4)
|
||||||
if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" {
|
if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" {
|
||||||
@@ -300,14 +315,15 @@ func netloc(rawURL string) string {
|
|||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
// nthLargest returns the n-th largest value in a slice (0-indexed).
|
// nthLargest 返回切片中第 n 大的值(0-indexed,即找第 n+1 大的值)。
|
||||||
|
// 用于获取准入权重阈值。
|
||||||
func nthLargest(values []float64, n int) float64 {
|
func nthLargest(values []float64, n int) float64 {
|
||||||
if n >= len(values) {
|
if n >= len(values) {
|
||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
cp := make([]float64, len(values))
|
cp := make([]float64, len(values))
|
||||||
copy(cp, values)
|
copy(cp, values)
|
||||||
// Partial sort descending
|
// 部分排序:只需将前 n+1 项排好序
|
||||||
for i := 0; i <= n; i++ {
|
for i := 0; i <= n; i++ {
|
||||||
maxIdx := i
|
maxIdx := i
|
||||||
for j := i + 1; j < len(cp); j++ {
|
for j := i + 1; j < len(cp); j++ {
|
||||||
@@ -320,7 +336,7 @@ func nthLargest(values []float64, n int) float64 {
|
|||||||
return cp[n]
|
return cp[n]
|
||||||
}
|
}
|
||||||
|
|
||||||
// ListenAndServe starts the harvester on the given address.
|
// ListenAndServe 启动收获服务器在指定地址监听。
|
||||||
func (s *Server) ListenAndServe(addr string) error {
|
func (s *Server) ListenAndServe(addr string) error {
|
||||||
log.Printf("[harvester] listening on %s", addr)
|
log.Printf("[harvester] listening on %s", addr)
|
||||||
return http.ListenAndServe(addr, s.Handler())
|
return http.ListenAndServe(addr, s.Handler())
|
||||||
|
|||||||
+48
-30
@@ -1,33 +1,34 @@
|
|||||||
// Package info loads and serves auxiliary data: backlink scores, adjustment
|
// Package info loads and serves auxiliary data: backlink scores, adjustment
|
||||||
// table, and blocked query words.
|
// table, and blocked query words.
|
||||||
|
// info 包负责加载和管理辅助数据:繁荣表(反向链接分数)、调整表(人工权重调整)和屏蔽词表。
|
||||||
package info
|
package info
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json" // JSON 反序列化
|
||||||
"math"
|
"math" // 对数运算(Log2)
|
||||||
"os"
|
"os" // 文件读取
|
||||||
"path/filepath"
|
"path/filepath" // 路径拼接
|
||||||
"strings"
|
"strings" // 字符串操作
|
||||||
"sync"
|
"sync" // 读写锁
|
||||||
)
|
)
|
||||||
|
|
||||||
// Service loads the prosperity map, adjustment table, and blocked words.
|
// Service 管理繁荣表、调整表和屏蔽词表,并提供只读快照。
|
||||||
type Service struct {
|
type Service struct {
|
||||||
mu sync.RWMutex
|
mu sync.RWMutex
|
||||||
prosperMap map[string]float64 // normalised backlink scores
|
prosperMap map[string]float64 // 繁荣表:域名 → 归一化反向链接分数
|
||||||
adjustTable map[string]float64 // per-domain manual weight adjustments
|
adjustTable map[string]float64 // 调整表:主机名 → 人工权重倍数(默认 1.0)
|
||||||
blockedWords map[string]bool
|
blockedWords map[string]bool // 屏蔽词集合:搜索时直接过滤
|
||||||
storagePath string
|
storagePath string // 存储根目录路径
|
||||||
}
|
}
|
||||||
|
|
||||||
// New creates and loads the info service from storagePath.
|
// New 创建并加载 info Service,从 storagePath 目录读取数据文件。
|
||||||
func New(storagePath string) *Service {
|
func New(storagePath string) *Service {
|
||||||
s := &Service{storagePath: storagePath}
|
s := &Service{storagePath: storagePath}
|
||||||
s.Reload()
|
s.Reload()
|
||||||
return s
|
return s
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reload re-reads all data files from disk.
|
// Reload 从磁盘重新加载所有数据文件(支持热更新配置)。
|
||||||
func (s *Service) Reload() {
|
func (s *Service) Reload() {
|
||||||
s.mu.Lock()
|
s.mu.Lock()
|
||||||
defer s.mu.Unlock()
|
defer s.mu.Unlock()
|
||||||
@@ -36,14 +37,16 @@ func (s *Service) Reload() {
|
|||||||
s.blockedWords = loadBlockedWords()
|
s.blockedWords = loadBlockedWords()
|
||||||
}
|
}
|
||||||
|
|
||||||
// Prosper returns the backlink score for a URL (sum of its path components).
|
// Prosper 返回指定 URL 的繁荣分数(对其所有路径段累计计算)。
|
||||||
|
// 分数越高表示该域名越"有价值"(反向链接越多)。
|
||||||
func (s *Service) Prosper(rawURL string) float64 {
|
func (s *Service) Prosper(rawURL string) float64 {
|
||||||
s.mu.RLock()
|
s.mu.RLock()
|
||||||
defer s.mu.RUnlock()
|
defer s.mu.RUnlock()
|
||||||
return prosperFor(rawURL, s.prosperMap)
|
return prosperFor(rawURL, s.prosperMap)
|
||||||
}
|
}
|
||||||
|
|
||||||
// ProsperMap returns the full prosperity map (read-only snapshot).
|
// ProsperMap 返回繁荣表的完整只读快照(深拷贝)。
|
||||||
|
// 供爬虫调度算法使用。
|
||||||
func (s *Service) ProsperMap() map[string]float64 {
|
func (s *Service) ProsperMap() map[string]float64 {
|
||||||
s.mu.RLock()
|
s.mu.RLock()
|
||||||
defer s.mu.RUnlock()
|
defer s.mu.RUnlock()
|
||||||
@@ -54,7 +57,8 @@ func (s *Service) ProsperMap() map[string]float64 {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
// Adjust returns the manual weight multiplier for a hostname (default 1.0).
|
// Adjust 返回指定主机名的人工权重倍数(默认 1.0)。
|
||||||
|
// 允许管理员通过调整表提升或降低某些域名的爬取/搜索优先级。
|
||||||
func (s *Service) Adjust(host string) float64 {
|
func (s *Service) Adjust(host string) float64 {
|
||||||
s.mu.RLock()
|
s.mu.RLock()
|
||||||
defer s.mu.RUnlock()
|
defer s.mu.RUnlock()
|
||||||
@@ -64,17 +68,19 @@ func (s *Service) Adjust(host string) float64 {
|
|||||||
return 1.0
|
return 1.0
|
||||||
}
|
}
|
||||||
|
|
||||||
// IsBlocked returns true if the word is in the blocked list.
|
// IsBlocked 判断某词是否在屏蔽词列表中(搜索时不返回含该词的结果)。
|
||||||
func (s *Service) IsBlocked(word string) bool {
|
func (s *Service) IsBlocked(word string) bool {
|
||||||
s.mu.RLock()
|
s.mu.RLock()
|
||||||
defer s.mu.RUnlock()
|
defer s.mu.RUnlock()
|
||||||
return s.blockedWords[word]
|
return s.blockedWords[word]
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- loaders ----
|
// ---- 数据加载函数 ----
|
||||||
|
|
||||||
|
// backlinkBaseline 繁荣表归一化的基准值(用于将原始链接数映射到固定区间)。
|
||||||
const backlinkBaseline = 200000.0
|
const backlinkBaseline = 200000.0
|
||||||
|
|
||||||
|
// loadProsperMap 从 storage/prosper.json 加载繁荣表,并进行归一化和域名树传播。
|
||||||
func loadProsperMap(storagePath string) map[string]float64 {
|
func loadProsperMap(storagePath string) map[string]float64 {
|
||||||
path := filepath.Join(storagePath, "prosper.json")
|
path := filepath.Join(storagePath, "prosper.json")
|
||||||
f, err := os.Open(path)
|
f, err := os.Open(path)
|
||||||
@@ -89,7 +95,11 @@ func loadProsperMap(storagePath string) map[string]float64 {
|
|||||||
return normalise(raw)
|
return normalise(raw)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// normalise 对繁荣表进行归一化,并执行域名树传播。
|
||||||
|
// 归一化:将所有顶级域名的分数总和缩放到 backlinkBaseline。
|
||||||
|
// 传播:子域名分数向上传播到父域名(父域名分数不低于任何子域名)。
|
||||||
func normalise(d map[string]float64) map[string]float64 {
|
func normalise(d map[string]float64) map[string]float64 {
|
||||||
|
// 计算顶级域名(不含 "/")的分数总和
|
||||||
total := 0.0
|
total := 0.0
|
||||||
for k, v := range d {
|
for k, v := range d {
|
||||||
if !strings.Contains(k, "/") {
|
if !strings.Contains(k, "/") {
|
||||||
@@ -99,12 +109,13 @@ func normalise(d map[string]float64) map[string]float64 {
|
|||||||
if total == 0 {
|
if total == 0 {
|
||||||
return d
|
return d
|
||||||
}
|
}
|
||||||
|
// 按总和归一化
|
||||||
factor := backlinkBaseline / total
|
factor := backlinkBaseline / total
|
||||||
out := make(map[string]float64, len(d))
|
out := make(map[string]float64, len(d))
|
||||||
for k, v := range d {
|
for k, v := range d {
|
||||||
out[k] = v * factor
|
out[k] = v * factor
|
||||||
}
|
}
|
||||||
// Propagate max score up the domain tree
|
// 域名树传播:子域名分数 ≥ 父域名分数
|
||||||
for k, v := range out {
|
for k, v := range out {
|
||||||
now := k
|
now := k
|
||||||
for {
|
for {
|
||||||
@@ -112,9 +123,9 @@ func normalise(d map[string]float64) map[string]float64 {
|
|||||||
if idx < 0 {
|
if idx < 0 {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
now = now[idx+1:]
|
now = now[idx+1:] // 上移一级
|
||||||
if cur, ok := out[now]; ok && cur < v {
|
if cur, ok := out[now]; ok && cur < v {
|
||||||
out[now] = v
|
out[now] = v // 父域名分数不低于子域名
|
||||||
} else if !ok {
|
} else if !ok {
|
||||||
break
|
break
|
||||||
}
|
}
|
||||||
@@ -123,8 +134,9 @@ func normalise(d map[string]float64) map[string]float64 {
|
|||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// loadAdjustTable 从 data/adjust.json 加载人工调整表(主机名 → 权重倍数)。
|
||||||
|
// 文件不存在时返回空 map(所有域名权重为默认 1.0)。
|
||||||
func loadAdjustTable() map[string]float64 {
|
func loadAdjustTable() map[string]float64 {
|
||||||
// Try loading from data/adjust.json — fallback if absent
|
|
||||||
f, err := os.Open(filepath.Join("data", "adjust.json"))
|
f, err := os.Open(filepath.Join("data", "adjust.json"))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return map[string]float64{}
|
return map[string]float64{}
|
||||||
@@ -135,6 +147,8 @@ func loadAdjustTable() map[string]float64 {
|
|||||||
return m
|
return m
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// loadBlockedWords 从 data/blocked_words.json 加载屏蔽词列表。
|
||||||
|
// 文件不存在时返回空集合。
|
||||||
func loadBlockedWords() map[string]bool {
|
func loadBlockedWords() map[string]bool {
|
||||||
f, err := os.Open(filepath.Join("data", "blocked_words.json"))
|
f, err := os.Open(filepath.Join("data", "blocked_words.json"))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -150,7 +164,8 @@ func loadBlockedWords() map[string]bool {
|
|||||||
return m
|
return m
|
||||||
}
|
}
|
||||||
|
|
||||||
// prosperFor computes the prosperity score for a URL by decomposing it.
|
// prosperFor 对 URL 按路径段分解查询繁荣表,计算综合繁荣分数。
|
||||||
|
// 分数计算:对每段取 Log2 变换后累加,返回值范围约 [0.1, +∞)。
|
||||||
func prosperFor(rawURL string, pm map[string]float64) float64 {
|
func prosperFor(rawURL string, pm map[string]float64) float64 {
|
||||||
segments := decomposeURL(rawURL)
|
segments := decomposeURL(rawURL)
|
||||||
s := 0.0
|
s := 0.0
|
||||||
@@ -161,7 +176,7 @@ func prosperFor(rawURL string, pm map[string]float64) float64 {
|
|||||||
}
|
}
|
||||||
l := 0.0
|
l := 0.0
|
||||||
if t > 0 {
|
if t > 0 {
|
||||||
l = math.Log2(2+t*2) - 1
|
l = math.Log2(2+t*2) - 1 // Log2(2+2t)-1,t=0 时为 0,随 t 增大而增大
|
||||||
}
|
}
|
||||||
if s == 0 {
|
if s == 0 {
|
||||||
if l == 0 {
|
if l == 0 {
|
||||||
@@ -169,7 +184,7 @@ func prosperFor(rawURL string, pm map[string]float64) float64 {
|
|||||||
}
|
}
|
||||||
s = l
|
s = l
|
||||||
} else {
|
} else {
|
||||||
s = l + math.Log((s-l)/2+1)
|
s = l + math.Log((s-l)/2+1) // 累加并衰减
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if s > 0 {
|
if s > 0 {
|
||||||
@@ -178,7 +193,9 @@ func prosperFor(rawURL string, pm map[string]float64) float64 {
|
|||||||
return 0
|
return 0
|
||||||
}
|
}
|
||||||
|
|
||||||
// decomposeURL yields "domain.tld", "domain.tld/path", "domain.tld/path/sub", ...
|
// decomposeURL 将 URL 分解为递增的路径段。
|
||||||
|
// 例如:"https://zh.wikipedia.org/wiki/Go" → ["zh.wikipedia.org", "zh.wikipedia.org/wiki", "zh.wikipedia.org/wiki/Go"]。
|
||||||
|
// 用于按从泛到精的顺序查繁荣表。
|
||||||
func decomposeURL(rawURL string) []string {
|
func decomposeURL(rawURL string) []string {
|
||||||
u := strings.ToLower(rawURL)
|
u := strings.ToLower(rawURL)
|
||||||
if strings.HasPrefix(u, "https://") {
|
if strings.HasPrefix(u, "https://") {
|
||||||
@@ -188,18 +205,19 @@ func decomposeURL(rawURL string) []string {
|
|||||||
} else {
|
} else {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
u = strings.ReplaceAll(u, "?", "/")
|
u = strings.ReplaceAll(u, "?", "/") // 查询参数转路径
|
||||||
u = strings.ReplaceAll(u, "#", "/")
|
u = strings.ReplaceAll(u, "#", "/") // 锚点转路径
|
||||||
u = strings.TrimRight(u, "/")
|
u = strings.TrimRight(u, "/")
|
||||||
|
// 过滤无效格式
|
||||||
if u == "" || u[0] == '/' || u[0] == '%' || u[0] == ' ' {
|
if u == "" || u[0] == '/' || u[0] == '%' || u[0] == ' ' {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
parts := strings.Split(u, "/")
|
parts := strings.Split(u, "/")
|
||||||
var out []string
|
var out []string
|
||||||
current := parts[0]
|
current := parts[0]
|
||||||
out = append(out, current)
|
out = append(out, current) // 第一段:顶级域名
|
||||||
for _, p := range parts[1:] {
|
for _, p := range parts[1:] {
|
||||||
current = current + "/" + p
|
current = current + "/" + p // 逐步拼接路径段
|
||||||
out = append(out, current)
|
out = append(out, current)
|
||||||
}
|
}
|
||||||
return out
|
return out
|
||||||
|
|||||||
@@ -1,60 +1,65 @@
|
|||||||
// sese-engine — Go rewrite
|
// sese-engine — Go rewrite
|
||||||
|
// Go 版 sese-engine:个人搜索引擎的主入口文件。
|
||||||
//
|
//
|
||||||
// All modules (harvester, search server, crawler, backlink calculator) are
|
// 所有模块(爬虫、收获服务器、搜索服务器、反向链接计算)均作为 goroutine 在同一进程中启动。
|
||||||
// launched as goroutines from this single binary. The binary blocks until
|
// 主线程阻塞等待系统信号(Ctrl-C / SIGTERM),收到后优雅退出。
|
||||||
// interrupted (Ctrl-C / SIGTERM).
|
|
||||||
//
|
//
|
||||||
// Usage:
|
// 运行方式:
|
||||||
//
|
//
|
||||||
// cd golang && go run . [--storage ./savedata] [--entry https://zh.wikipedia.org/]
|
// cd golang && go run . [--storage ./savedata] [--entry https://zh.wikipedia.org/]
|
||||||
package main
|
package main
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"flag"
|
"flag" // 命令行参数解析
|
||||||
"fmt"
|
"fmt" // 格式化(搜索服务端口)
|
||||||
"log"
|
"log" // 日志输出
|
||||||
"os"
|
"os" // 操作系统信号
|
||||||
"os/signal"
|
"os/signal" // 信号捕获
|
||||||
"syscall"
|
"syscall" // 系统调用(SIGTERM)
|
||||||
|
|
||||||
"sese-engine/analyzer"
|
"sese-engine/analyzer" // 文本分析和关键词提取
|
||||||
"sese-engine/backlink"
|
"sese-engine/backlink" // 反向链接(繁荣值)计算
|
||||||
"sese-engine/config"
|
"sese-engine/config" // 全局配置
|
||||||
"sese-engine/crawler"
|
"sese-engine/crawler" // BFS 爬虫
|
||||||
"sese-engine/harvester"
|
"sese-engine/harvester" // 收获服务器(索引写入)
|
||||||
"sese-engine/info"
|
"sese-engine/info" // info 服务(繁荣表、调整表、屏蔽词)
|
||||||
"sese-engine/search"
|
"sese-engine/search" // 搜索服务器
|
||||||
"sese-engine/storage"
|
"sese-engine/storage" // 持久化存储
|
||||||
)
|
)
|
||||||
|
|
||||||
func main() {
|
func main() {
|
||||||
|
// ---- 命令行参数 ----
|
||||||
|
// --storage:存储根目录路径,默认使用 config.StoragePath
|
||||||
storageDir := flag.String("storage", config.StoragePath, "path to savedata directory")
|
storageDir := flag.String("storage", config.StoragePath, "path to savedata directory")
|
||||||
|
// --entry:BFS 爬取的起始 URL,默认使用 config.EntryURL(维基百科中文首页)
|
||||||
entryURL := flag.String("entry", config.EntryURL, "BFS crawl entry URL")
|
entryURL := flag.String("entry", config.EntryURL, "BFS crawl entry URL")
|
||||||
|
// --stopwords:屏蔽词 JSON 文件路径
|
||||||
stopWords := flag.String("stopwords", "../data/标点符号.json", "path to stop-words JSON")
|
stopWords := flag.String("stopwords", "../data/标点符号.json", "path to stop-words JSON")
|
||||||
flag.Parse()
|
flag.Parse()
|
||||||
|
|
||||||
|
// 设置日志格式:时间戳 + 短文件名
|
||||||
log.SetFlags(log.LstdFlags | log.Lshortfile)
|
log.SetFlags(log.LstdFlags | log.Lshortfile)
|
||||||
log.Printf("sese-engine starting storage=%s entry=%s", *storageDir, *entryURL)
|
log.Printf("sese-engine starting storage=%s entry=%s", *storageDir, *entryURL)
|
||||||
|
|
||||||
// ---- 1. Storage ----
|
// ---- 1. 存储层:打开 bbolt 数据库 ----
|
||||||
db, err := storage.Open(*storageDir)
|
db, err := storage.Open(*storageDir)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("failed to open storage: %v", err)
|
log.Fatalf("failed to open storage: %v", err)
|
||||||
}
|
}
|
||||||
defer db.Close()
|
defer db.Close()
|
||||||
|
|
||||||
// ---- 2. Info service ----
|
// ---- 2. Info 服务:加载繁荣表、调整表和屏蔽词 ----
|
||||||
infoSvc := info.New(*storageDir)
|
infoSvc := info.New(*storageDir)
|
||||||
|
|
||||||
// ---- 3. Analyzer ----
|
// ---- 3. Analyzer:初始化分词器和语言检测器 ----
|
||||||
// modelPath is unused (lingua-go uses built-in language models, no external file needed)
|
// modelPath 参数已废弃(lingua-go 使用内置模型,无需外部文件)
|
||||||
anal, err := analyzer.New("", *stopWords)
|
anal, err := analyzer.New("", *stopWords)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Fatalf("failed to init analyzer: %v", err)
|
log.Fatalf("failed to init analyzer: %v", err)
|
||||||
}
|
}
|
||||||
defer anal.Close()
|
defer anal.Close()
|
||||||
|
|
||||||
// ---- 4. Harvester (index write server on :5000) ----
|
// ---- 4. 收获服务器(:5000):接收爬虫发来的索引数据 ----
|
||||||
harvSrv := harvester.New(db, infoSvc)
|
harvSrv := harvester.New(db, infoSvc)
|
||||||
go func() {
|
go func() {
|
||||||
if err := harvSrv.ListenAndServe(":5000"); err != nil {
|
if err := harvSrv.ListenAndServe(":5000"); err != nil {
|
||||||
@@ -62,7 +67,7 @@ func main() {
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
// ---- 5. Search server ----
|
// ---- 5. 搜索服务器(默认 :80):对外提供搜索 API ----
|
||||||
searchSrv := search.New(db, infoSvc, anal)
|
searchSrv := search.New(db, infoSvc, anal)
|
||||||
go func() {
|
go func() {
|
||||||
addr := fmt.Sprintf(":%d", config.SearchServerPort)
|
addr := fmt.Sprintf(":%d", config.SearchServerPort)
|
||||||
@@ -71,18 +76,20 @@ func main() {
|
|||||||
}
|
}
|
||||||
}()
|
}()
|
||||||
|
|
||||||
// ---- 6. Backlink calculator (runs every 48 h) ----
|
// ---- 6. 反向链接计算器:每 48 小时运行一次 ----
|
||||||
bl := backlink.New(db, *storageDir)
|
bl := backlink.New(db, *storageDir)
|
||||||
go bl.Run()
|
go bl.Run()
|
||||||
|
|
||||||
// ---- 7. Crawler ----
|
// ---- 7. 爬虫:从入口 URL 开始 BFS 爬取 ----
|
||||||
|
// 从 info 服务获取繁荣表快照,用于调度优先级决策
|
||||||
prosperMap := infoSvc.ProsperMap()
|
prosperMap := infoSvc.ProsperMap()
|
||||||
crawl := crawler.New(db, anal, prosperMap)
|
crawl := crawler.New(db, anal, prosperMap)
|
||||||
go crawl.Run(*entryURL, config.MaxEpoch)
|
go crawl.Run(*entryURL, config.MaxEpoch)
|
||||||
|
|
||||||
log.Println("all modules started — press Ctrl-C to stop")
|
log.Println("all modules started — press Ctrl-C to stop")
|
||||||
|
|
||||||
// ---- Graceful shutdown ----
|
// ---- 优雅退出 ----
|
||||||
|
// 阻塞等待 SIGINT(Ctrl-C)或 SIGTERM 信号
|
||||||
quit := make(chan os.Signal, 1)
|
quit := make(chan os.Signal, 1)
|
||||||
signal.Notify(quit, os.Interrupt, syscall.SIGTERM)
|
signal.Notify(quit, os.Interrupt, syscall.SIGTERM)
|
||||||
<-quit
|
<-quit
|
||||||
|
|||||||
+46
-20
@@ -1,36 +1,46 @@
|
|||||||
// Package parser extracts title, description, text content, and links from HTML.
|
// Package parser extracts title, description, text content, and links from HTML.
|
||||||
|
// parser 包负责 HTML 解析:从网页 HTML 中提取标题、描述、正文和所有超链接。
|
||||||
package parser
|
package parser
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"path"
|
"path" // 路径处理(提取目录、规范化相对路径)
|
||||||
"regexp"
|
"regexp" // 正则表达式(空白字符替换)
|
||||||
"strings"
|
"strings" // 字符串操作
|
||||||
|
|
||||||
"golang.org/x/net/html"
|
"golang.org/x/net/html" // 标准 HTML 解析器(将 HTML 解析为 DOM 树)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// wsRe 空白字符正则:将任意连续空白字符(空格、换行、制表符等)替换为单个空格。
|
||||||
var wsRe = regexp.MustCompile(`\s+`)
|
var wsRe = regexp.MustCompile(`\s+`)
|
||||||
|
|
||||||
// ParseHTML parses an HTML document and returns title, meta description, body text, and href list.
|
// ParseHTML 解析 HTML 文档,返回标题、meta 描述、正文文本和所有超链接列表。
|
||||||
|
// body:原始 HTML 字符串;baseURL:用于解析相对链接的基准 URL。
|
||||||
func ParseHTML(body, baseURL string) (title, description, text string, hrefs []string) {
|
func ParseHTML(body, baseURL string) (title, description, text string, hrefs []string) {
|
||||||
// Determine base scheme+host
|
// 从 baseURL 提取基准协议和主机(如 "https://example.com")
|
||||||
base := baseFromURL(baseURL)
|
base := baseFromURL(baseURL)
|
||||||
|
// 从 baseURL 提取当前页面路径(如 "/path/page.html")
|
||||||
basePath := pathFromURL(baseURL)
|
basePath := pathFromURL(baseURL)
|
||||||
|
|
||||||
|
// 将 HTML 字符串解析为 DOM 树
|
||||||
doc, err := html.Parse(strings.NewReader(body))
|
doc, err := html.Parse(strings.NewReader(body))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return
|
return // 解析失败返回空
|
||||||
}
|
}
|
||||||
|
|
||||||
var textParts []string
|
var textParts []string // 收集所有正文文本片段
|
||||||
|
|
||||||
|
// 深度优先遍历 DOM 树
|
||||||
var dfs func(n *html.Node)
|
var dfs func(n *html.Node)
|
||||||
dfs = func(n *html.Node) {
|
dfs = func(n *html.Node) {
|
||||||
if n.Type == html.ElementNode {
|
if n.Type == html.ElementNode {
|
||||||
tag := strings.ToLower(n.Data)
|
tag := strings.ToLower(n.Data)
|
||||||
|
|
||||||
|
// 跳过 <script>、<style>、<svg> 等无需解析内容的标签
|
||||||
if tag == "script" || tag == "style" || tag == "svg" {
|
if tag == "script" || tag == "style" || tag == "svg" {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 提取 <meta name="description" content="..."> 标签
|
||||||
if tag == "meta" {
|
if tag == "meta" {
|
||||||
name := ""
|
name := ""
|
||||||
content := ""
|
content := ""
|
||||||
@@ -42,15 +52,20 @@ func ParseHTML(body, baseURL string) (title, description, text string, hrefs []s
|
|||||||
content = a.Val
|
content = a.Val
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// 只取第一个描述
|
||||||
if name == "description" && description == "" {
|
if name == "description" && description == "" {
|
||||||
description = content
|
description = content
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 提取 <a href="..."> 链接
|
||||||
if tag == "a" {
|
if tag == "a" {
|
||||||
href := attrVal(n, "href")
|
href := attrVal(n, "href")
|
||||||
if href != "" {
|
if href != "" {
|
||||||
|
// 去除 URL 中的锚点(#fragment)
|
||||||
href = strings.SplitN(href, "#", 2)[0]
|
href = strings.SplitN(href, "#", 2)[0]
|
||||||
if href != "" {
|
if href != "" {
|
||||||
|
// 解析为绝对 URL(处理相对路径、协议相对路径等)
|
||||||
href = resolveURL(base, basePath, href)
|
href = resolveURL(base, basePath, href)
|
||||||
if href != "" {
|
if href != "" {
|
||||||
hrefs = append(hrefs, href)
|
hrefs = append(hrefs, href)
|
||||||
@@ -60,36 +75,42 @@ func ParseHTML(body, baseURL string) (title, description, text string, hrefs []s
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 提取文本节点(<title> 和正文内容)
|
||||||
if n.Type == html.TextNode && n.Parent != nil {
|
if n.Type == html.TextNode && n.Parent != nil {
|
||||||
parentTag := ""
|
parentTag := ""
|
||||||
if n.Parent.Type == html.ElementNode {
|
if n.Parent.Type == html.ElementNode {
|
||||||
parentTag = strings.ToLower(n.Parent.Data)
|
parentTag = strings.ToLower(n.Parent.Data)
|
||||||
}
|
}
|
||||||
|
// 跳过 script/style/svg 内的文本
|
||||||
if parentTag == "script" || parentTag == "style" || parentTag == "svg" {
|
if parentTag == "script" || parentTag == "style" || parentTag == "svg" {
|
||||||
goto children
|
goto children
|
||||||
}
|
}
|
||||||
|
// 空白压缩并去除首尾空格
|
||||||
s := wsRe.ReplaceAllString(n.Data, " ")
|
s := wsRe.ReplaceAllString(n.Data, " ")
|
||||||
s = strings.TrimSpace(s)
|
s = strings.TrimSpace(s)
|
||||||
if s != "" {
|
if s != "" {
|
||||||
if parentTag == "title" {
|
if parentTag == "title" {
|
||||||
title = s
|
title = s // 标题只取第一个
|
||||||
} else {
|
} else {
|
||||||
textParts = append(textParts, s)
|
textParts = append(textParts, s) // 正文片段收集
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
children:
|
children:
|
||||||
|
// 递归遍历子节点
|
||||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||||
dfs(c)
|
dfs(c)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
dfs(doc)
|
dfs(doc)
|
||||||
|
|
||||||
|
// 将正文片段用空格连接为完整文本
|
||||||
text = strings.Join(textParts, " ")
|
text = strings.Join(textParts, " ")
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// attrVal 提取 HTML 节点上指定名称的属性值(不区分大小写)。
|
||||||
func attrVal(n *html.Node, key string) string {
|
func attrVal(n *html.Node, key string) string {
|
||||||
for _, a := range n.Attr {
|
for _, a := range n.Attr {
|
||||||
if strings.ToLower(a.Key) == key {
|
if strings.ToLower(a.Key) == key {
|
||||||
@@ -99,6 +120,8 @@ func attrVal(n *html.Node, key string) string {
|
|||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// baseFromURL 从原始 URL 提取 "scheme://host" 部分(不含路径)。
|
||||||
|
// 例如:"https://example.com/path/page" → "https://example.com"。
|
||||||
func baseFromURL(rawURL string) string {
|
func baseFromURL(rawURL string) string {
|
||||||
idx := strings.Index(rawURL, "://")
|
idx := strings.Index(rawURL, "://")
|
||||||
if idx < 0 {
|
if idx < 0 {
|
||||||
@@ -107,11 +130,13 @@ func baseFromURL(rawURL string) string {
|
|||||||
rest := rawURL[idx+3:]
|
rest := rawURL[idx+3:]
|
||||||
slash := strings.Index(rest, "/")
|
slash := strings.Index(rest, "/")
|
||||||
if slash < 0 {
|
if slash < 0 {
|
||||||
return rawURL
|
return rawURL // 无路径,直接返回整个 URL
|
||||||
}
|
}
|
||||||
return rawURL[:idx+3+slash]
|
return rawURL[:idx+3+slash]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// pathFromURL 从原始 URL 提取路径部分(不含域名)。
|
||||||
|
// 例如:"https://example.com/path/page?q=1#top" → "/path/page"。
|
||||||
func pathFromURL(rawURL string) string {
|
func pathFromURL(rawURL string) string {
|
||||||
idx := strings.Index(rawURL, "://")
|
idx := strings.Index(rawURL, "://")
|
||||||
if idx < 0 {
|
if idx < 0 {
|
||||||
@@ -122,32 +147,33 @@ func pathFromURL(rawURL string) string {
|
|||||||
if slash < 0 {
|
if slash < 0 {
|
||||||
return "/"
|
return "/"
|
||||||
}
|
}
|
||||||
p := rest[slash:]
|
p := rest[slash:] // 从第一个斜杠开始即为路径
|
||||||
// strip query/fragment
|
// 去除查询字符串和锚点
|
||||||
p = strings.SplitN(p, "?", 2)[0]
|
p = strings.SplitN(p, "?", 2)[0]
|
||||||
p = strings.SplitN(p, "#", 2)[0]
|
p = strings.SplitN(p, "#", 2)[0]
|
||||||
return p
|
return p
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// resolveURL 将相对 href 解析为绝对 URL,参考 base(协议+主机)和 basePath(当前页面路径)。
|
||||||
|
// 支持:http://、https:// 绝对 URL;// 协议相对 URL;/ 绝对路径;相对路径。
|
||||||
func resolveURL(base, basePath, href string) string {
|
func resolveURL(base, basePath, href string) string {
|
||||||
// Absolute URL
|
// 已经是绝对 URL,直接返回
|
||||||
if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
|
if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
|
||||||
return href
|
return href
|
||||||
}
|
}
|
||||||
// Protocol-relative
|
// 协议相对 URL(以 // 开头),补上协议
|
||||||
if strings.HasPrefix(href, "//") {
|
if strings.HasPrefix(href, "//") {
|
||||||
// extract scheme from base
|
|
||||||
idx := strings.Index(base, "://")
|
idx := strings.Index(base, "://")
|
||||||
if idx < 0 {
|
if idx < 0 {
|
||||||
return ""
|
return ""
|
||||||
}
|
}
|
||||||
return base[:idx+1] + href
|
return base[:idx+1] + href
|
||||||
}
|
}
|
||||||
// Absolute path
|
// 绝对路径(以 / 开头),拼接域名
|
||||||
if strings.HasPrefix(href, "/") {
|
if strings.HasPrefix(href, "/") {
|
||||||
return base + href
|
return base + href
|
||||||
}
|
}
|
||||||
// Relative path
|
// 相对路径:基于当前页面目录拼接
|
||||||
dir := path.Dir(basePath)
|
dir := path.Dir(basePath) // 提取当前页面的目录部分
|
||||||
return base + path.Clean(dir+"/"+href)
|
return base + path.Clean(dir+"/"+href) // path.Clean 规范化,去除多余的 ../ 等
|
||||||
}
|
}
|
||||||
|
|||||||
+119
-79
@@ -1,35 +1,36 @@
|
|||||||
// Package search implements the user-facing search HTTP server.
|
// Package search implements the user-facing search HTTP server.
|
||||||
|
// search 包对外提供 HTTP 搜索服务,接收查询请求并返回按多因子排序的搜索结果。
|
||||||
package search
|
package search
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"container/heap"
|
"container/heap" // 堆结构(域名交错排序)
|
||||||
"encoding/json"
|
"encoding/json" // JSON 序列化(响应输出)
|
||||||
"log"
|
"log" // 日志
|
||||||
"math"
|
"math" // 数学运算(Log、幂)
|
||||||
"net/http"
|
"net/http" // HTTP 服务端
|
||||||
"net/url"
|
"net/url" // URL 解析
|
||||||
"regexp"
|
"regexp" // 正则表达式(site: 过滤语法)
|
||||||
"sort"
|
"sort" // 排序
|
||||||
"strings"
|
"strings" // 字符串操作
|
||||||
"sync"
|
"sync" // 互斥锁(保护并发切片写入)
|
||||||
"time"
|
"time" // 时间戳
|
||||||
|
|
||||||
"sese-engine/analyzer"
|
"sese-engine/analyzer" // 分词和语种检测
|
||||||
"sese-engine/config"
|
"sese-engine/config" // 排序权重配置
|
||||||
"sese-engine/info"
|
"sese-engine/info" // info 服务
|
||||||
"sese-engine/parser"
|
"sese-engine/parser" // HTML 解析(在线摘要)
|
||||||
"sese-engine/storage"
|
"sese-engine/storage" // 持久化存储
|
||||||
)
|
)
|
||||||
|
|
||||||
// Server is the search HTTP server.
|
// Server 是搜索 HTTP 服务器。
|
||||||
type Server struct {
|
type Server struct {
|
||||||
db *storage.DB
|
db *storage.DB
|
||||||
infoSvc *info.Service
|
infoSvc *info.Service
|
||||||
analyzer *analyzer.Analyzer
|
analyzer *analyzer.Analyzer
|
||||||
httpCli *http.Client // for online snippet fetching
|
httpCli *http.Client // 在线摘要抓取(无 robots.txt 检查)
|
||||||
}
|
}
|
||||||
|
|
||||||
// New creates a search Server.
|
// New 创建一个 search Server。
|
||||||
func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
||||||
return &Server{
|
return &Server{
|
||||||
db: db,
|
db: db,
|
||||||
@@ -41,51 +42,59 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handler returns the http.Handler.
|
// Handler 返回 HTTP 路由处理器。
|
||||||
func (s *Server) Handler() http.Handler {
|
func (s *Server) Handler() http.Handler {
|
||||||
mux := http.NewServeMux()
|
mux := http.NewServeMux()
|
||||||
mux.HandleFunc("/search", s.handleSearch)
|
mux.HandleFunc("/search", s.handleSearch)
|
||||||
return mux
|
return mux
|
||||||
}
|
}
|
||||||
|
|
||||||
// ListenAndServe starts the search server.
|
// ListenAndServe 启动搜索服务器。
|
||||||
func (s *Server) ListenAndServe(addr string) error {
|
func (s *Server) ListenAndServe(addr string) error {
|
||||||
log.Printf("[search] listening on %s", addr)
|
log.Printf("[search] listening on %s", addr)
|
||||||
return http.ListenAndServe(addr, s.Handler())
|
return http.ListenAndServe(addr, s.Handler())
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- search handler ----
|
// ---- 搜索处理器 ----
|
||||||
|
|
||||||
|
// searchResponse 是搜索 API 的 JSON 响应结构。
|
||||||
type searchResponse struct {
|
type searchResponse struct {
|
||||||
Tokens []string `json:"tokens"`
|
Tokens []string `json:"tokens"` // 查询的分词结果
|
||||||
Counts map[string]int `json:"counts"`
|
Counts map[string]int `json:"counts"` // 每个词在索引中出现的 URL 数量
|
||||||
Results []searchResult `json:"results"`
|
Results []searchResult `json:"results"` // 排序后的搜索结果列表
|
||||||
Total int `json:"total"`
|
Total int `json:"total"` // 符合 site: 过滤条件前的总候选数
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// searchResult 是单条搜索结果。
|
||||||
type searchResult struct {
|
type searchResult struct {
|
||||||
Score float64 `json:"score"`
|
Score float64 `json:"score"` // 综合排序分数
|
||||||
URL string `json:"url"`
|
URL string `json:"url"` // 页面 URL
|
||||||
Snippet *snippetInfo `json:"snippet,omitempty"`
|
Snippet *snippetInfo `json:"snippet,omitempty"` // 摘要信息(标题/描述/正文)
|
||||||
Relevance map[string]float64 `json:"relevance"`
|
Relevance map[string]float64 `json:"relevance"` // 每个关键词在该 URL 下的权重
|
||||||
DomainCount int `json:"domain_count"`
|
DomainCount int `json:"domain_count"` // 该 URL 所属域名的总候选数
|
||||||
Factors map[string]float64 `json:"factors,omitempty"`
|
Factors map[string]float64 `json:"factors,omitempty"` // 各排序因子的详细分数
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// snippetInfo 封装页面摘要的标题、描述和正文片段。
|
||||||
type snippetInfo struct {
|
type snippetInfo struct {
|
||||||
Title string `json:"title"`
|
Title string `json:"title"` // 页面标题
|
||||||
Description string `json:"description"`
|
Description string `json:"description"` // meta description
|
||||||
Text string `json:"text"`
|
Text string `json:"text"` // 正文前 256 字符
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// siteRe 用于匹配 site: 过滤语法的正则(支持 site:example.com 语法)。
|
||||||
var siteRe = regexp.MustCompile(`^site:(.+)$`)
|
var siteRe = regexp.MustCompile(`^site:(.+)$`)
|
||||||
|
|
||||||
|
// handleSearch 处理 GET /search 请求。
|
||||||
|
// 参数:q(查询词),qh(URL 编码的查询词),slice(分页范围,格式 "from:to")。
|
||||||
func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
|
func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
|
||||||
w.Header().Set("Access-Control-Allow-Origin", "*")
|
w.Header().Set("Access-Control-Allow-Origin", "*") // 允许跨域
|
||||||
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||||
|
|
||||||
|
// 获取查询词
|
||||||
q := r.URL.Query().Get("q")
|
q := r.URL.Query().Get("q")
|
||||||
if q == "" {
|
if q == "" {
|
||||||
|
// qh:URL 编码的查询词(用于含特殊字符的查询)
|
||||||
if qh := r.URL.Query().Get("qh"); qh != "" {
|
if qh := r.URL.Query().Get("qh"); qh != "" {
|
||||||
decoded, err := url.PathUnescape(qh)
|
decoded, err := url.PathUnescape(qh)
|
||||||
if err == nil {
|
if err == nil {
|
||||||
@@ -94,7 +103,7 @@ func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse slice param "0:10"
|
// 解析分页参数(格式 "0:10")
|
||||||
sliceStr := r.URL.Query().Get("slice")
|
sliceStr := r.URL.Query().Get("slice")
|
||||||
sliceFrom, sliceTo := 0, 10
|
sliceFrom, sliceTo := 0, 10
|
||||||
if sliceStr != "" {
|
if sliceStr != "" {
|
||||||
@@ -108,29 +117,30 @@ func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Parse tokens and site filter
|
// 解析查询分词,并提取 site: 过滤条件
|
||||||
var tokens []string
|
var tokens []string
|
||||||
var siteFilter string
|
var siteFilter string
|
||||||
for _, part := range strings.Fields(q) {
|
for _, part := range strings.Fields(q) {
|
||||||
if m := siteRe.FindStringSubmatch(part); len(m) > 1 {
|
if m := siteRe.FindStringSubmatch(part); len(m) > 1 {
|
||||||
siteFilter = m[1]
|
siteFilter = m[1] // site:example.com 提取目标主机名
|
||||||
} else {
|
} else {
|
||||||
segs := s.analyzer.Segment(part, false)
|
segs := s.analyzer.Segment(part, false)
|
||||||
for _, t := range segs {
|
for _, t := range segs {
|
||||||
if !s.infoSvc.IsBlocked(t) {
|
if !s.infoSvc.IsBlocked(t) { // 过滤屏蔽词
|
||||||
tokens = append(tokens, t)
|
tokens = append(tokens, t)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 最多保留 20 个词(避免查询过于宽泛)
|
||||||
if len(tokens) > 20 {
|
if len(tokens) > 20 {
|
||||||
tokens = tokens[:20]
|
tokens = tokens[:20]
|
||||||
}
|
}
|
||||||
|
|
||||||
results, total := s.query(tokens, sliceFrom, sliceTo, siteFilter)
|
results, total := s.query(tokens, sliceFrom, sliceTo, siteFilter)
|
||||||
|
|
||||||
// Count per keyword
|
// 统计每个词命中的 URL 数量(供前端展示)
|
||||||
counts := make(map[string]int, len(tokens))
|
counts := make(map[string]int, len(tokens))
|
||||||
for _, t := range tokens {
|
for _, t := range tokens {
|
||||||
entries, _ := s.db.GetIndex(t)
|
entries, _ := s.db.GetIndex(t)
|
||||||
@@ -146,21 +156,23 @@ func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
|
|||||||
json.NewEncoder(w).Encode(resp)
|
json.NewEncoder(w).Encode(resp)
|
||||||
}
|
}
|
||||||
|
|
||||||
// query executes the multi-keyword search and returns ranked results.
|
// query 执行多关键词搜索,返回排序后的结果列表。
|
||||||
|
// 搜索流程:加载倒排索引 → 构建 URL 候选集 → 多因子评分 → 域名交错重排 → 截取分页。
|
||||||
func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]searchResult, int) {
|
func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]searchResult, int) {
|
||||||
if len(tokens) == 0 {
|
if len(tokens) == 0 {
|
||||||
return nil, 0
|
return nil, 0
|
||||||
}
|
}
|
||||||
|
|
||||||
// Load inverted index for each token
|
// 加载每个词对应的倒排索引条目
|
||||||
type tokenIndex struct {
|
type tokenIndex struct {
|
||||||
token string
|
token string
|
||||||
entries []storage.IndexEntry
|
entries []storage.IndexEntry
|
||||||
defVal float64
|
defVal float64 // 缺省权重(词在索引中条目已满时使用)
|
||||||
}
|
}
|
||||||
tokenIndexes := make([]tokenIndex, 0, len(tokens))
|
tokenIndexes := make([]tokenIndex, 0, len(tokens))
|
||||||
for _, t := range tokens {
|
for _, t := range tokens {
|
||||||
entries, _ := s.db.GetIndex(t)
|
entries, _ := s.db.GetIndex(t)
|
||||||
|
// 计算缺省权重:当条目数达到上限时,权重低于第 MaxURLsPerKey 名的条目使用缺省权重
|
||||||
defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(config.MaxURLsPerKey)
|
defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(config.MaxURLsPerKey)
|
||||||
if len(entries) >= config.MaxURLsPerKey {
|
if len(entries) >= config.MaxURLsPerKey {
|
||||||
weights := make([]float64, len(entries))
|
weights := make([]float64, len(entries))
|
||||||
@@ -173,7 +185,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
|||||||
tokenIndexes = append(tokenIndexes, tokenIndex{t, entries, defVal})
|
tokenIndexes = append(tokenIndexes, tokenIndex{t, entries, defVal})
|
||||||
}
|
}
|
||||||
|
|
||||||
// Build URL → per-token weights map
|
// 构建 URL → (词 → 权重) 映射,收集所有候选 URL
|
||||||
urlWeights := make(map[string]map[string]float64)
|
urlWeights := make(map[string]map[string]float64)
|
||||||
for _, ti := range tokenIndexes {
|
for _, ti := range tokenIndexes {
|
||||||
for _, e := range ti.entries {
|
for _, e := range ti.entries {
|
||||||
@@ -184,7 +196,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Site filter
|
// site: 过滤
|
||||||
total := len(urlWeights)
|
total := len(urlWeights)
|
||||||
if siteFilter != "" {
|
if siteFilter != "" {
|
||||||
filtered := make(map[string]map[string]float64)
|
filtered := make(map[string]map[string]float64)
|
||||||
@@ -198,15 +210,16 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
|||||||
total = len(urlWeights)
|
total = len(urlWeights)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Build default value map
|
// 构建每个词对应的缺省权重 map
|
||||||
defVals := make(map[string]float64, len(tokenIndexes))
|
defVals := make(map[string]float64, len(tokenIndexes))
|
||||||
for _, ti := range tokenIndexes {
|
for _, ti := range tokenIndexes {
|
||||||
defVals[ti.token] = ti.defVal
|
defVals[ti.token] = ti.defVal
|
||||||
}
|
}
|
||||||
|
|
||||||
// Compute relevance + initial score for each URL
|
// 计算每个 URL 的相关性和初始分数
|
||||||
candidates := make([]candidate, 0, len(urlWeights))
|
candidates := make([]candidate, 0, len(urlWeights))
|
||||||
for u, vs := range urlWeights {
|
for u, vs := range urlWeights {
|
||||||
|
// 词权重相乘(贝叶斯概率近似),缺省权重填充缺失词
|
||||||
rel := 1.0
|
rel := 1.0
|
||||||
for _, ti := range tokenIndexes {
|
for _, ti := range tokenIndexes {
|
||||||
vp := vs[ti.token]
|
vp := vs[ti.token]
|
||||||
@@ -218,34 +231,37 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
|||||||
}
|
}
|
||||||
rel *= vp
|
rel *= vp
|
||||||
}
|
}
|
||||||
|
// 反向链接繁荣加分
|
||||||
prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight
|
prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight
|
||||||
bad := badURL(u)
|
bad := badURL(u)
|
||||||
adjust := s.infoSvc.Adjust(netloc(u))
|
adjust := s.infoSvc.Adjust(netloc(u))
|
||||||
|
// 基础分数 = 相关性 × 繁荣值 × URL质量 × 人工调整
|
||||||
score := rel * prosper * (1 - bad) * adjust * 0.1
|
score := rel * prosper * (1 - bad) * adjust * 0.1
|
||||||
|
|
||||||
|
// 12 维分数向量:分别记录各项因子,供后续多阶段调整
|
||||||
var vec [12]float64
|
var vec [12]float64
|
||||||
vec[0] = score
|
vec[0] = score // 0: 综合分数
|
||||||
vec[1] = rel
|
vec[1] = rel // 1: 相关性
|
||||||
vec[2] = prosper
|
vec[2] = prosper // 2: 繁荣值
|
||||||
vec[3] = 1 - bad
|
vec[3] = 1 - bad // 3: URL 质量
|
||||||
vec[4] = 1 // language multiplier placeholder
|
vec[4] = 1 // 4: 语种倍数(待填充)
|
||||||
vec[5] = 1 // repetition placeholder
|
vec[5] = 1 // 5: 重复惩罚(待填充)
|
||||||
vec[6] = adjust
|
vec[6] = adjust // 6: 人工调整
|
||||||
vec[7] = 1 // time multiplier placeholder
|
vec[7] = 1 // 7: 网站时间衰减(待填充)
|
||||||
vec[8] = 1 // consecutive keyword placeholder
|
vec[8] = 1 // 8: 连续词加成(待填充)
|
||||||
vec[9] = 1 // keyword content placeholder
|
vec[9] = 1 // 9: 关键词内容(预留)
|
||||||
vec[10] = 1 // URL time placeholder
|
vec[10] = 1 // 10: URL 时间衰减(待填充)
|
||||||
vec[11] = 0.1
|
vec[11] = 0.1 // 11: 常数因子
|
||||||
|
|
||||||
candidates = append(candidates, candidate{u, rel, vec})
|
candidates = append(candidates, candidate{u, rel, vec})
|
||||||
}
|
}
|
||||||
|
|
||||||
// Early relevance threshold
|
// 初步排序
|
||||||
sort.Slice(candidates, func(i, j int) bool {
|
sort.Slice(candidates, func(i, j int) bool {
|
||||||
return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
|
return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
|
||||||
})
|
})
|
||||||
|
|
||||||
// Apply site info factors to top 256
|
// 阶段一:加载网站信息,计算语种倍数和时间衰减(Top 256 并发)
|
||||||
now := time.Now().Unix()
|
now := time.Now().Unix()
|
||||||
limit256 := 256
|
limit256 := 256
|
||||||
if len(candidates) < 256 {
|
if len(candidates) < 256 {
|
||||||
@@ -264,6 +280,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
|||||||
timeMul := timeMul(siteInfo, now)
|
timeMul := timeMul(siteInfo, now)
|
||||||
urlTimeMul := urlTimeMul(s.db, c.url, now)
|
urlTimeMul := urlTimeMul(s.db, c.url, now)
|
||||||
|
|
||||||
|
// 更新综合分数和各项因子
|
||||||
c.scoreVec[0] = c.scoreVec[0] * 10 * langMul * timeMul * urlTimeMul
|
c.scoreVec[0] = c.scoreVec[0] * 10 * langMul * timeMul * urlTimeMul
|
||||||
c.scoreVec[4] = langMul
|
c.scoreVec[4] = langMul
|
||||||
c.scoreVec[7] = timeMul
|
c.scoreVec[7] = timeMul
|
||||||
@@ -276,7 +293,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
|||||||
return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
|
return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
|
||||||
})
|
})
|
||||||
|
|
||||||
// Apply consecutive-keyword and repetition bonuses to top 80
|
// 阶段二:连续词加成和标题重复惩罚(Top 80)
|
||||||
limit80 := 80
|
limit80 := 80
|
||||||
if len(candidates) < 80 {
|
if len(candidates) < 80 {
|
||||||
limit80 = len(candidates)
|
limit80 = len(candidates)
|
||||||
@@ -289,7 +306,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Repetition penaliser
|
// 重复惩罚:与前序结果标题相似度过高则降权
|
||||||
for i := 0; i < limit80; i++ {
|
for i := 0; i < limit80; i++ {
|
||||||
h := repetitionSimilarity(titles, i)
|
h := repetitionSimilarity(titles, i)
|
||||||
consecutive := consecutiveCount(titles[i], tokens)
|
consecutive := consecutiveCount(titles[i], tokens)
|
||||||
@@ -297,6 +314,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
|||||||
if h > 0.5 {
|
if h > 0.5 {
|
||||||
repMul = 1 - (h - 0.5)
|
repMul = 1 - (h - 0.5)
|
||||||
}
|
}
|
||||||
|
// 连续词出现越多,乘以 config.ConsecutiveKeyWeight(>1)加成
|
||||||
consMul := math.Pow(config.ConsecutiveKeyWeight, float64(consecutive))
|
consMul := math.Pow(config.ConsecutiveKeyWeight, float64(consecutive))
|
||||||
candidates[i].scoreVec[0] *= repMul * consMul
|
candidates[i].scoreVec[0] *= repMul * consMul
|
||||||
candidates[i].scoreVec[5] = repMul
|
candidates[i].scoreVec[5] = repMul
|
||||||
@@ -307,10 +325,10 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
|||||||
return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
|
return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
|
||||||
})
|
})
|
||||||
|
|
||||||
// Re-rank: interleave domains
|
// 阶段三:域名交错重排(使结果更丰富多样)
|
||||||
reranked := rerank(candidates, from, to)
|
reranked := rerank(candidates, from, to)
|
||||||
|
|
||||||
// Fetch snippets and build output
|
// 并发获取摘要
|
||||||
results := make([]searchResult, 0, len(reranked))
|
results := make([]searchResult, 0, len(reranked))
|
||||||
var snippetMu sync.Mutex
|
var snippetMu sync.Mutex
|
||||||
var snippetWg sync.WaitGroup
|
var snippetWg sync.WaitGroup
|
||||||
@@ -348,7 +366,7 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
|||||||
}
|
}
|
||||||
snippetWg.Wait()
|
snippetWg.Wait()
|
||||||
|
|
||||||
// Preserve order (goroutines may reorder)
|
// 保持 rerank 的原始顺序(并发写入打乱了顺序)
|
||||||
urlOrder := make(map[string]int)
|
urlOrder := make(map[string]int)
|
||||||
for i, c := range reranked {
|
for i, c := range reranked {
|
||||||
urlOrder[c.url] = i
|
urlOrder[c.url] = i
|
||||||
@@ -360,9 +378,9 @@ func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]sear
|
|||||||
return results, total
|
return results, total
|
||||||
}
|
}
|
||||||
|
|
||||||
// getSnippet fetches (or caches) a snippet for a URL.
|
// getSnippet 获取某 URL 的摘要,优先从缓存读取,缓存未命中则在线抓取。
|
||||||
func (s *Server) getSnippet(rawURL string) *snippetInfo {
|
func (s *Server) getSnippet(rawURL string) *snippetInfo {
|
||||||
// Try cache first
|
// 优先读缓存
|
||||||
if entry, err := s.db.GetSnippet(rawURL); err == nil {
|
if entry, err := s.db.GetSnippet(rawURL); err == nil {
|
||||||
snip := buildSnippet(entry)
|
snip := buildSnippet(entry)
|
||||||
return snip
|
return snip
|
||||||
@@ -370,7 +388,7 @@ func (s *Server) getSnippet(rawURL string) *snippetInfo {
|
|||||||
if !config.UseOnlineSnippet {
|
if !config.UseOnlineSnippet {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
// Fetch online with a simple HTTP client (no robots.txt check for search snippets)
|
// 在线抓取(不使用 robots.txt,适用于搜索摘要场景)
|
||||||
req, err := http.NewRequest("GET", rawURL, nil)
|
req, err := http.NewRequest("GET", rawURL, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil
|
return nil
|
||||||
@@ -409,8 +427,10 @@ func buildSnippet(entry *storage.SnippetEntry) *snippetInfo {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- scoring helpers ----
|
// ---- 评分辅助函数 ----
|
||||||
|
|
||||||
|
// languageMultiplier 根据网站的语种分布计算语种倍数。
|
||||||
|
// 中文占比越高且无关语言占比越低,则倍数越高(加分);否则降权。
|
||||||
func languageMultiplier(si *storage.SiteInfo) float64 {
|
func languageMultiplier(si *storage.SiteInfo) float64 {
|
||||||
if si == nil || len(si.Languages) == 0 {
|
if si == nil || len(si.Languages) == 0 {
|
||||||
return 1.0
|
return 1.0
|
||||||
@@ -424,27 +444,29 @@ func languageMultiplier(si *storage.SiteInfo) float64 {
|
|||||||
return 1 + chinese*config.LanguageWeight - weird*config.LanguageWeight
|
return 1 + chinese*config.LanguageWeight - weird*config.LanguageWeight
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// timeMul 根据网站最后访问时间计算时间衰减倍数(越久远衰减越多)。
|
||||||
func timeMul(si *storage.SiteInfo, now int64) float64 {
|
func timeMul(si *storage.SiteInfo, now int64) float64 {
|
||||||
if si == nil {
|
if si == nil {
|
||||||
return 1.0
|
return 1.0
|
||||||
}
|
}
|
||||||
t := si.LastVisitTime
|
t := si.LastVisitTime
|
||||||
if t == 0 {
|
if t == 0 {
|
||||||
t = 1648000000
|
t = 1648000000 // 默认时间戳(2022 年初)
|
||||||
}
|
}
|
||||||
days := (now - t) / (3600 * 24)
|
days := (now - t) / (3600 * 24)
|
||||||
if days < 0 {
|
if days < 0 {
|
||||||
days = 0
|
days = 0
|
||||||
}
|
}
|
||||||
if days > 180 {
|
if days > 180 {
|
||||||
days = 180
|
days = 180 // 最多衰减到约半年前
|
||||||
}
|
}
|
||||||
if days > 0 {
|
if days > 0 {
|
||||||
days--
|
days-- // 跳过第一天
|
||||||
}
|
}
|
||||||
return math.Pow(config.WeightDailyDecay, float64(days))
|
return math.Pow(config.WeightDailyDecay, float64(days))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// urlTimeMul 根据该 URL 的摘要抓取时间计算时间衰减倍数(30 天内不衰减)。
|
||||||
func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 {
|
func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 {
|
||||||
entry, err := db.GetSnippet(rawURL)
|
entry, err := db.GetSnippet(rawURL)
|
||||||
if err != nil || entry == nil {
|
if err != nil || entry == nil {
|
||||||
@@ -457,6 +479,7 @@ func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 {
|
|||||||
return math.Pow((2+config.WeightDailyDecay)/3, float64(days))
|
return math.Pow((2+config.WeightDailyDecay)/3, float64(days))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// badURL 返回 URL 的"劣质"评分(0~0.9)。
|
||||||
func badURL(u string) float64 {
|
func badURL(u string) float64 {
|
||||||
s := math.Max(0, float64(len(u)-30)/200.0)
|
s := math.Max(0, float64(len(u)-30)/200.0)
|
||||||
if strings.Contains(u, ".htm") || strings.Contains(u, ".php") {
|
if strings.Contains(u, ".htm") || strings.Contains(u, ".php") {
|
||||||
@@ -471,6 +494,7 @@ func badURL(u string) float64 {
|
|||||||
return math.Min(s, 0.9)
|
return math.Min(s, 0.9)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// netloc 从 URL 提取主机名。
|
||||||
func netloc(rawURL string) string {
|
func netloc(rawURL string) string {
|
||||||
parts := strings.SplitN(rawURL, "/", 4)
|
parts := strings.SplitN(rawURL, "/", 4)
|
||||||
if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" {
|
if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" {
|
||||||
@@ -479,6 +503,7 @@ func netloc(rawURL string) string {
|
|||||||
return rawURL
|
return rawURL
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// matchSite 判断主机名是否匹配 site: 过滤模式(支持子域名)。
|
||||||
func matchSite(host, pattern string) bool {
|
func matchSite(host, pattern string) bool {
|
||||||
if host == pattern {
|
if host == pattern {
|
||||||
return true
|
return true
|
||||||
@@ -489,6 +514,7 @@ func matchSite(host, pattern string) bool {
|
|||||||
return false
|
return false
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// consecutiveCount 统计标题中连续词对出现的次数(用于连续词加成)。
|
||||||
func consecutiveCount(title string, tokens []string) int {
|
func consecutiveCount(title string, tokens []string) int {
|
||||||
c := 0
|
c := 0
|
||||||
for i := 0; i < len(tokens)-1; i++ {
|
for i := 0; i < len(tokens)-1; i++ {
|
||||||
@@ -499,6 +525,8 @@ func consecutiveCount(title string, tokens []string) int {
|
|||||||
return c
|
return c
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// repetitionSimilarity 计算标题与前序所有标题的最大相似度(基于编辑距离)。
|
||||||
|
// 相似度 > 0.5 的结果将被降权。
|
||||||
func repetitionSimilarity(titles []string, idx int) float64 {
|
func repetitionSimilarity(titles []string, idx int) float64 {
|
||||||
if idx == 0 {
|
if idx == 0 {
|
||||||
return 0
|
return 0
|
||||||
@@ -520,6 +548,7 @@ func repetitionSimilarity(titles []string, idx int) float64 {
|
|||||||
return best
|
return best
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// levenshtein 计算两个字符串的编辑距离(动态规划)。
|
||||||
func levenshtein(a, b string) int {
|
func levenshtein(a, b string) int {
|
||||||
ra := []rune(a)
|
ra := []rune(a)
|
||||||
rb := []rune(b)
|
rb := []rune(b)
|
||||||
@@ -562,13 +591,16 @@ func min3(a, b, c int) int {
|
|||||||
return c
|
return c
|
||||||
}
|
}
|
||||||
|
|
||||||
// rerank interleaves results from different domains.
|
// ---- 域名交错重排 ----
|
||||||
|
|
||||||
|
// rerank 使用堆结构对候选结果按域名交错排列,使不同域名的 URL 交替出现。
|
||||||
|
// 每个域名的第二次出现分数乘以 1/8,第三次 1/64,以此类推,确保结果多样性。
|
||||||
type domainHeap []rerankItem
|
type domainHeap []rerankItem
|
||||||
|
|
||||||
type rerankItem struct {
|
type rerankItem struct {
|
||||||
score float64
|
score float64
|
||||||
url string
|
url string
|
||||||
domainMul float64
|
domainMul float64 // 域名衰减倍数
|
||||||
vec [12]float64
|
vec [12]float64
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -584,26 +616,29 @@ func (h *domainHeap) Pop() interface{} {
|
|||||||
return x
|
return x
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// candidate 是候选 URL 的内部表示。
|
||||||
type candidate struct {
|
type candidate struct {
|
||||||
url string
|
url string
|
||||||
relevance float64
|
relevance float64
|
||||||
scoreVec [12]float64
|
scoreVec [12]float64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// rerank 对候选列表进行域名交错重排,返回分页范围内的结果。
|
||||||
func rerank(candidates []candidate, from, to int) []candidate {
|
func rerank(candidates []candidate, from, to int) []candidate {
|
||||||
|
// 按域名分组
|
||||||
domainItems := make(map[string][]candidate)
|
domainItems := make(map[string][]candidate)
|
||||||
for _, c := range candidates {
|
for _, c := range candidates {
|
||||||
h := netloc(c.url)
|
h := netloc(c.url)
|
||||||
domainItems[h] = append(domainItems[h], c)
|
domainItems[h] = append(domainItems[h], c)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 每个域名的 URL 列表取最后一个(分数最高)放入堆,其余保留
|
||||||
h := &domainHeap{}
|
h := &domainHeap{}
|
||||||
heap.Init(h)
|
heap.Init(h)
|
||||||
domainMul := make(map[string]float64)
|
domainMul := make(map[string]float64)
|
||||||
|
|
||||||
for domain, items := range domainItems {
|
for domain, items := range domainItems {
|
||||||
domainMul[domain] = 1.0
|
domainMul[domain] = 1.0
|
||||||
// Sort items within domain
|
|
||||||
sort.Slice(items, func(i, j int) bool {
|
sort.Slice(items, func(i, j int) bool {
|
||||||
return items[i].scoreVec[0] < items[j].scoreVec[0]
|
return items[i].scoreVec[0] < items[j].scoreVec[0]
|
||||||
})
|
})
|
||||||
@@ -612,6 +647,7 @@ func rerank(candidates []candidate, from, to int) []candidate {
|
|||||||
heap.Push(h, rerankItem{top.scoreVec[0], top.url, domainMul[domain], top.scoreVec})
|
heap.Push(h, rerankItem{top.scoreVec[0], top.url, domainMul[domain], top.scoreVec})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 从堆中依次弹出得分最高的条目(受域名衰减影响),直到取够
|
||||||
var result []candidate
|
var result []candidate
|
||||||
for h.Len() > 0 && len(result) < to {
|
for h.Len() > 0 && len(result) < to {
|
||||||
item := heap.Pop(h).(rerankItem)
|
item := heap.Pop(h).(rerankItem)
|
||||||
@@ -619,7 +655,7 @@ func rerank(candidates []candidate, from, to int) []candidate {
|
|||||||
result = append(result, candidate{url: item.url, scoreVec: item.vec})
|
result = append(result, candidate{url: item.url, scoreVec: item.vec})
|
||||||
}
|
}
|
||||||
domain := netloc(item.url)
|
domain := netloc(item.url)
|
||||||
domainMul[domain] /= 8
|
domainMul[domain] /= 8 // 该域名的下一次出现衰减到 1/8
|
||||||
remaining := domainItems[domain]
|
remaining := domainItems[domain]
|
||||||
if len(remaining) > 0 {
|
if len(remaining) > 0 {
|
||||||
next := remaining[len(remaining)-1]
|
next := remaining[len(remaining)-1]
|
||||||
@@ -630,8 +666,9 @@ func rerank(candidates []candidate, from, to int) []candidate {
|
|||||||
return result
|
return result
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- misc ----
|
// ---- 杂项辅助函数 ----
|
||||||
|
|
||||||
|
// readBodyLimited 从 HTTP 响应体读取最多 limit 字节(用于限制在线摘要抓取大小)。
|
||||||
func readBodyLimited(resp *http.Response, limit int64) string {
|
func readBodyLimited(resp *http.Response, limit int64) string {
|
||||||
data := make([]byte, 0, limit)
|
data := make([]byte, 0, limit)
|
||||||
buf := make([]byte, 4096)
|
buf := make([]byte, 4096)
|
||||||
@@ -652,6 +689,7 @@ func readBodyLimited(resp *http.Response, limit int64) string {
|
|||||||
return string(data)
|
return string(data)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// truncate 将字符串截断到最多 n 个字符。
|
||||||
func truncate(s string, n int) string {
|
func truncate(s string, n int) string {
|
||||||
if len(s) <= n {
|
if len(s) <= n {
|
||||||
return s
|
return s
|
||||||
@@ -659,6 +697,7 @@ func truncate(s string, n int) string {
|
|||||||
return s[:n]
|
return s[:n]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// unescapeURL 对 URL 进行解码(%XX 转义)。
|
||||||
func unescapeURL(u string) string {
|
func unescapeURL(u string) string {
|
||||||
decoded, err := url.PathUnescape(u)
|
decoded, err := url.PathUnescape(u)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -667,6 +706,7 @@ func unescapeURL(u string) string {
|
|||||||
return decoded
|
return decoded
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// atoi 手写字符串转整数(不含负数和浮点)。
|
||||||
func atoi(s string) int {
|
func atoi(s string) int {
|
||||||
n := 0
|
n := 0
|
||||||
for _, c := range s {
|
for _, c := range s {
|
||||||
|
|||||||
+97
-80
@@ -1,65 +1,68 @@
|
|||||||
// Package storage provides the persistent index and site-info storage backed by bbolt.
|
// Package storage provides the persistent index and site-info storage backed by bbolt.
|
||||||
|
// storage 包提供基于 bbolt 的持久化存储,负责保存倒排索引、URL摘要缓存和网站元信息。
|
||||||
//
|
//
|
||||||
// Index space → a single bbolt bucket "index" where key = keyword (string),
|
// 索引空间(index bucket):key = 关键词(string),value = brotli 压缩的 JSON 数组,每项为 [权重, URL] 对。
|
||||||
// value = brotli-compressed JSON array of [weight, url] pairs.
|
// 融合之门(gate bucket):key = URL(string),value = brotli 压缩的 JSON 数组 [标题, 描述, 正文, 时间戳]。
|
||||||
|
// 网站之门(site_gate bucket):key = 主机名(string),value = brotli 压缩的 JSON SiteInfo 结构体。
|
||||||
//
|
//
|
||||||
// Gate (门) → a bbolt bucket "gate" where key = URL (string),
|
// Python 版使用自定义哈希桶结构;Go 版直接交由 bbolt 原生处理。
|
||||||
// value = brotli-compressed JSON array [title, desc, text, timestamp].
|
|
||||||
//
|
|
||||||
// SiteGate (网站之门) → a bbolt bucket "site_gate" where key = hostname (string),
|
|
||||||
// value = brotli-compressed JSON of SiteInfo struct.
|
|
||||||
//
|
|
||||||
// The Python version used a custom hash-bucket scheme; here bbolt handles it natively.
|
|
||||||
package storage
|
package storage
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"encoding/json" // JSON 序列化/反序列化
|
||||||
"fmt"
|
"fmt" // 格式化错误信息
|
||||||
"io"
|
"io" // io.EOF 常量
|
||||||
"os"
|
"os" // 操作系统功能(创建目录等)
|
||||||
"path/filepath"
|
"path/filepath" // 路径拼接
|
||||||
|
|
||||||
"github.com/andybalholm/brotli"
|
"github.com/andybalholm/brotli" // Brotli 无损压缩库(用于压缩存储数据)
|
||||||
bolt "go.etcd.io/bbolt"
|
bolt "go.etcd.io/bbolt" // BoltDB,纯 Go 嵌入式 KV 数据库
|
||||||
)
|
)
|
||||||
|
|
||||||
// IndexEntry is a single entry in the inverted index.
|
// IndexEntry 是倒排索引中的单个条目。
|
||||||
|
// 一条索引记录表示"某个 URL 与某个关键词的相关性权重"。
|
||||||
type IndexEntry struct {
|
type IndexEntry struct {
|
||||||
Weight float32 `json:"w"`
|
Weight float32 `json:"w"` // 该 URL 在该关键词下的得分/权重
|
||||||
URL string `json:"u"`
|
URL string `json:"u"` // 网页 URL
|
||||||
}
|
}
|
||||||
|
|
||||||
// SnippetEntry is cached snippet data for a URL.
|
// SnippetEntry 是 URL 对应的摘要信息缓存。
|
||||||
|
// 包含页面标题、描述、正文片段和抓取时间戳。
|
||||||
type SnippetEntry struct {
|
type SnippetEntry struct {
|
||||||
Title string `json:"title"`
|
Title string `json:"title"` // 网页标题
|
||||||
Description string `json:"desc"`
|
Description string `json:"desc"` // meta description 或自动生成的描述
|
||||||
Text string `json:"text"`
|
Text string `json:"text"` // 正文前 N 字符的文本片段
|
||||||
Timestamp int64 `json:"ts"`
|
Timestamp int64 `json:"ts"` // 抓取该页面时的 Unix 时间戳
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// 三个 bbolt bucket 的名称(以字节数组存储,bbolt 要求 key/value 均为字节)
|
||||||
var (
|
var (
|
||||||
bucketIndex = []byte("index")
|
bucketIndex = []byte("index") // 倒排索引 bucket
|
||||||
bucketGate = []byte("gate")
|
bucketGate = []byte("gate") // URL 摘要缓存 bucket
|
||||||
bucketSiteGate = []byte("site_gate")
|
bucketSiteGate = []byte("site_gate") // 网站元信息 bucket
|
||||||
)
|
)
|
||||||
|
|
||||||
// DB wraps a bbolt database and exposes typed access methods.
|
// DB 封装一个 bbolt 数据库,提供类型化的存取接口。
|
||||||
// bbolt handles its own locking internally.
|
// bbolt 内部已实现并发安全,无需额外加锁。
|
||||||
type DB struct {
|
type DB struct {
|
||||||
db *bolt.DB
|
db *bolt.DB // 底层 bbolt 数据库句柄
|
||||||
}
|
}
|
||||||
|
|
||||||
// Open creates or opens the bbolt database at the given directory path.
|
// Open 在指定目录路径下创建或打开 bbolt 数据库文件。
|
||||||
|
// 如果目录不存在会自动创建。数据库文件名为 sese.db。
|
||||||
func Open(dir string) (*DB, error) {
|
func Open(dir string) (*DB, error) {
|
||||||
|
// 确保存储目录存在(0775 权限:所有者读写执行,组用户读执行,其他读执行)
|
||||||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||||||
return nil, fmt.Errorf("storage.Open mkdir: %w", err)
|
return nil, fmt.Errorf("storage.Open mkdir: %w", err)
|
||||||
}
|
}
|
||||||
|
// 拼接数据库文件路径:dir/sese.db
|
||||||
path := filepath.Join(dir, "sese.db")
|
path := filepath.Join(dir, "sese.db")
|
||||||
|
// 打开/创建数据库文件,文件权限 0600(仅所有者可读写)
|
||||||
db, err := bolt.Open(path, 0o600, nil)
|
db, err := bolt.Open(path, 0o600, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, fmt.Errorf("storage.Open bolt: %w", err)
|
return nil, fmt.Errorf("storage.Open bolt: %w", err)
|
||||||
}
|
}
|
||||||
// Ensure buckets exist
|
// 启动时确保三个 bucket 都存在(不存在则创建)
|
||||||
err = db.Update(func(tx *bolt.Tx) error {
|
err = db.Update(func(tx *bolt.Tx) error {
|
||||||
for _, b := range [][]byte{bucketIndex, bucketGate, bucketSiteGate} {
|
for _, b := range [][]byte{bucketIndex, bucketGate, bucketSiteGate} {
|
||||||
if _, err := tx.CreateBucketIfNotExists(b); err != nil {
|
if _, err := tx.CreateBucketIfNotExists(b); err != nil {
|
||||||
@@ -74,16 +77,18 @@ func Open(dir string) (*DB, error) {
|
|||||||
return &DB{db: db}, nil
|
return &DB{db: db}, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close closes the underlying bbolt database.
|
// Close 关闭底层 bbolt 数据库连接。
|
||||||
func (d *DB) Close() error {
|
func (d *DB) Close() error {
|
||||||
return d.db.Close()
|
return d.db.Close()
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- helpers ----
|
// ---- 辅助函数:压缩与解压 ----
|
||||||
|
|
||||||
|
// compress 将字节数组用 brotli 压缩后返回。
|
||||||
|
// brotli 压缩比高于 gzip,适合大量文本的存储空间优化。
|
||||||
func compress(data []byte) ([]byte, error) {
|
func compress(data []byte) ([]byte, error) {
|
||||||
buf := make([]byte, 0, len(data))
|
buf := make([]byte, 0, len(data)) // 预分配,避免反复扩容
|
||||||
w := brotli.NewWriterLevel((*appendWriter)(&buf), 6)
|
w := brotli.NewWriterLevel((*appendWriter)(&buf), 6) // 压缩级别 6(平衡速度和压缩比)
|
||||||
if _, err := w.Write(data); err != nil {
|
if _, err := w.Write(data); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
@@ -93,78 +98,85 @@ func compress(data []byte) ([]byte, error) {
|
|||||||
return buf, nil
|
return buf, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// decompress 将 brotli 压缩的字节数组解压还原。
|
||||||
func decompress(data []byte) ([]byte, error) {
|
func decompress(data []byte) ([]byte, error) {
|
||||||
|
// brotli.NewReader 从字节数组创建读取器(通过 byteReader 适配 io.Reader 接口)
|
||||||
r := brotli.NewReader(
|
r := brotli.NewReader(
|
||||||
(*byteReader)(&data),
|
(*byteReader)(&data),
|
||||||
)
|
)
|
||||||
out := make([]byte, 0, len(data)*3)
|
out := make([]byte, 0, len(data)*3) // 预分配约 3 倍空间(解压后通常更大)
|
||||||
tmp := make([]byte, 4096)
|
tmp := make([]byte, 4096) // 每次最多读 4KB
|
||||||
for {
|
for {
|
||||||
n, err := r.Read(tmp)
|
n, err := r.Read(tmp)
|
||||||
out = append(out, tmp[:n]...)
|
out = append(out, tmp[:n]...) // 追加本次读取的字节
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if err == io.EOF {
|
if err == io.EOF {
|
||||||
break
|
break // 正常读完
|
||||||
}
|
}
|
||||||
return out, err
|
return out, err // 其他错误(非 EOF)则返回
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return out, nil
|
return out, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// appendWriter implements io.Writer on top of a *[]byte.
|
// appendWriter 将 *[]byte 适配为 io.Writer 接口(写入时直接 append)。
|
||||||
type appendWriter []byte
|
type appendWriter []byte
|
||||||
|
|
||||||
|
// Write 将数据 p 追加到 appendWriter 末尾,返回写入字节数。
|
||||||
func (a *appendWriter) Write(p []byte) (int, error) {
|
func (a *appendWriter) Write(p []byte) (int, error) {
|
||||||
*a = append(*a, p...)
|
*a = append(*a, p...)
|
||||||
return len(p), nil
|
return len(p), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// byteReader wraps []byte as io.Reader.
|
// byteReader 将 []byte 适配为 io.Reader 接口(顺序读取,支持读完后返回 EOF)。
|
||||||
type byteReader []byte
|
type byteReader []byte
|
||||||
|
|
||||||
|
// Read 从字节数组读取最多 len(p) 字节到 p 中,返回实际读取字节数和可能的错误。
|
||||||
|
// 当字节数组全部读完后返回 io.EOF。
|
||||||
func (b *byteReader) Read(p []byte) (int, error) {
|
func (b *byteReader) Read(p []byte) (int, error) {
|
||||||
if len(*b) == 0 {
|
if len(*b) == 0 {
|
||||||
return 0, io.EOF
|
return 0, io.EOF // 已读完
|
||||||
}
|
}
|
||||||
n := copy(p, *b)
|
n := copy(p, *b) // 复制最多 len(p) 字节
|
||||||
*b = (*b)[n:]
|
*b = (*b)[n:] // 前进指针
|
||||||
return n, nil
|
return n, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// marshalCompress 将任意可序列化对象先 JSON 编码,再 brotli 压缩,返回压缩后的字节。
|
||||||
func marshalCompress(v any) ([]byte, error) {
|
func marshalCompress(v any) ([]byte, error) {
|
||||||
raw, err := json.Marshal(v)
|
raw, err := json.Marshal(v) // 先序列化为 JSON
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
}
|
}
|
||||||
return compress(raw)
|
return compress(raw) // 再压缩
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// decompressUnmarshal 将压缩字节先解压,再 JSON 反序列化到目标对象 v。
|
||||||
func decompressUnmarshal(data []byte, v any) error {
|
func decompressUnmarshal(data []byte, v any) error {
|
||||||
raw, err := decompress(data)
|
raw, err := decompress(data) // 先解压
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
return json.Unmarshal(raw, v)
|
return json.Unmarshal(raw, v) // 再反序列化
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- Index (inverted index) ----
|
// ---- 倒排索引(Index)相关方法 ----
|
||||||
|
|
||||||
// GetIndex retrieves all IndexEntry values for a keyword.
|
// GetIndex 根据关键词查询倒排索引,返回该词关联的所有 [权重, URL] 条目列表。
|
||||||
func (d *DB) GetIndex(keyword string) ([]IndexEntry, error) {
|
func (d *DB) GetIndex(keyword string) ([]IndexEntry, error) {
|
||||||
var entries []IndexEntry
|
var entries []IndexEntry
|
||||||
err := d.db.View(func(tx *bolt.Tx) error {
|
err := d.db.View(func(tx *bolt.Tx) error {
|
||||||
b := tx.Bucket(bucketIndex)
|
b := tx.Bucket(bucketIndex)
|
||||||
v := b.Get([]byte(keyword))
|
v := b.Get([]byte(keyword)) // 在 index bucket 中按关键词查询
|
||||||
if v == nil {
|
if v == nil {
|
||||||
return nil
|
return nil // 不存在该词,返回空列表
|
||||||
}
|
}
|
||||||
return decompressUnmarshal(v, &entries)
|
return decompressUnmarshal(v, &entries)
|
||||||
})
|
})
|
||||||
return entries, err
|
return entries, err
|
||||||
}
|
}
|
||||||
|
|
||||||
// SetIndex overwrites the IndexEntry list for a keyword.
|
// SetIndex 将某关键词的完整索引条目列表覆盖写入(替换旧数据)。
|
||||||
func (d *DB) SetIndex(keyword string, entries []IndexEntry) error {
|
func (d *DB) SetIndex(keyword string, entries []IndexEntry) error {
|
||||||
data, err := marshalCompress(entries)
|
data, err := marshalCompress(entries)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -175,7 +187,8 @@ func (d *DB) SetIndex(keyword string, entries []IndexEntry) error {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// BatchSetIndex writes multiple keyword→entries pairs in one transaction.
|
// BatchSetIndex 在一次事务中批量写入多个关键词→条目列表的映射。
|
||||||
|
// 比多次调用 SetIndex 效率更高(减少事务开销)。
|
||||||
func (d *DB) BatchSetIndex(batch map[string][]IndexEntry) error {
|
func (d *DB) BatchSetIndex(batch map[string][]IndexEntry) error {
|
||||||
return d.db.Update(func(tx *bolt.Tx) error {
|
return d.db.Update(func(tx *bolt.Tx) error {
|
||||||
b := tx.Bucket(bucketIndex)
|
b := tx.Bucket(bucketIndex)
|
||||||
@@ -192,22 +205,24 @@ func (d *DB) BatchSetIndex(batch map[string][]IndexEntry) error {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ForEachIndex iterates over all index entries. fn receives keyword and entries.
|
// ForEachIndex 遍历倒排索引中所有关键词及其关联条目,对每个条目调用 fn。
|
||||||
|
// 用于全量读取索引、做备份或重新计算等场景。
|
||||||
func (d *DB) ForEachIndex(fn func(keyword string, entries []IndexEntry) error) error {
|
func (d *DB) ForEachIndex(fn func(keyword string, entries []IndexEntry) error) error {
|
||||||
return d.db.View(func(tx *bolt.Tx) error {
|
return d.db.View(func(tx *bolt.Tx) error {
|
||||||
return tx.Bucket(bucketIndex).ForEach(func(k, v []byte) error {
|
return tx.Bucket(bucketIndex).ForEach(func(k, v []byte) error {
|
||||||
var entries []IndexEntry
|
var entries []IndexEntry
|
||||||
if err := decompressUnmarshal(v, &entries); err != nil {
|
if err := decompressUnmarshal(v, &entries); err != nil {
|
||||||
return nil // skip corrupted entries
|
return nil // 跳过损坏条目,不中断遍历
|
||||||
}
|
}
|
||||||
return fn(string(k), entries)
|
return fn(string(k), entries)
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- Gate (URL snippet cache) ----
|
// ---- 融合之门(Gate):URL 摘要缓存相关方法 ----
|
||||||
|
|
||||||
// GetSnippet retrieves the cached snippet for a URL.
|
// GetSnippet 根据 URL 查询缓存的摘要信息(标题/描述/正文片段)。
|
||||||
|
// 若未命中返回 error。
|
||||||
func (d *DB) GetSnippet(url string) (*SnippetEntry, error) {
|
func (d *DB) GetSnippet(url string) (*SnippetEntry, error) {
|
||||||
var entry SnippetEntry
|
var entry SnippetEntry
|
||||||
err := d.db.View(func(tx *bolt.Tx) error {
|
err := d.db.View(func(tx *bolt.Tx) error {
|
||||||
@@ -223,7 +238,7 @@ func (d *DB) GetSnippet(url string) (*SnippetEntry, error) {
|
|||||||
return &entry, nil
|
return &entry, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// SetSnippet stores a cached snippet for a URL.
|
// SetSnippet 将某 URL 的摘要信息写入缓存(覆盖已有数据)。
|
||||||
func (d *DB) SetSnippet(url string, entry *SnippetEntry) error {
|
func (d *DB) SetSnippet(url string, entry *SnippetEntry) error {
|
||||||
data, err := marshalCompress(entry)
|
data, err := marshalCompress(entry)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -234,26 +249,27 @@ func (d *DB) SetSnippet(url string, entry *SnippetEntry) error {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ---- SiteGate (site metadata) ----
|
// ---- 网站之门(SiteGate):网站元信息相关方法 ----
|
||||||
|
|
||||||
// SiteInfo mirrors the Python 网站 dataclass.
|
// SiteInfo 存放每个域名/主机的元信息,与 Python 版网站.py 的 dataclass 对应。
|
||||||
type SiteInfo struct {
|
type SiteInfo struct {
|
||||||
VisitCount int `json:"visit_count"`
|
VisitCount int `json:"visit_count"` // 累计访问该网站的次数
|
||||||
LastVisitTime int64 `json:"last_visit_time"`
|
LastVisitTime int64 `json:"last_visit_time"` // 上次访问该网站的时间戳
|
||||||
Fingerprint any `json:"fingerprint,omitempty"`
|
Fingerprint any `json:"fingerprint,omitempty"` // 网站指纹(用于识别重复站点)
|
||||||
SuccessRate *float64 `json:"success_rate,omitempty"`
|
SuccessRate *float64 `json:"success_rate,omitempty"` // 访问成功率(成功次数/总访问次数)
|
||||||
HTMLStructure string `json:"html_structure,omitempty"`
|
HTMLStructure string `json:"html_structure,omitempty"` // HTML 结构特征摘要
|
||||||
IPs []string `json:"ips,omitempty"`
|
IPs []string `json:"ips,omitempty"` // 该域名解析出的 IP 列表
|
||||||
Quality *float64 `json:"quality,omitempty"`
|
Quality *float64 `json:"quality,omitempty"` // 网站质量评分(0~1)
|
||||||
HTTPSAvailable *bool `json:"https_available,omitempty"`
|
HTTPSAvailable *bool `json:"https_available,omitempty"` // 是否支持 HTTPS
|
||||||
Keywords []string `json:"keywords,omitempty"`
|
Keywords []string `json:"keywords,omitempty"` // 该网站的高频关键词列表
|
||||||
OutLinks []string `json:"out_links,omitempty"`
|
OutLinks []string `json:"out_links,omitempty"` // 从该网站页面提取的出站链接列表
|
||||||
Languages map[string]float64 `json:"languages,omitempty"`
|
Languages map[string]float64 `json:"languages,omitempty"` // 网站语种分布(语种代码 → 占比)
|
||||||
Redirects map[string]string `json:"redirects,omitempty"`
|
Redirects map[string]string `json:"redirects,omitempty"` // 重定向链(URL → 最终 URL)
|
||||||
ServerTypes []string `json:"server_types,omitempty"`
|
ServerTypes []string `json:"server_types,omitempty"` // 网站使用的 HTTP Server 类型列表
|
||||||
}
|
}
|
||||||
|
|
||||||
// GetSiteInfo retrieves metadata for a hostname.
|
// GetSiteInfo 根据主机名查询网站元信息。
|
||||||
|
// 若不存在则返回仅有默认空 map 的空 SiteInfo(不报错,方便调用方直接使用)。
|
||||||
func (d *DB) GetSiteInfo(host string) (*SiteInfo, error) {
|
func (d *DB) GetSiteInfo(host string) (*SiteInfo, error) {
|
||||||
var info SiteInfo
|
var info SiteInfo
|
||||||
err := d.db.View(func(tx *bolt.Tx) error {
|
err := d.db.View(func(tx *bolt.Tx) error {
|
||||||
@@ -264,6 +280,7 @@ func (d *DB) GetSiteInfo(host string) (*SiteInfo, error) {
|
|||||||
return decompressUnmarshal(v, &info)
|
return decompressUnmarshal(v, &info)
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
// 未找到时返回带默认空 map 的空结构,避免调用方空指针 panic
|
||||||
return &SiteInfo{Languages: make(map[string]float64), Redirects: make(map[string]string)}, nil
|
return &SiteInfo{Languages: make(map[string]float64), Redirects: make(map[string]string)}, nil
|
||||||
}
|
}
|
||||||
if info.Languages == nil {
|
if info.Languages == nil {
|
||||||
@@ -275,7 +292,7 @@ func (d *DB) GetSiteInfo(host string) (*SiteInfo, error) {
|
|||||||
return &info, nil
|
return &info, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// SetSiteInfo stores metadata for a hostname.
|
// SetSiteInfo 将某主机名的网站元信息写入存储(覆盖已有数据)。
|
||||||
func (d *DB) SetSiteInfo(host string, info *SiteInfo) error {
|
func (d *DB) SetSiteInfo(host string, info *SiteInfo) error {
|
||||||
data, err := marshalCompress(info)
|
data, err := marshalCompress(info)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -286,13 +303,13 @@ func (d *DB) SetSiteInfo(host string, info *SiteInfo) error {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
// ForEachSite iterates over all site metadata entries.
|
// ForEachSite 遍历所有网站元信息条目,对每个条目调用 fn。
|
||||||
func (d *DB) ForEachSite(fn func(host string, info *SiteInfo) error) error {
|
func (d *DB) ForEachSite(fn func(host string, info *SiteInfo) error) error {
|
||||||
return d.db.View(func(tx *bolt.Tx) error {
|
return d.db.View(func(tx *bolt.Tx) error {
|
||||||
return tx.Bucket(bucketSiteGate).ForEach(func(k, v []byte) error {
|
return tx.Bucket(bucketSiteGate).ForEach(func(k, v []byte) error {
|
||||||
var info SiteInfo
|
var info SiteInfo
|
||||||
if err := decompressUnmarshal(v, &info); err != nil {
|
if err := decompressUnmarshal(v, &info); err != nil {
|
||||||
return nil
|
return nil // 跳过损坏条目
|
||||||
}
|
}
|
||||||
return fn(string(k), &info)
|
return fn(string(k), &info)
|
||||||
})
|
})
|
||||||
|
|||||||
Reference in New Issue
Block a user