Signed-off-by: 吴文峰 <kevin@lmve.net>

This commit is contained in:
2026-04-08 17:29:39 +08:00
commit 6c2f5ad978
15 changed files with 3651 additions and 0 deletions
+1
View File
@@ -0,0 +1 @@
savedata
+89
View File
@@ -0,0 +1,89 @@
# sese-engine Go 重构版
Python 原版的 Go 语言重构,使用标准英文命名,单二进制部署。
## 目录结构
```
golang/
├── main.go # 主入口,goroutine 启动所有模块
├── go.mod
├── config/
│ └── config.go # 全局配置参数(对应 配置.py)
├── storage/
│ └── storage.go # bbolt 持久化层(对应 存储.py,替换 rimo_storage)
├── crawler/
│ ├── crawler.go # BFS 爬虫调度(对应 上网.py)
│ └── fetcher.go # HTTP 获取 + robots.txt + 限流(对应 虫.py)
├── parser/
│ └── parser.go # HTML 解析(对应 文.py)
├── analyzer/
│ └── analyzer.go # 分词 + 关键词权重(对应 分析.py + utils.py 分词部分)
│ 使用 gojieba(中文)+ gofasttext(语言检测)
├── harvester/
│ └── harvester.go # 索引写入服务,监听 :5000(对应 收获服务器.py)
├── search/
│ └── server.go # 搜索 API,监听 :80(对应 人服务器.py)
├── backlink/
│ └── backlink.go # 反向链接计算,每 48h 运行(对应 回.py)
└── info/
└── info.go # 繁荣表 / 调整表 / 屏蔽词加载(对应 信息.py)
```
## 依赖项
| Go 包 | 替代 Python 包 | 用途 |
|-------|--------------|------|
| `github.com/yanyiwu/gojieba` | `jieba` | 中文分词 |
| `github.com/nicholasgasior/gofasttext` | `fasttext` | 语言检测 |
| `go.etcd.io/bbolt` | `rimo_storage` | KV 存储 / 倒排索引 |
| `github.com/andybalholm/brotli` | `brotli` | 压缩 |
| `golang.org/x/net/html` | `lxml` | HTML 解析 |
| `golang.org/x/net/html/charset` | chardet | 编码检测 |
## 构建与运行
```bash
cd golang
# 下载依赖(需要 CGo 编译器,用于 gojieba / gofasttext)
go mod tidy
# 构建
go build -o sese-engine .
# 运行(在 sese-engine 项目根目录下)
cd ..
./golang/sese-engine \
--storage ./savedata \
--entry https://zh.wikipedia.org/ \
--fasttext ./lid.176.ftz \
--stopwords ./data/标点符号.json
```
一个进程启动所有模块:
- `:5000` — 收获服务器(爬虫推送关键词)
- `:80` — 搜索 API(`GET /search?q=关键词`)
- 后台 goroutine — BFS 爬虫
- 后台 goroutine — 每 48 小时反向链接计算
## 与 Python 版的主要差异
| 方面 | Python 版 | Go 版 |
|------|---------|-------|
| 并发 | GIL + 线程池(假并发) | goroutine 真并发 |
| 存储 | rimo_storage(自研)| bbolt(嵌入式 KV)|
| 部署 | 需要 Python 环境 | 单二进制,无运行时依赖 |
| 命名 | 全中文 | 标准英文 |
| 进程数 | 3~4 个进程 | 1 个进程多 goroutine |
| 编码检测 | requests 自动检测 | `golang.org/x/net/html/charset` |
| Prometheus | 可选 | 暂未集成(可后续添加) |
## 注意事项
1. **CGo 依赖**:gojieba 和 gofasttext 均需要 C/C++ 编译器(gcc/clang)。
Windows 下建议使用 MinGW 或 WSL。
2. **fasttext 模型**:`lid.176.ftz` 需要与 Python 版共用,路径通过 `--fasttext` 指定。(注意:本次提交中的 analyzer 包使用 lingua-go 做语言检测,`analyzer.New` 的 modelPath 参数会被忽略。)
3. **数据迁移**:存储格式(bbolt JSON)与 Python 版(rimo_storage 二进制)不兼容,
需要全新爬取,或编写迁移脚本。
4. **stop words 文件**:复用 Python 版的 `data/标点符号.json`
+250
View File
@@ -0,0 +1,250 @@
// Package analyzer provides keyword extraction and language detection.
//
// Keyword extraction uses gojieba for Chinese segmentation and simple token
// splitting for ASCII words. Language detection uses lingua-go (pure Go, no CGo).
package analyzer
import (
"encoding/json"
"math"
"os"
"strings"
"sync"
"unicode"
"github.com/pemistahl/lingua-go"
"github.com/yanyiwu/gojieba"
)
// Keyword pairs an extracted token with its relevance weight.
// The struct tags fix the JSON field names ("word", "weight") used when a
// Keyword is serialized.
type Keyword struct {
Word string `json:"word"`
Weight float32 `json:"weight"`
}
// Analyzer bundles the gojieba segmenter, the lingua language detector and the
// stop-word set that the analysis methods share.
type Analyzer struct {
jieba *gojieba.Jieba
detector lingua.LanguageDetector
// stopWords is a set of tokens to discard; loadStopWords lower-cases its keys.
stopWords map[string]bool
// mu serializes calls into jieba, which is treated as not goroutine-safe.
mu sync.Mutex
}
// New builds an Analyzer backed by a default gojieba segmenter and a lingua
// detector built from all of lingua's languages with a minimum relative
// distance of 0.15.
//
// modelPath is accepted for API compatibility but never read. stopWordsPath
// names an optional JSON file of stop words; an empty path or an unreadable
// file yields an empty stop-word set. The returned error is always nil.
func New(modelPath, stopWordsPath string) (*Analyzer, error) {
	return &Analyzer{
		jieba: gojieba.NewJieba(),
		detector: lingua.NewLanguageDetectorBuilder().
			FromAllLanguages().
			WithMinimumRelativeDistance(0.15).
			Build(),
		stopWords: loadStopWords(stopWordsPath),
	}, nil
}
// Close frees the underlying jieba segmenter by calling its Free method.
// It does not take a.mu.
// NOTE(review): presumably the Analyzer must not be used after Close — confirm
// against gojieba's Free contract.
func (a *Analyzer) Close() {
a.jieba.Free()
}
// loadStopWords loads a stop-word set from a JSON file containing an array of
// strings. Every word is lower-cased before being stored.
//
// Loading is best effort: an empty path, a file that cannot be opened, or
// content that does not decode as a string array all produce an empty,
// non-nil set rather than an error.
func loadStopWords(path string) map[string]bool {
	set := map[string]bool{}
	if path == "" {
		return set
	}

	file, err := os.Open(path)
	if err != nil {
		return set
	}
	defer file.Close()

	var list []string
	if json.NewDecoder(file).Decode(&list) != nil {
		return set
	}
	for _, word := range list {
		set[strings.ToLower(word)] = true
	}
	return set
}
// Tokenize splits s into tokens.
//
// The input is cut to at most 10000 bytes and stripped of invalid UTF-8 (which
// also removes a rune split by the cut) so the C++ segmenter never sees
// undecodable bytes. The text is then split on whitespace: chunks made only of
// ASCII letters and digits are kept verbatim, every other chunk is handed to
// jieba — CutForSearch when searchMode is true, Cut otherwise. Calls into
// jieba are serialized with a.mu.
func (a *Analyzer) Tokenize(s string, searchMode bool) []string {
	const maxBytes = 10000
	if len(s) > maxBytes {
		s = s[:maxBytes]
	}
	s = strings.ToValidUTF8(s, "")

	// segment runs one chunk through jieba while holding the lock.
	segment := func(chunk string) []string {
		a.mu.Lock()
		defer a.mu.Unlock()
		if searchMode {
			return a.jieba.CutForSearch(chunk, true)
		}
		return a.jieba.Cut(chunk, true)
	}

	var out []string
	for _, chunk := range strings.Fields(s) {
		if isASCIIAlnum(chunk) {
			out = append(out, chunk)
			continue
		}
		out = append(out, segment(chunk)...)
	}
	return out
}
// Normalize keeps only ASCII letters, ASCII digits and runes in the range
// U+4E00..U+9FA5, lower-casing ASCII upper-case letters. Every other rune is
// dropped.
func Normalize(s string) string {
	var out strings.Builder
	for _, r := range s {
		switch {
		case r >= 'A' && r <= 'Z':
			out.WriteRune(unicode.ToLower(r))
		case r >= 'a' && r <= 'z',
			r >= '0' && r <= '9',
			r >= 0x4e00 && r <= 0x9fa5:
			out.WriteRune(r)
		}
	}
	return out.String()
}
// weightedTokens tokenizes text (non-search mode) and returns a weight per
// normalized token.
//
// Tokens that normalize to "", are stop words, or are longer than 32 bytes are
// ignored. A token's weight is its count divided by max(8, total token count),
// capped at 0.2, then multiplied by w.
func (a *Analyzer) weightedTokens(text string, w float32) map[string]float32 {
	tokens := a.Tokenize(text, false)

	freq := make(map[string]int)
	for _, raw := range tokens {
		tok := Normalize(raw)
		if tok == "" || a.stopWords[tok] || len(tok) > 32 {
			continue
		}
		freq[tok]++
	}

	// The denominator uses the raw token count (before filtering), floored at 8
	// so very short texts do not produce inflated frequencies.
	denom := math.Max(8, float64(len(tokens)))
	weights := make(map[string]float32)
	for tok, count := range freq {
		weights[tok] = float32(math.Min(0.2, float64(count)/denom)) * w
	}
	return weights
}
// Analyze extracts weighted keywords from a page's title, description and body.
//
// Each field is scored with weightedTokens (multiplier 1.0 for title and body,
// 0.5 for description) and the per-token weights are summed across the three
// fields. The result is ordered by weight, highest first.
func (a *Analyzer) Analyze(title, description, text string) []Keyword {
	perField := [...]map[string]float32{
		a.weightedTokens(title, 1.0),
		a.weightedTokens(description, 0.5),
		a.weightedTokens(text, 1.0),
	}

	// Accumulate field by field so each token's sum is built in the fixed
	// order title, description, body.
	totals := make(map[string]float32)
	for _, field := range perField {
		for tok, weight := range field {
			totals[tok] += weight
		}
	}

	keywords := make([]Keyword, 0, len(totals))
	for tok, weight := range totals {
		keywords = append(keywords, Keyword{Word: tok, Weight: weight})
	}
	sortKeywords(keywords)
	return keywords
}
// Segment tokenizes query (using jieba's search mode when searchMode is true)
// and returns the normalized tokens, skipping those that normalize to "", are
// stop words, or exceed 32 bytes. The result is nil when nothing survives.
func (a *Analyzer) Segment(query string, searchMode bool) []string {
	var kept []string
	for _, raw := range a.Tokenize(query, searchMode) {
		if tok := Normalize(raw); tok != "" && !a.stopWords[tok] && len(tok) <= 32 {
			kept = append(kept, tok)
		}
	}
	return kept
}
// linguaToISO639 maps a lingua.Language to the two-letter language code
// returned by DetectLanguage. Languages without an entry here are reported by
// DetectLanguage as "".
var linguaToISO639 = map[lingua.Language]string{
lingua.Chinese: "zh",
lingua.English: "en",
lingua.Japanese: "ja",
lingua.Korean: "ko",
lingua.French: "fr",
lingua.German: "de",
lingua.Spanish: "es",
lingua.Portuguese: "pt",
lingua.Italian: "it",
lingua.Russian: "ru",
lingua.Arabic: "ar",
lingua.Hindi: "hi",
lingua.Dutch: "nl",
lingua.Polish: "pl",
lingua.Swedish: "sv",
lingua.Turkish: "tr",
lingua.Vietnamese: "vi",
lingua.Thai: "th",
lingua.Indonesian: "id",
lingua.Malay: "ms",
}
// DetectLanguage returns the two-letter code (see linguaToISO639) of the
// language lingua detects in text, or "" when the text is empty, lingua cannot
// decide, or the detected language has no entry in linguaToISO639.
//
// Newlines are replaced with spaces and only the first 2000 bytes are
// examined. Because that cut is byte-based it can split a multi-byte rune, so
// the text is passed through strings.ToValidUTF8 afterwards — the same
// sanitizing Tokenize applies — to keep invalid bytes away from the detector.
func (a *Analyzer) DetectLanguage(text string) string {
	text = strings.ReplaceAll(text, "\n", " ")
	if len(text) > 2000 {
		text = text[:2000]
	}
	text = strings.ToValidUTF8(text, "")
	if text == "" {
		return ""
	}

	lang, exists := a.detector.DetectLanguageOf(text)
	if !exists {
		return ""
	}
	// A missing map entry yields the zero value "", i.e. "unsupported".
	return linguaToISO639[lang]
}
// ---- sorting ----

// sortKeywords orders kws in place by Weight, highest first, using an
// insertion sort. Entries with equal weight keep their relative order.
func sortKeywords(kws []Keyword) {
	for i := 1; i < len(kws); i++ {
		// Bubble element i towards the front past every lighter neighbour.
		for j := i; j > 0 && kws[j-1].Weight < kws[j].Weight; j-- {
			kws[j-1], kws[j] = kws[j], kws[j-1]
		}
	}
}
// isASCIIAlnum reports whether s is non-empty and consists solely of ASCII
// letters and digits.
func isASCIIAlnum(s string) bool {
	if s == "" {
		return false
	}
	// Any byte of a multi-byte (or invalid) UTF-8 sequence is >= 0x80 and so
	// fails the checks below, which makes a byte scan sufficient.
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case 'a' <= c && c <= 'z':
		case 'A' <= c && c <= 'Z':
		case '0' <= c && c <= '9':
		default:
			return false
		}
	}
	return true
}
+533
View File
@@ -0,0 +1,533 @@
// Package backlink computes backlink (prosperity) scores for all known domains,
// using a PageRank-like algorithm over the site-level link graph.
//
// It runs every 48 hours and writes savedata/prosper.json.
package backlink
import (
"encoding/json"
"log"
"math"
"math/rand"
"os"
"path/filepath"
"strings"
"time"
"sese-engine/storage"
)
// Runner performs the backlink computation, either on a schedule (Run) or on
// demand (RunNow).
type Runner struct {
db *storage.DB // site database iterated with ForEachSite
storagePath string // directory that receives prosper.json
}
// New returns a Runner that reads site data from db and writes its result
// file (prosper.json) into storagePath.
func New(db *storage.DB, storagePath string) *Runner {
return &Runner{db: db, storagePath: storagePath}
}
// Run blocks forever, alternating between sleeping and recomputing.
//
// Each iteration targets 02:00 local time of the current day; if that moment
// has already passed, the target is pushed 48 hours later. In practice the
// first run happens at the next such target and, because every computation
// starts at or after 02:00, later runs follow two days apart. Computation
// errors are logged and do not stop the loop.
func (r *Runner) Run() {
	for {
		now := time.Now()
		next := time.Date(now.Year(), now.Month(), now.Day(), 2, 0, 0, 0, now.Location())
		if !next.After(now) {
			next = next.Add(48 * time.Hour)
		}
		wait := next.Sub(now)
		log.Printf("[backlink] next run at %v (in %v)", next.Format(time.RFC3339), wait.Round(time.Minute))
		time.Sleep(wait)

		log.Printf("[backlink] starting computation at %v", time.Now().Format(time.RFC3339))
		err := r.compute()
		if err == nil {
			log.Printf("[backlink] done")
			continue
		}
		log.Printf("[backlink] error: %v", err)
	}
}
// RunNow performs a single computation immediately, bypassing the schedule
// used by Run (intended for tests and manual triggers). It returns the error
// reported by compute, if any.
func (r *Runner) RunNow() error {
return r.compute()
}
// ---- computation ----
// siteStats holds occurrence counts gathered by collectStats over all sites.
// After collectStats returns, every map except serverCount only contains
// entries seen at least 4 times.
type siteStats struct {
subdomainCount map[string]int // superDomain(host) → number of hosts under it
templateCount map[string]int // SiteInfo.HTMLStructure → number of hosts sharing it
sameIPCount map[string]int // ipPrefix(SiteInfo.IPs) → number of hosts sharing it
serverCount map[string]int // sorted, comma-joined SiteInfo.ServerTypes → number of hosts
}
// compute runs one full backlink calculation and writes the merged scores to
// <storagePath>/prosper.json.
//
// Three passes are made over the site database:
//  1. sites whose HTTPSAvailable flag is set and true ("https_backlink");
//  2. an "echo" pass weighted by the scores from pass 1;
//  3. all remaining sites ("http_backlink").
//
// For every key the merged score is pass1 + echo + min(0.5*pass1 + 0.1*pass3,
// pass3); only scores above 0.16 are kept. The only error returned is a
// failure to write the result file.
func (r *Runner) compute() error {
	stats := r.collectStats()

	hasHTTPS := func(info *storage.SiteInfo) bool {
		return info.HTTPSAvailable != nil && *info.HTTPSAvailable
	}
	httpsScores := r.aggregate(hasHTTPS, stats, "https_backlink")
	echoScores := r.aggregateWithScores(httpsScores, stats, "echo")
	httpScores := r.aggregate(func(info *storage.SiteInfo) bool {
		return !hasHTTPS(info)
	}, stats, "http_backlink")

	merged := make(map[string]float64)
	for key := range union(httpsScores, httpScores, echoScores) {
		score := httpsScores[key] + echoScores[key] + math.Min(httpsScores[key]*0.5+httpScores[key]*0.1, httpScores[key])
		if score > 0.16 {
			merged[key] = score
		}
	}

	out := filepath.Join(r.storagePath, "prosper.json")
	if err := writeJSON(out, merged); err != nil {
		return err
	}
	log.Printf("[backlink] wrote %d entries to %s", len(merged), out)
	return nil
}
// collectStats walks every site once and counts how many hosts share the same
// super-domain, HTML structure, IP prefix and server-type set.
//
// Super-domain, template and IP-prefix counts below 4 are discarded
// afterwards; server counts are kept in full. An error from ForEachSite is
// ignored, so the statistics may then be partial.
func (r *Runner) collectStats() *siteStats {
	stats := &siteStats{
		subdomainCount: map[string]int{},
		templateCount:  map[string]int{},
		sameIPCount:    map[string]int{},
		serverCount:    map[string]int{},
	}

	_ = r.db.ForEachSite(func(host string, info *storage.SiteInfo) error {
		stats.subdomainCount[superDomain(host)]++
		if tpl := info.HTMLStructure; tpl != "" {
			stats.templateCount[tpl]++
		}
		if len(info.IPs) > 0 {
			stats.sameIPCount[ipPrefix(info.IPs)]++
		}
		if len(info.ServerTypes) > 0 {
			stats.serverCount[strings.Join(sortedStrings(info.ServerTypes), ",")]++
		}
		return nil
	})

	// dropRare removes entries that occur fewer than 4 times.
	dropRare := func(counts map[string]int) {
		for key, n := range counts {
			if n < 4 {
				delete(counts, key)
			}
		}
	}
	dropRare(stats.subdomainCount)
	dropRare(stats.templateCount)
	dropRare(stats.sameIPCount)
	return stats
}
// aggregate computes a backlink score map over the sites accepted by filter
// (a nil filter accepts every site).
//
// For each accepted site with a non-zero computeMul multiplier and at least
// one out-link, every out-link is expanded by decomposeURL into its host and
// path prefixes, and each distinct prefix ("segment") receives a contribution
// of min(accumulated link weight, 0.15) * multiplier. The raw scores are then
// adjusted by vectorFilter and entries at or below 0.16 are dropped.
//
// desc labels log lines and is forwarded to vectorFilter. An error from
// ForEachSite is ignored.
func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats, desc string) map[string]float64 {
log.Printf("[backlink] aggregating: %s", desc)
// d: segment → accumulated score.
d := make(map[string]float64)
// ipSource: "segment-ipPrefix" → score contributed from that IP prefix.
ipSource := make(map[string]float64)
// Server-type key → slot 1..63 (most frequent first); unknown keys map to slot 0.
serverTable := buildServerTable(stats.serverCount)
// NOTE(review): vectorEntry is declared but never used in this function.
type vectorEntry struct {
domain string
vec []float32
}
// vectors: host segment → 64-slot vector of score received per server slot.
vectors := make(map[string][]float32)
pruneThreshold := 0.02
// i counts the sites that actually contributed; it drives periodic pruning.
i := 0
_ = r.db.ForEachSite(func(host string, info *storage.SiteInfo) error {
if filter != nil && !filter(info) {
return nil
}
mul := computeMul(host, info, stats)
if mul == 0 {
return nil
}
n := len(info.OutLinks)
if n == 0 {
return nil
}
// Each link carries 1/n of the site's vote, but never more than 1/50.
w := 1.0 / math.Max(float64(n), 50)
// xd: segment → summed link weight for this one site.
xd := make(map[string]float64)
for _, link := range info.OutLinks {
for _, seg := range decomposeURL(link) {
if _, exists := xd[seg]; !exists {
xd[seg] = w
} else {
xd[seg] += w
}
}
}
ipStr := ipPrefix(info.IPs)
serverType := ""
if len(info.ServerTypes) > 0 {
serverType = info.ServerTypes[0]
}
// NOTE(review): serverTable keys are the sorted, comma-joined ServerTypes
// built in collectStats, while this lookup uses only ServerTypes[0] — confirm
// that is intended for sites with several server types.
serverID := serverTable[serverType]
for seg, segW := range xd {
fw := math.Min(segW, 0.15) * mul
prev := d[seg]
d[seg] = prev + fw
// Once a segment already scores above 0.2, track how much of its score
// comes from this site's IP prefix (only for prefixes retained in
// stats.sameIPCount).
if prev > 0.2 {
if _, sameIP := stats.sameIPCount[ipStr]; ipStr != "" && sameIP {
key := seg + "-" + ipStr
// NOTE(review): d[seg] was already increased above, so this continue only
// skips the ipSource and vector updates — confirm the cap is not meant to
// suppress the score contribution itself.
if ipSource[key] > 0.4 {
continue
}
ipSource[key] += fw
}
}
// Host-level segments (no "/") above 0.21 also record which server slot
// the score came from.
if prev > 0.21 && !strings.Contains(seg, "/") && serverType != "" {
if vectors[seg] == nil {
vectors[seg] = make([]float32, 64)
}
vectors[seg][serverID] += float32(fw)
}
}
i++
if i%200000 == 0 {
// Every 200000 contributing sites: drop low scores and raise the bar by 10%.
for k, v := range d {
if v < pruneThreshold {
delete(d, k)
}
}
pruneThreshold *= 1.1
}
if i%400000 == 0 {
// Every 400000 contributing sites: drop negligible per-IP tallies.
for k, v := range ipSource {
if v < 0.04 {
delete(ipSource, k)
}
}
}
return nil
})
// Adjust scores by how closely each host's server-slot vector matches the overall one.
d = vectorFilter(d, vectors, desc)
// Keep only entries scoring above 0.16.
for k, v := range d {
if v <= 0.16 {
delete(d, k)
}
}
log.Printf("[backlink] %s: %d entries", desc, len(d))
return d
}
// aggregateWithScores runs a second aggregation pass in which only hosts that
// already have an entry in scores contribute, and their contribution is
// boosted by that score.
//
// The per-site multiplier is min(2, computeMul * log2(2 + score)). Segment
// contributions are computed as in aggregate, without the per-IP tracking and
// periodic pruning. The result is adjusted by vectorFilter and entries at or
// below 0.16 are dropped. An error from ForEachSite is ignored.
func (r *Runner) aggregateWithScores(scores map[string]float64, stats *siteStats, desc string) map[string]float64 {
log.Printf("[backlink] aggregating with scores: %s", desc)
// d: segment → accumulated score.
d := make(map[string]float64)
serverTable := buildServerTable(stats.serverCount)
// vectors: host segment → 64-slot vector of score received per server slot.
vectors := make(map[string][]float32)
_ = r.db.ForEachSite(func(host string, info *storage.SiteInfo) error {
// Skip hosts without a prior score and keys that contain a path component.
score, ok := scores[host]
if !ok || strings.Contains(host, "/") {
return nil
}
mul := computeMul(host, info, stats)
if mul == 0 {
return nil
}
trueMul := math.Min(2, mul*math.Log2(2+score))
n := len(info.OutLinks)
if n == 0 {
return nil
}
// Each link carries 1/n of the site's vote, but never more than 1/50.
w := 1.0 / math.Max(float64(n), 50)
// xd: segment → summed link weight for this one site.
xd := make(map[string]float64)
for _, link := range info.OutLinks {
for _, seg := range decomposeURL(link) {
xd[seg] += w
}
}
serverType := ""
if len(info.ServerTypes) > 0 {
serverType = info.ServerTypes[0]
}
serverID := serverTable[serverType]
for seg, segW := range xd {
fw := math.Min(segW, 0.15) * trueMul
d[seg] += fw
// Unlike aggregate, the 0.21 threshold is tested against the score
// after this contribution has been added.
if d[seg] > 0.21 && !strings.Contains(seg, "/") && serverType != "" {
if vectors[seg] == nil {
vectors[seg] = make([]float32, 64)
}
vectors[seg][serverID] += float32(fw)
}
}
return nil
})
d = vectorFilter(d, vectors, desc)
// Keep only entries scoring above 0.16.
for k, v := range d {
if v <= 0.16 {
delete(d, k)
}
}
return d
}
// ---- vector cosine filtering ----

// vectorFilter rescales scores according to how similar each host's
// server-slot vector is to the sum of all vectors.
//
// For every key in d scoring above 0.21 whose host part (the text before the
// first "/") has a non-zero vector, the score becomes
// max(score * (0.25 + 0.75*cos), 0.21), where cos is the cosine similarity to
// the summed vector, capped at 1.01. All other scores are copied unchanged.
// When the summed vector has zero length, d itself is returned untouched.
//
// As a side effect the per-host cosine values are written to
// "<desc>_cos.json" (relative to the working directory); a write failure is
// ignored.
func vectorFilter(d map[string]float64, vectors map[string][]float32, desc string) map[string]float64 {
	centroid := make([]float64, 64)
	for _, vec := range vectors {
		for slot, x := range vec {
			centroid[slot] += float64(x)
		}
	}
	centroidLen := norm64(centroid)
	if centroidLen == 0 {
		return d
	}

	adjusted := make(map[string]float64, len(d))
	for key, score := range d {
		adjusted[key] = score
		if !(score > 0.21) {
			continue
		}
		vec := vectors[strings.Split(key, "/")[0]]
		if vec == nil {
			continue
		}
		vecLen := float64(norm32(vec))
		if vecLen == 0 {
			continue
		}
		cos := dot32_64(vec, centroid) / (vecLen * centroidLen)
		if cos > 1.01 {
			cos = 1.01
		}
		adjusted[key] = math.Max(score*(0.25+cos*0.75), 0.21)
	}

	// Diagnostics only: cosine of every non-zero host vector.
	cosines := make(map[string]float64)
	for host, vec := range vectors {
		if vecLen := float64(norm32(vec)); vecLen > 0 {
			cosines[host] = dot32_64(vec, centroid) / (vecLen * centroidLen)
		}
	}
	_ = writeJSON(desc+"_cos.json", cosines)
	return adjusted
}
// ---- helpers ----
// computeMul returns the weight multiplier applied to a site's out-link votes,
// or 0 when the site should not vote at all.
//
// A site votes 0 when it has no out-links or was last visited more than 180
// days ago (a zero LastVisitTime is treated as Unix time 1640000000). The
// multiplier is the product of
//   - a freshness factor 0.99^days, and
//   - a crowding factor 1 / (max(count, 5)/5)^0.6, where count is the larger
//     of the host's super-domain count and 1.5x its HTML-template count from
//     stats.
//
// When count exceeds 1000 the site is randomly kept with probability
// 1000/count (otherwise it votes 0) and count is clamped to 1000, so the
// result is not deterministic for such sites.
func computeMul(host string, info *storage.SiteInfo, stats *siteStats) float64 {
	if len(info.OutLinks) == 0 {
		return 0
	}

	lastVisit := info.LastVisitTime
	if lastVisit == 0 {
		lastVisit = 1640000000
	}
	ageDays := (time.Now().Unix() - lastVisit) / (3600 * 24)
	if ageDays > 180 {
		return 0
	}

	templates := 1
	if info.HTMLStructure != "" {
		templates = max(stats.templateCount[info.HTMLStructure], 1)
	}
	siblings := max(stats.subdomainCount[superDomain(host)], 1)
	crowd := max(siblings, int(float64(templates)*1.5))
	if crowd > 1000 {
		if rand.Float64() > 1000.0/float64(crowd) {
			return 0
		}
		crowd = 1000
	}

	freshness := math.Pow(0.99, float64(ageDays))
	crowding := 1.0 / math.Pow(math.Max(float64(crowd), 5)/5, 0.6)
	return freshness * crowding
}
// superDomain returns the last two dot-separated labels of host
// ("www.example.com" → "example.com"). A host with fewer than two labels is
// returned unchanged. No public-suffix handling is done, so "a.b.co.uk"
// yields "co.uk".
func superDomain(host string) string {
	last := strings.LastIndex(host, ".")
	if last < 0 {
		return host
	}
	// Start right after the dot preceding the second-to-last label, or at 0
	// when there is no such dot (LastIndex returns -1).
	return host[strings.LastIndex(host[:last], ".")+1:]
}
// ipPrefix builds a grouping key from a site's IP addresses: the addresses are
// sorted, each one is cut at its last "." (so a dotted IPv4 address loses its
// final octet; strings without a "." are kept whole), and the pieces are
// joined with ",". An empty input gives "".
func ipPrefix(ips []string) string {
	if len(ips) == 0 {
		return ""
	}
	// sortedStrings returns a fresh copy, so it can be rewritten in place.
	prefixes := sortedStrings(ips)
	for i, ip := range prefixes {
		if dot := strings.LastIndex(ip, "."); dot > 0 {
			prefixes[i] = ip[:dot]
		}
	}
	return strings.Join(prefixes, ",")
}
// decomposeURL expands an http(s) URL into its host followed by every
// cumulative path prefix, all lower-cased and without the scheme:
//
//	"https://A.com/b/c" → ["a.com", "a.com/b", "a.com/b/c"]
//
// "?" and "#" are treated as path separators and trailing slashes are
// removed. It returns nil for other schemes, for an empty remainder, and when
// the remainder starts with "/" or "%".
func decomposeURL(rawURL string) []string {
	lowered := strings.ToLower(rawURL)

	var rest string
	switch {
	case strings.HasPrefix(lowered, "https://"):
		rest = lowered[len("https://"):]
	case strings.HasPrefix(lowered, "http://"):
		rest = lowered[len("http://"):]
	default:
		return nil
	}

	rest = strings.NewReplacer("?", "/", "#", "/").Replace(rest)
	rest = strings.TrimRight(rest, "/")
	if rest == "" || rest[0] == '/' || rest[0] == '%' {
		return nil
	}

	// Every "/" ends one prefix; the full string is the final prefix.
	var prefixes []string
	for i := 0; i < len(rest); i++ {
		if rest[i] == '/' {
			prefixes = append(prefixes, rest[:i])
		}
	}
	return append(prefixes, rest)
}
// buildServerTable assigns slot numbers to the most frequent server-type keys:
// the key with the highest count gets 1, the next 2, and so on up to 63. Keys
// beyond the 63 most frequent are absent from the table, so looking them up
// yields 0. The order among keys with equal counts is unspecified.
func buildServerTable(serverCount map[string]int) map[string]int {
	type entry struct {
		name  string
		count int
	}
	var ranked []entry
	for name, count := range serverCount {
		ranked = append(ranked, entry{name, count})
	}
	// Exchange sort, descending by count.
	for i := 0; i+1 < len(ranked); i++ {
		for j := i + 1; j < len(ranked); j++ {
			if ranked[i].count < ranked[j].count {
				ranked[i], ranked[j] = ranked[j], ranked[i]
			}
		}
	}

	const slots = 63
	table := make(map[string]int, slots)
	for rank, e := range ranked {
		if rank >= slots {
			break
		}
		table[e.name] = rank + 1
	}
	return table
}
// sortedStrings returns a new slice holding the elements of s in ascending
// order. The input slice is left untouched.
func sortedStrings(s []string) []string {
	out := make([]string, len(s))
	copy(out, s)
	// Insertion sort on the copy.
	for i := 1; i < len(out); i++ {
		for j := i; j > 0 && out[j] < out[j-1]; j-- {
			out[j], out[j-1] = out[j-1], out[j]
		}
	}
	return out
}
// norm64 returns the Euclidean length of v (0 for an empty vector).
func norm64(v []float64) float64 {
	var sumSq float64
	for i := 0; i < len(v); i++ {
		sumSq += v[i] * v[i]
	}
	return math.Sqrt(sumSq)
}
// norm32 returns the Euclidean length of v. The squares are accumulated in
// float32; only the square root is taken in float64.
func norm32(v []float32) float32 {
	var sumSq float32
	for i := 0; i < len(v); i++ {
		sumSq += v[i] * v[i]
	}
	return float32(math.Sqrt(float64(sumSq)))
}
// dot32_64 returns the dot product of a and b, widening a's elements to
// float64. It iterates over a, so b must be at least as long as a (otherwise
// indexing b panics).
func dot32_64(a []float32, b []float64) float64 {
	var total float64
	for i, x := range a {
		total += float64(x) * b[i]
	}
	return total
}
// union returns the set of keys that appear in at least one of maps. Values
// are ignored; nil maps contribute nothing.
func union(maps ...map[string]float64) map[string]bool {
	keys := map[string]bool{}
	for i := range maps {
		for k := range maps[i] {
			keys[k] = true
		}
	}
	return keys
}
// writeJSON serializes data as indented JSON and writes it to path with mode
// 0644, creating any missing parent directories (mode 0755) first.
//
// It returns the first error encountered: failing to create the parent
// directory, failing to marshal data, or failing to write the file. The
// directory error is reported directly instead of being discarded, so the
// caller sees the real cause rather than a follow-on write failure.
func writeJSON(path string, data interface{}) error {
	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
		return err
	}
	b, err := json.MarshalIndent(data, "", " ")
	if err != nil {
		return err
	}
	return os.WriteFile(path, b, 0o644)
}
// max returns the larger of a and b.
func max(a, b int) int {
	if b > a {
		return b
	}
	return a
}
+53
View File
@@ -0,0 +1,53 @@
// Package config holds all global configuration parameters for sese-engine.
package config
// Index / storage limits.
// NOTE(review): none of these are referenced in the code visible alongside this
// file; the descriptions below restate the original intent and should be
// confirmed against the harvester/storage packages.
const (
MaxURLsPerKey = 11000 // max URLs stored per index key
MaxSameDomainPerKey = 20 // max URLs from the same domain per key
BigCleanThreshold = 10000000 // flush in-memory index after this many rows
MaxNewURLsPerKey = 10000 // cap on new URLs added per key per flush
MinURLsForNewKey = 3 // discard new keys with fewer than this many URLs
)
// Crawler settings.
const (
SpiderName = "loli_spider" // name handed to the crawler's fetcher
CrawlerCooldown = 3 // seconds between requests to the same host (the crawler multiplies it by time.Second)
CrawlerWorkers = 22 // maximum number of pages fetched concurrently in one crawl epoch
CrawlFocus = 0.7 // exponent applied to per-domain URL counts when limiting domain concentration — higher keeps more URLs of a single domain
MaxKeywordsPerPage = 250 // keywords beyond this count are dropped before a page is sent to the harvester
MaxEpoch = 100 // presumably the default number of crawl epochs — confirm against main.go
ExpectedProsperRatio = 0.6 // fraction of queue that should be "prosperous" (high backlink) domains
EntryURL = "https://zh.wikipedia.org/" // presumably the default crawl entry point — confirm against main.go
)
// Search / ranking weights.
// NOTE(review): the consumers of these values (presumably the search package)
// are not visible alongside this file; the units below restate the original
// comments.
const (
UseOnlineSnippet = true
OnlineSnippetTimeout = 3 // seconds
WeightDailyDecay = 0.996
LanguageWeight = 0.5
ConsecutiveKeyWeight = 1.3
BacklinkWeight = 1.0
SearchServerPort = 80 // the README documents the search API as listening on :80
)
// Backlink computation.
const (
BacklinkBaseline = 200000 // normalization divisor for backlink scores (consumer not visible here — TODO confirm)
)
// StoragePath is the data directory, relative to the process working
// directory. The README's run example passes the same location via --storage.
const StoragePath = "./savedata"
// Prometheus ports, one per module.
// NOTE(review): the README states Prometheus is not integrated yet, so these
// are presumably reserved for later use.
const (
PromPortCrawler = 14950
PromPortHarvester = 14951
PromPortBacklink = 14952
PromPortSearch = 14953
)
// HarvesterAddr is the base URL of the harvester HTTP service; the crawler
// POSTs page keywords to HarvesterAddr + "/l".
const HarvesterAddr = "http://127.0.0.1:5000"
+588
View File
@@ -0,0 +1,588 @@
// crawler.go — BFS crawl loop, URL scheduling, and site-info updating.
package crawler
import (
	"bytes"
	"encoding/json"
	"log"
	"math"
	"math/rand"
	"net/http"
	"net/url"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"sese-engine/analyzer"
	"sese-engine/config"
	"sese-engine/parser"
	"sese-engine/storage"
)
// Stats holds crawl counters. The Crawler updates its own copy with
// atomic.AddInt64; GetStats returns a snapshot read with atomic.LoadInt64.
type Stats struct {
VisitedURLs int64 // URLs for which a fetch was attempted
SuccessURLs int64 // URLs that were fetched successfully
KeywordsFetched int64 // total keywords extracted and queued for the harvester
}
// Crawler orchestrates the BFS crawl: it fetches pages, extracts keywords,
// updates per-site information and schedules the next round of URLs.
type Crawler struct {
fetcher *Fetcher
db *storage.DB
analyzer *analyzer.Analyzer
prosperMap map[string]float64 // host → backlink score, supplied by the caller of New; only read by the crawler
stats Stats // updated atomically, see GetStats
}
// New creates a Crawler that stores results in db, analyses text with a and
// uses prosperMap (host → backlink score) when prioritising URLs. Its fetcher
// is created with config.SpiderName and a cooldown of config.CrawlerCooldown
// seconds.
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
return &Crawler{
fetcher: NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second),
db: db,
analyzer: a,
prosperMap: prosperMap,
}
}
// URLWeight pairs a discovered URL with its discovery weight. Crawler.Run sets
// the weight to 1/n, where n is the number of links returned for the page the
// URL was found on.
type URLWeight struct {
URL string
Weight float64
}
// Run performs a breadth-first crawl starting at entryURL and blocks until it
// finishes.
//
// The crawl proceeds in at most maxEpoch rounds. In each round every URL in
// the current queue is marked as seen and visited, with at most
// config.CrawlerWorkers visits in flight. Links returned by a visit that have
// not been seen are collected with weight 1/(links returned for that page) and
// passed to schedule, whose result becomes the next queue. The crawl stops
// early when a round discovers no new links.
func (c *Crawler) Run(entryURL string, maxEpoch int) {
	seen := make(map[string]bool)
	frontier := []string{entryURL}

	for epoch := 1; epoch <= maxEpoch; epoch++ {
		log.Printf("[crawler] epoch %d/%d queue=%d", epoch, maxEpoch, len(frontier))
		// seen is only written here, before any worker of this round starts.
		for _, u := range frontier {
			seen[u] = true
		}

		var (
			discovered []URLWeight
			guard      sync.Mutex // protects discovered
			pending    sync.WaitGroup
		)
		// slots is a counting semaphore bounding concurrent visits.
		slots := make(chan struct{}, config.CrawlerWorkers)

		visit := func(rawURL string) {
			defer pending.Done()
			defer func() { <-slots }()

			hrefs := c.visitURL(rawURL)
			if len(hrefs) == 0 {
				return
			}
			share := 1.0 / float64(len(hrefs))
			guard.Lock()
			defer guard.Unlock()
			for _, href := range hrefs {
				if !seen[href] {
					discovered = append(discovered, URLWeight{URL: href, Weight: share})
				}
			}
		}

		for _, u := range frontier {
			pending.Add(1)
			slots <- struct{}{}
			go visit(u)
		}
		pending.Wait()

		if len(discovered) == 0 {
			log.Println("[crawler] empty queue — stopping")
			return
		}
		frontier = c.schedule(discovered)
	}
}
// visitURL fetches rawURL and processes the page: it caches a snippet, sends
// the extracted keywords to the harvester, updates the site info of the final
// host, records reported redirects on their source hosts, and returns the
// links found on the page (randomly sampled down to 100 when there are more).
//
// On a fetch failure it records the failure for the host of rawURL and
// returns nil. Storage errors are ignored throughout.
func (c *Crawler) visitURL(rawURL string) []string {
atomic.AddInt64(&c.stats.VisitedURLs, 1)
res, err := c.fetcher.fetchWithHistory(rawURL, true, 10*time.Second, 0)
if err != nil || res == nil {
c.updateSiteFailure(rawURL)
return nil
}
atomic.AddInt64(&c.stats.SuccessURLs, 1)
title, desc, text, hrefs := parser.ParseHTML(res.Body, res.FinalURL)
// Cache a snippet for the page unless its URL is 250 bytes or longer.
if len(res.FinalURL) < 250 {
_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
Title: title,
Description: truncate(desc, 256),
Text: truncate(text, 256),
Timestamp: time.Now().Unix(),
})
}
// Keyword extraction → send to harvester
kws := c.analyzer.Analyze(title, desc, text)
if len(kws) > 0 {
// Analyze returns keywords sorted by weight, so this keeps the heaviest ones.
if len(kws) > config.MaxKeywordsPerPage {
kws = kws[:config.MaxKeywordsPerPage]
}
atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
// Fire-and-forget: the POST runs in its own goroutine and is never awaited.
go c.sendToHarvester(res.FinalURL, kws)
}
// Update site info
host := netloc(res.FinalURL)
c.updateSiteSuccess(host, res, title, desc, text, hrefs)
// Record each redirect reported by the fetcher in the site info of the
// redirect's source host.
for from, to := range res.Redirects {
fromHost := netloc(from)
if fromHost == "" {
continue
}
// NOTE(review): info is dereferenced without a nil check and the read
// error is discarded — confirm GetSiteInfo never returns a nil info.
info, _ := c.db.GetSiteInfo(fromHost)
if info.Redirects == nil {
info.Redirects = make(map[string]string)
}
info.Redirects[from] = to
if len(info.Redirects) > 50 {
// Shrink to 40 entries; which ones survive depends on map iteration order.
info.Redirects = truncateMap(info.Redirects, 40)
}
_ = c.db.SetSiteInfo(fromHost, info)
}
// Return at most 100 links, chosen at random when there are more.
if len(hrefs) > 100 {
hrefs = sampleStrings(hrefs, 100)
}
return hrefs
}
// updateSiteFailure records a failed fetch for the host of rawURL by scaling
// the host's success rate by 0.99. A host without a recorded rate starts at 0.
// URLs without a host are ignored, as are storage errors.
func (c *Crawler) updateSiteFailure(rawURL string) {
	host := netloc(rawURL)
	if host == "" {
		return
	}

	info, _ := c.db.GetSiteInfo(host)
	if info.SuccessRate == nil {
		info.SuccessRate = new(float64)
	}
	*info.SuccessRate *= 0.99
	_ = c.db.SetSiteInfo(host, info)
}
// updateSiteSuccess records a successful fetch in the site info of host: it
// bumps the visit count and timestamp, moves the success rate towards 1,
// notes HTTPS availability and the server type, and — on a sampled subset of
// visits — updates the language mix and the list of external out-links.
// Storage errors are ignored.
func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc, text string, hrefs []string) {
// NOTE(review): info is used without a nil check and the read error is
// discarded — confirm GetSiteInfo never returns a nil info.
info, _ := c.db.GetSiteInfo(host)
info.VisitCount++
info.LastVisitTime = time.Now().Unix()
one := 1.0
if info.SuccessRate == nil {
info.SuccessRate = &one
}
// Exponential moving average towards 1 with a step of 0.01.
*info.SuccessRate = *info.SuccessRate*0.99 + 0.01
// The flag is only ever set to true here; it is never reset to false.
if strings.HasPrefix(res.FinalURL, "https://") {
t := true
info.HTTPSAvailable = &t
}
// Remember the server type if it is new, keeping only the 5 most recent.
if res.ServerType != "" {
found := false
for _, s := range info.ServerTypes {
if s == res.ServerType {
found = true
break
}
}
if !found {
info.ServerTypes = append(info.ServerTypes, res.ServerType)
if len(info.ServerTypes) > 5 {
info.ServerTypes = info.ServerTypes[len(info.ServerTypes)-5:]
}
}
}
// Sampled work: always while the visit count is below 10, afterwards on
// roughly 10% of visits. Both the language update and the out-link
// collection below sit inside this branch.
if info.VisitCount < 10 || rand.Float64() < 0.1 {
lang := c.analyzer.DetectLanguage(title + " " + desc + " " + text)
if lang != "" {
if info.Languages == nil {
info.Languages = make(map[string]float64)
}
// Decay every language share by the step size, then credit the detected
// language; the step is capped at 0.2 and shrinks as visits accumulate.
intensity := math.Min(0.2, 1/math.Sqrt(float64(info.VisitCount+1)))
for k := range info.Languages {
info.Languages[k] *= (1 - intensity)
}
info.Languages[lang] += intensity
}
// Collect links whose super-domain differs from this page's.
superHost := superNetloc(res.FinalURL)
var external []string
for _, h := range hrefs {
if superNetloc(h) != superHost {
external = append(external, h)
}
}
// Add up to 10 random external links; once more than 250 are stored,
// randomly thin the list down to 200.
sampled := sampleStrings(external, 10)
info.OutLinks = append(info.OutLinks, sampled...)
if len(info.OutLinks) > 250 {
info.OutLinks = sampleStrings(info.OutLinks, 200)
}
}
_ = c.db.SetSiteInfo(host, info)
}
// sendToHarvester POSTs the keywords extracted from finalURL to the harvester
// service at config.HarvesterAddr + "/l" as JSON of the form
// {"url": ..., "keywords": [...]}.
//
// The call is best effort: failures are logged and otherwise ignored. The
// request uses a client with a 10-second timeout instead of
// http.DefaultClient (which has none), because visitURL launches this method
// in a fire-and-forget goroutine and a stalled harvester would otherwise pin
// that goroutine indefinitely. Non-2xx replies are logged as well.
func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
	type payload struct {
		URL      string             `json:"url"`
		Keywords []analyzer.Keyword `json:"keywords"`
	}

	data, err := json.Marshal(payload{URL: finalURL, Keywords: kws})
	if err != nil {
		log.Printf("[crawler] harvester payload encode failed: %v", err)
		return
	}

	// A per-call Client is cheap: connection pooling lives in the shared
	// default Transport, which a zero-Transport Client uses.
	client := &http.Client{Timeout: 10 * time.Second}
	resp, err := client.Post(config.HarvesterAddr+"/l", "application/json", bytes.NewReader(data))
	if err != nil {
		log.Printf("[crawler] harvester post failed: %v", err)
		return
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		log.Printf("[crawler] harvester post rejected: %s", resp.Status)
	}
}
// schedule turns the links discovered in one epoch into the next crawl queue.
//
// Steps, in order:
//  1. randomly cap the input at 100000 links;
//  2. load the site info of every involved host and super-domain concurrently;
//  3. score each link with scoreURL;
//  4. draw min(45000, len/3+250) links by score-weighted sampling;
//  5. limit how many links a single super-domain may keep (concentrationFilter);
//  6. cap non-https links at a quarter of the https count;
//  7. thin out links to hosts without a backlink score so that they do not
//     outnumber scored hosts beyond config.ExpectedProsperRatio, while never
//     removing more than a tenth of the filtered selection;
//  8. shuffle.
func (c *Crawler) schedule(links []URLWeight) []string {
	if len(links) > 100000 {
		links = sampleURLWeights(links, 100000)
	}

	// Collect every host and super-domain whose site info scoring will need.
	hosts := make(map[string]bool)
	for _, lw := range links {
		for _, h := range [2]string{netloc(lw.URL), superNetloc(lw.URL)} {
			if h != "" {
				hosts[h] = true
			}
		}
	}

	// Load all site infos in parallel, one goroutine per host.
	siteCache := make(map[string]*storage.SiteInfo, len(hosts))
	var (
		cacheMu sync.Mutex
		loaders sync.WaitGroup
	)
	for h := range hosts {
		loaders.Add(1)
		go func(host string) {
			defer loaders.Done()
			info, _ := c.db.GetSiteInfo(host)
			cacheMu.Lock()
			siteCache[host] = info
			cacheMu.Unlock()
		}(h)
	}
	loaders.Wait()

	candidates := make([]scoredURL, 0, len(links))
	for _, lw := range links {
		candidates = append(candidates, scoredURL{url: lw.URL, score: c.scoreURL(lw, siteCache)})
	}

	picked := weightedSample(candidates, min(45000, len(candidates)/3+250))
	picked = concentrationFilter(picked, config.CrawlFocus)

	// Split by scheme and cap plain-http links at 1/4 of the https count.
	var secure, plain []string
	for _, u := range picked {
		if strings.HasPrefix(u, "https://") {
			secure = append(secure, u)
			continue
		}
		plain = append(plain, u)
	}
	if quota := len(secure) / 4; len(plain) > quota {
		plain = sampleStrings(plain, quota)
	}

	// Split by whether the host has a positive backlink score.
	var prosperURLs, otherURLs []string
	for _, u := range append(secure, plain...) {
		if c.prosperMap[netloc(u)] > 0 {
			prosperURLs = append(prosperURLs, u)
		} else {
			otherURLs = append(otherURLs, u)
		}
	}

	// n is the number of unscored links that matches the expected ratio.
	n := int(float64(len(prosperURLs)) * (1 - config.ExpectedProsperRatio) / config.ExpectedProsperRatio)
	if len(otherURLs) > n {
		keep := max(len(otherURLs)-len(picked)/10, n)
		if keep < len(otherURLs) {
			otherURLs = sampleStrings(otherURLs, keep)
		}
	}

	queue := append(prosperURLs, otherURLs...)
	rand.Shuffle(len(queue), func(i, j int) { queue[i], queue[j] = queue[j], queue[i] })
	return queue
}
// scoreURL computes the scheduling priority of a discovered URL as the
// product of:
//   - 0.1 + the Chinese share of the host's recorded languages (0.5 when the
//     host has no language data);
//   - 0.05 + an interest factor that decays with the visit count, taking the
//     smaller of the host's and its super-domain's factor; hosts with a higher
//     backlink score (capped at 62) decay more slowly;
//   - the host's stored quality (1 when unset);
//   - 1 - badURL(url);
//   - the link's discovery weight;
//   - log2(2 + prosperity) + 1, where prosperity is the capped backlink score
//     plus 0.5 when that score is positive.
//
// Hosts missing from siteCache are scored as if they had empty site info.
func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo) float64 {
	host, super := netloc(lw.URL), superNetloc(lw.URL)
	info := siteCache[host]
	if info == nil {
		info = &storage.SiteInfo{}
	}

	chineseness := 0.5
	if len(info.Languages) > 0 {
		var total float64
		for _, share := range info.Languages {
			total += share
		}
		if total > 0 {
			chineseness = info.Languages["zh"] / total
		}
	}

	// decay yields 0.1^(visits/limit) with limit = prosper*500 + 50, i.e. the
	// factor drops to 0.1 after limit visits.
	decay := func(prosper, visits float64) float64 {
		limit := prosper*500 + 50
		return math.Pow(math.Pow(0.1, 1/limit), visits)
	}

	prosper := math.Min(62, c.prosperMap[host])
	interest := decay(prosper, float64(info.VisitCount))
	interest2 := 1.0
	if super != host {
		if superInfo := siteCache[super]; superInfo != nil {
			interest2 = decay(math.Min(62, c.prosperMap[super]), float64(superInfo.VisitCount))
		}
	}

	quality := 1.0
	if info.Quality != nil {
		quality = *info.Quality
	}

	prosperity := prosper
	if prosperity > 0 {
		prosperity += 0.5
	}
	prosperity = math.Log2(2+prosperity) + 1

	bad := badURL(lw.URL)
	return (0.1 + chineseness) * math.Min(0.05+interest, 0.05+interest2) * quality * (1 - bad) * lw.Weight * prosperity
}
// ---- helper functions ----
// netloc returns the authority part of rawURL (host plus any port or
// userinfo, exactly as written).
//
// URLs starting with a lower-case "http://" or "https://" take a fast path:
// everything up to the next "/" is returned without further validation. Any
// other input is parsed with url.Parse; "" is returned when parsing fails or
// the URL has no host.
func netloc(rawURL string) string {
	for _, scheme := range [...]string{"http://", "https://"} {
		if !strings.HasPrefix(rawURL, scheme) {
			continue
		}
		rest := rawURL[len(scheme):]
		if slash := strings.IndexByte(rest, '/'); slash >= 0 {
			return rest[:slash]
		}
		return rest
	}

	parsed, err := url.Parse(rawURL)
	if err != nil {
		return ""
	}
	return parsed.Host
}
// superNetloc returns the last two dot-separated labels of rawURL's netloc
// ("https://www.example.com/x" → "example.com"). A netloc with fewer than two
// labels is returned unchanged; no public-suffix handling is done.
func superNetloc(rawURL string) string {
	host := netloc(rawURL)
	last := strings.LastIndex(host, ".")
	if last < 0 {
		return host
	}
	// Start right after the dot before the second-to-last label, or at 0 when
	// there is none (LastIndex returns -1).
	return host[strings.LastIndex(host[:last], ".")+1:]
}
// badURL returns a penalty between 0 and 0.9 estimating how unattractive a URL
// is to crawl. It starts from a length term, (len-30)/200 floored at 0, and
// then closes part of the remaining gap to 1 for each matching trait:
//   - 30% when the URL contains ".htm" or ".php";
//   - 10% when it has more than two "/" (ignoring trailing ones);
//   - 30% when it is shorter than 5 bytes or its fifth byte is ':' (as in
//     "http:").
func badURL(u string) float64 {
	penalty := math.Max(0, float64(len(u)-30)/200.0)
	// bump moves the penalty the given fraction of the way towards 1.
	bump := func(fraction float64) {
		penalty += (1 - penalty) * fraction
	}

	if strings.Contains(u, ".htm") || strings.Contains(u, ".php") {
		bump(0.3)
	}
	if strings.Count(strings.TrimRight(u, "/"), "/") > 2 {
		bump(0.1)
	}
	if len(u) < 5 || u[4] == ':' {
		bump(0.3)
	}
	return math.Min(penalty, 0.9)
}
// truncate returns s shortened to at most n bytes without cutting a UTF-8
// encoded character in half.
//
// A plain s[:n] can end in the middle of a multi-byte rune (every CJK
// character is three bytes), leaving invalid UTF-8 in the stored snippet. When
// the byte at the cut position is a UTF-8 continuation byte, the cut is moved
// back — by at most three bytes, the longest possible run of continuation
// bytes — to the start of that rune, so the result may be slightly shorter
// than n. If no rune start is found within that window the input is not valid
// UTF-8 at that point and the plain byte cut is used. A non-positive n yields
// "".
func truncate(s string, n int) string {
	if len(s) <= n {
		return s
	}
	if n <= 0 {
		return ""
	}

	// isCont reports whether b is a UTF-8 continuation byte (bit pattern 10xxxxxx).
	isCont := func(b byte) bool { return b&0xC0 == 0x80 }

	cut := n
	for cut > 0 && n-cut < 3 && isCont(s[cut]) {
		cut--
	}
	if isCont(s[cut]) {
		return s[:n]
	}
	return s[:cut]
}
// sampleStrings returns n elements of s chosen uniformly at random without
// replacement. When s has n or fewer elements, s itself is returned (not a
// copy).
func sampleStrings(s []string, n int) []string {
	if n >= len(s) {
		return s
	}
	picked := make([]string, 0, n)
	for _, idx := range rand.Perm(len(s))[:n] {
		picked = append(picked, s[idx])
	}
	return picked
}
// sampleURLWeights returns n elements of s chosen uniformly at random without
// replacement. When s has n or fewer elements, s itself is returned (not a
// copy).
func sampleURLWeights(s []URLWeight, n int) []URLWeight {
	if n >= len(s) {
		return s
	}
	picked := make([]URLWeight, 0, n)
	for _, idx := range rand.Perm(len(s))[:n] {
		picked = append(picked, s[idx])
	}
	return picked
}
// scoredURL is a candidate URL together with its scheduling score.
type scoredURL struct {
	url   string
	score float64
}

// weightedSample draws k URLs from items without replacement, with selection
// probability proportional to score. When k is at least len(items), all URLs
// are returned in their original order; a non-positive k yields an empty
// slice.
//
// Every item gets the random priority E/score, where E is an Exp(1) variate,
// and the k smallest priorities win. This is the exponential-clock form of
// Efraimidis–Spirakis weighted sampling and follows the same distribution as
// repeatedly drawing one item proportionally to its score from the remaining
// ones, but costs one O(n log n) sort instead of O(n·k) rescans. Items whose
// score is zero, negative or NaN get an infinite priority, so they are only
// picked (in input order) after every positively scored item has been taken;
// they can no longer stall the sampling.
func weightedSample(items []scoredURL, k int) []string {
	if k >= len(items) {
		out := make([]string, len(items))
		for i := range items {
			out[i] = items[i].url
		}
		return out
	}
	if k <= 0 {
		return []string{}
	}

	type draw struct {
		idx      int
		priority float64
	}
	draws := make([]draw, len(items))
	for i := range items {
		priority := math.Inf(1)
		if w := items[i].score; w > 0 {
			priority = rand.ExpFloat64() / w
		}
		draws[i] = draw{idx: i, priority: priority}
	}
	// Ties (notably the infinite priorities) are broken by input position so
	// the comparison is a strict ordering.
	sort.Slice(draws, func(a, b int) bool {
		if draws[a].priority != draws[b].priority {
			return draws[a].priority < draws[b].priority
		}
		return draws[a].idx < draws[b].idx
	})

	out := make([]string, k)
	for i := range out {
		out[i] = items[draws[i].idx].url
	}
	return out
}
// concentrationFilter thins out urls so that no single domain dominates.
//
// URLs are shuffled and grouped by superNetloc. A group of size n has the
// quota floor(n^k). Each group keeps at most 1 + min(limit, quota) of its
// (shuffled) URLs, where limit is 10, or, when there is more than one domain,
// max(10, 60% of the summed quotas of every group except the largest one).
// The surviving URLs are returned in random order; urls itself is not
// modified.
func concentrationFilter(urls []string, k float64) []string {
	shuffled := make([]string, len(urls))
	copy(shuffled, urls)
	rand.Shuffle(len(shuffled), func(i, j int) { shuffled[i], shuffled[j] = shuffled[j], shuffled[i] })

	domainGroups := make(map[string][]string)
	for _, u := range shuffled {
		d := superNetloc(u)
		domainGroups[d] = append(domainGroups[d], u)
	}

	// quota is the size-dependent allowance of one domain group.
	quota := func(group []string) int {
		return int(math.Pow(float64(len(group)), k))
	}

	limit := 10
	if len(domainGroups) > 1 {
		// Sum of all quotas minus the single largest one. Tracking the
		// maximum directly avoids sorting and guarantees that it really is
		// the largest quota that gets excluded.
		total, largest := 0, 0
		for _, g := range domainGroups {
			q := quota(g)
			total += q
			if q > largest {
				largest = q
			}
		}
		limit = max(10, int(float64(total-largest)*0.6))
	}

	var result []string
	for _, g := range domainGroups {
		keep := min(len(g), 1+min(limit, quota(g)))
		result = append(result, g[:keep]...)
	}
	rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] })
	return result
}
// truncateMap caps m at n entries. A map that already fits is returned
// unchanged (same map); otherwise a new map holding n entries is built.
// Which entries survive depends on Go's map iteration order and is therefore
// unspecified.
func truncateMap(m map[string]string, n int) map[string]string {
	if len(m) <= n {
		return m
	}
	kept := make(map[string]string, n)
	for key, val := range m {
		if len(kept) >= n {
			break
		}
		kept[key] = val
	}
	return kept
}
// min returns the smaller of the two integers a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
// max returns the larger of the two integers a and b.
func max(a, b int) int {
	if b > a {
		return b
	}
	return a
}
// GetStats returns a copy of the crawler's counters for monitoring.
// Each counter is read with its own atomic load, so the three values are not
// guaranteed to come from the same instant.
func (c *Crawler) GetStats() Stats {
	var snapshot Stats
	snapshot.VisitedURLs = atomic.LoadInt64(&c.stats.VisitedURLs)
	snapshot.SuccessURLs = atomic.LoadInt64(&c.stats.SuccessURLs)
	snapshot.KeywordsFetched = atomic.LoadInt64(&c.stats.KeywordsFetched)
	return snapshot
}
+313
View File
@@ -0,0 +1,313 @@
// Package crawler implements the HTTP fetching layer with robots.txt compliance,
// per-host rate limiting, redirect tracking, and encoding detection.
package crawler
import (
"fmt"
"io"
"net/http"
"net/url"
"strings"
"sync"
"time"
"golang.org/x/net/html/charset"
)
// ErrCrawl marks an expected crawl failure (404, disallowed by robots.txt,
// wrong content type, ...) as opposed to an unexpected transport error.
type ErrCrawl struct {
	Msg string // human-readable reason for the failure
}

// Error implements the error interface; it returns Msg unchanged.
func (e *ErrCrawl) Error() string {
	return e.Msg
}
// FetchResult bundles the result of a successful fetch.
// fetchWithHistory builds it once the response has passed the status-code and
// Content-Type checks and its body has been decoded.
type FetchResult struct {
Body string // decoded HTML body (output of decodeBody)
FinalURL string // URL of the final request, i.e. after redirects
Redirects map[string]string // permanent (301/308) redirects followed on the way: from → to
ServerType string // value of the response's "Server" header
}
// Fetcher is a reusable HTTP client with robots.txt awareness and rate limiting.
// Both maps are keyed by host and each is guarded by its own mutex.
type Fetcher struct {
// client is created by NewFetcher (30 s timeout, at most 10 redirects).
// NOTE(review): fetchWithHistory and fetchRobots build their own
// http.Client per call and do not use this field — confirm whether it is
// still needed.
client *http.Client
userAgent string // sent as the User-Agent header and matched against robots.txt groups
cooldown time.Duration // minimum spacing rateLimit aims for between requests to one host
rateMu sync.Mutex // guards lastHit
lastHit map[string]time.Time // host → last request time
robotsMu sync.Mutex // guards robots
robots map[string]*robotsEntry // host → parsed robots
}
// robotsEntry is the cached robots.txt state of one host.
type robotsEntry struct {
rules []robotsRule // parsed groups; left empty when robots.txt could not be fetched (everything is then allowed)
fetchedAt time.Time // creation time; robotsAllowed refetches entries older than 24 hours
}
// robotsRule is one user-agent group of a robots.txt file.
type robotsRule struct {
userAgent string // value of the User-agent line; "*" applies to every crawler
disallow []string // Disallow values, matched as path prefixes by robotsAllowed
allow []string // Allow values, matched as path prefixes by robotsAllowed
}
// NewFetcher builds a Fetcher that identifies itself as userAgent and uses
// cooldown as its per-host rate-limit interval. The embedded HTTP client has
// a 30-second timeout and gives up after ten redirects; the per-host caches
// start out empty.
func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
	// limitRedirects aborts a redirect chain once ten hops were followed.
	limitRedirects := func(req *http.Request, via []*http.Request) error {
		if len(via) < 10 {
			return nil
		}
		return fmt.Errorf("too many redirects")
	}

	fetcher := &Fetcher{
		userAgent: userAgent,
		cooldown:  cooldown,
		lastHit:   map[string]time.Time{},
		robots:    map[string]*robotsEntry{},
	}
	fetcher.client = &http.Client{
		Timeout:       30 * time.Second,
		CheckRedirect: limitRedirects,
	}
	return fetcher
}
// Fetch downloads rawURL and returns the decoded page.
// With polite=true the per-host rate limit and robots.txt are honoured;
// polite=false skips both checks. timeout bounds the whole request and
// sizeLimit caps the number of body bytes read (see decodeBody).
// Expected failures are reported as *ErrCrawl.
func (f *Fetcher) Fetch(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
	result, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
	return result, err
}
// FetchSafe behaves like Fetch but swallows expected crawl failures: when the
// fetch fails with an *ErrCrawl it returns (nil, nil). Any other error is
// passed through unchanged.
func (f *Fetcher) FetchSafe(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
	result, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
	switch err.(type) {
	case *ErrCrawl:
		return nil, nil
	default:
		return result, err
	}
}
// fetchWithHistory performs the GET request for rawURL and records the
// permanent redirects followed on the way.
//
// When polite is true the per-host rate limit is applied first and the URL is
// then checked against the host's robots.txt. timeout bounds the whole
// request; sizeLimit is forwarded to decodeBody.
//
// Expected failures — unparsable URL, robots.txt disallow, HTTP status >= 400,
// non-HTML content type — are returned as *ErrCrawl. Transport and body
// decoding errors are returned unchanged.
func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
	}
	host := parsed.Host

	if polite {
		f.rateLimit(host)
		if !f.robotsAllowed(rawURL, host) {
			return nil, &ErrCrawl{Msg: "disallowed by robots.txt"}
		}
	}

	// Build the request before anything dereferences it: NewRequest can
	// fail, and a nil request must never reach Header.Set.
	req, err := http.NewRequest("GET", rawURL, nil)
	if err != nil {
		return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
	}
	req.Header.Set("User-Agent", f.userAgent)

	redirects := make(map[string]string)
	client := &http.Client{
		Timeout: timeout,
		CheckRedirect: func(req *http.Request, via []*http.Request) error {
			if len(via) >= 10 {
				return fmt.Errorf("too many redirects")
			}
			// req.Response is the response that triggered this hop; only
			// permanent redirects (301/308) are remembered.
			if req.Response != nil && (req.Response.StatusCode == 301 || req.Response.StatusCode == 308) {
				from := via[len(via)-1].URL.String()
				to := req.URL.String()
				redirects[from] = to
			}
			return nil
		},
	}

	resp, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	if resp.StatusCode == 404 {
		return nil, &ErrCrawl{Msg: "404 not found"}
	}
	if resp.StatusCode >= 400 {
		return nil, &ErrCrawl{Msg: fmt.Sprintf("HTTP %d", resp.StatusCode)}
	}

	ct := resp.Header.Get("Content-Type")
	if !strings.Contains(ct, "text/html") {
		return nil, &ErrCrawl{Msg: "not html: " + ct}
	}

	body, err := decodeBody(resp.Body, ct, sizeLimit)
	if err != nil {
		return nil, err
	}

	return &FetchResult{
		Body:       body,
		FinalURL:   resp.Request.URL.String(),
		Redirects:  redirects,
		ServerType: resp.Header.Get("Server"),
	}, nil
}
// rateLimit blocks until the caller may send its next request to host.
//
// Each call reserves a send slot: the later of "now" and "previous slot +
// cooldown". The reserved slot — not the call time — is stored in lastHit, so
// several callers queued for the same host are spaced a full cooldown apart
// instead of all waking up relative to the first request. The sleep happens
// after the mutex is released.
func (f *Fetcher) rateLimit(host string) {
	f.rateMu.Lock()
	now := time.Now()
	slot := now
	if last, ok := f.lastHit[host]; ok {
		if next := last.Add(f.cooldown); next.After(now) {
			slot = next
		}
	}
	f.lastHit[host] = slot

	// Opportunistically prune hosts that have been idle for a while once
	// the map has grown large.
	if len(f.lastHit) > 10000 {
		cutoff := now.Add(-f.cooldown * 2)
		for k, v := range f.lastHit {
			if v.Before(cutoff) {
				delete(f.lastHit, k)
			}
		}
	}
	f.rateMu.Unlock()

	if wait := slot.Sub(now); wait > 0 {
		time.Sleep(wait)
	}
}
// robotsAllowed reports whether rawURL may be crawled according to the
// robots.txt of host.
//
// The parsed robots.txt is cached per host and refetched after 24 hours.
// Groups are consulted in file order; a group applies when its user-agent is
// "*" or equals the fetcher's user-agent (case-insensitively). Within the
// first applicable group that has a matching pattern, the longest matching
// prefix decides and Allow wins a tie (as in RFC 9309). Empty patterns never
// match, and wildcards ("*", "$") are not interpreted. If nothing matches the
// URL is allowed; an unparsable URL is rejected.
//
// NOTE(review): group selection compares the whole User-Agent string, so a
// robots.txt group named after a product token only applies when userAgent is
// exactly that token — confirm against the configured user-agent.
func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
	f.robotsMu.Lock()
	entry, ok := f.robots[host]
	f.robotsMu.Unlock()

	if !ok || time.Since(entry.fetchedAt) > 24*time.Hour {
		// Fetch outside the lock so slow hosts do not block other lookups.
		entry = f.fetchRobots(host, rawURL)
		f.robotsMu.Lock()
		f.robots[host] = entry
		f.robotsMu.Unlock()
	}

	parsed, err := url.Parse(rawURL)
	if err != nil {
		return false
	}
	path := parsed.Path
	if path == "" {
		path = "/"
	}

	for _, rule := range entry.rules {
		if rule.userAgent != "*" && !strings.EqualFold(rule.userAgent, f.userAgent) {
			continue
		}
		allowLen := longestPrefixMatch(path, rule.allow)
		disallowLen := longestPrefixMatch(path, rule.disallow)
		if allowLen < 0 && disallowLen < 0 {
			continue // this group says nothing about the path
		}
		return allowLen >= disallowLen
	}
	return true
}

// longestPrefixMatch returns the length of the longest non-empty pattern that
// is a prefix of path, or -1 when no pattern matches.
func longestPrefixMatch(path string, patterns []string) int {
	best := -1
	for _, p := range patterns {
		if p != "" && len(p) > best && strings.HasPrefix(path, p) {
			best = len(p)
		}
	}
	return best
}
// fetchRobots downloads and parses robots.txt for host.
//
// The scheme is taken from exampleURL (http when it starts with "http://",
// https otherwise). The request has a 5-second timeout and at most 256 KiB of
// the body are read. Whenever robots.txt cannot be obtained — request cannot
// be built, transport error, non-200 status, read error — an entry without
// rules is returned, which robotsAllowed treats as "allow everything".
func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
	entry := &robotsEntry{fetchedAt: time.Now()}

	scheme := "https"
	if strings.HasPrefix(exampleURL, "http://") {
		scheme = "http"
	}
	robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)

	req, err := http.NewRequest("GET", robotsURL, nil)
	if err != nil {
		return entry
	}
	req.Header.Set("User-Agent", f.userAgent)

	client := &http.Client{Timeout: 5 * time.Second}
	resp, err := client.Do(req)
	if err != nil {
		return entry
	}
	// Close the body on every path, including non-200 replies; otherwise
	// the underlying connection is leaked.
	defer resp.Body.Close()

	if resp.StatusCode != 200 {
		return entry
	}

	body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
	if err != nil {
		return entry
	}

	entry.rules = parseRobots(string(body))
	return entry
}
// parseRobots is a minimal robots.txt parser.
//
// It understands "User-agent", "Allow" and "Disallow" lines (keys are
// case-insensitive); every other directive is ignored. Grouping follows
// RFC 9309:
//   - comments and blank lines are skipped and never end a group;
//   - consecutive User-agent lines share the rules that follow them;
//   - a User-agent line that comes after rule lines starts a new group.
//
// One robotsRule is emitted per user-agent of each group, in file order.
// Allow/Disallow lines that appear before any User-agent line are dropped.
func parseRobots(content string) []robotsRule {
	var (
		rules    []robotsRule
		agents   []string // user-agents of the group currently being read
		allow    []string
		disallow []string
		sawRule  bool // the current group already has Allow/Disallow lines
	)

	// flush emits the current group (one rule per agent) and resets state.
	flush := func() {
		for _, ua := range agents {
			rules = append(rules, robotsRule{userAgent: ua, disallow: disallow, allow: allow})
		}
		agents, allow, disallow, sawRule = nil, nil, nil, false
	}

	// Some servers prepend a UTF-8 byte-order mark.
	content = strings.TrimPrefix(content, "\ufeff")

	for _, line := range strings.Split(content, "\n") {
		if idx := strings.Index(line, "#"); idx >= 0 {
			line = line[:idx]
		}
		line = strings.TrimSpace(line)
		if line == "" {
			continue
		}

		parts := strings.SplitN(line, ":", 2)
		if len(parts) != 2 {
			continue
		}
		key := strings.ToLower(strings.TrimSpace(parts[0]))
		val := strings.TrimSpace(parts[1])

		switch key {
		case "user-agent":
			if sawRule {
				flush()
			}
			agents = append(agents, val)
		case "disallow":
			if len(agents) > 0 {
				disallow = append(disallow, val)
				sawRule = true
			}
		case "allow":
			if len(agents) > 0 {
				allow = append(allow, val)
				sawRule = true
			}
		}
	}
	flush()
	return rules
}
// decodeBody reads r — capped at sizeLimit bytes when sizeLimit is positive —
// and returns the content as a string. The bytes are passed through
// charset.NewReader, which is given contentType to pick the source encoding;
// if that reader cannot be created, the (limited) stream is read as-is.
func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error) {
	src := r
	if sizeLimit > 0 {
		src = io.LimitReader(r, int64(sizeLimit))
	}

	var (
		raw []byte
		err error
	)
	if decoded, convErr := charset.NewReader(src, contentType); convErr == nil {
		raw, err = io.ReadAll(decoded)
	} else {
		// Fall back to the undecoded bytes and hope they are UTF-8.
		raw, err = io.ReadAll(src)
	}
	if err != nil {
		return "", err
	}
	return string(raw), nil
}
+19
View File
@@ -0,0 +1,19 @@
module sese-engine
go 1.21
require (
github.com/andybalholm/brotli v1.1.0
github.com/pemistahl/lingua-go v1.4.0
github.com/yanyiwu/gojieba v1.4.4
go.etcd.io/bbolt v1.3.9
golang.org/x/net v0.23.0
)
require (
github.com/shopspring/decimal v1.3.1 // indirect
golang.org/x/exp v0.0.0-20221106115401-f9659909a136 // indirect
golang.org/x/sys v0.18.0 // indirect
golang.org/x/text v0.14.0 // indirect
google.golang.org/protobuf v1.31.0 // indirect
)
+36
View File
@@ -0,0 +1,36 @@
github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M=
github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg=
github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
github.com/pemistahl/lingua-go v1.4.0 h1:ifYhthrlW7iO4icdubwlduYnmwU37V1sbNrwhKBR4rM=
github.com/pemistahl/lingua-go v1.4.0/go.mod h1:ECuM1Hp/3hvyh7k8aWSqNCPlTxLemFZsRjocUf3KgME=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
github.com/yanyiwu/gojieba v1.4.4 h1:Iukkf8WlIfqAKtsGZjUhGR1ArKa7DtLDNmW8bvUI8JI=
github.com/yanyiwu/gojieba v1.4.4/go.mod h1:JUq4DddFVGdHXJHxxepxRmhrKlDpaBxR8O28v6fKYLY=
go.etcd.io/bbolt v1.3.9 h1:8x7aARPEXiXbHmtUwAIv7eV2fQFHrLLavdiJ3uzJXoI=
go.etcd.io/bbolt v1.3.9/go.mod h1:zaO32+Ti0PK1ivdPtgMESzuzL2VPoIG1PCQNvOdo/dE=
golang.org/x/exp v0.0.0-20221106115401-f9659909a136 h1:Fq7F/w7MAa1KJ5bt2aJ62ihqp9HDcRuyILskkpIAurw=
golang.org/x/exp v0.0.0-20221106115401-f9659909a136/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc=
golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs=
golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg=
golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE=
golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4=
golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8=
google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+327
View File
@@ -0,0 +1,327 @@
// Package harvester implements the index-writing server (port 5000).
//
// It receives (url, keywords) payloads from the crawler, accumulates them in
// memory, then flushes to the persistent inverted index when the in-memory
// row count exceeds the configured threshold.
package harvester
import (
	"encoding/json"
	"log"
	"math/rand"
	"net/http"
	"sort"
	"strings"
	"sync"
	"sync/atomic"

	"sese-engine/config"
	"sese-engine/info"
	"sese-engine/storage"
)
// Server is the harvester HTTP server.
// handleIngest appends incoming keyword rows to mem; flush swaps mem for an
// empty map and merges the old contents into db.
type Server struct {
db *storage.DB // persistent index read by lowThreshold/mergeKey and written by flush
// in-memory accumulator: keyword → [(weight, url)]
mem map[string][]storage.IndexEntry
memMu sync.Mutex // guards mem
rowCount int64 // approximate total in-memory rows; accessed via sync/atomic, reset to 0 by flush
flushMu sync.Mutex // only one flush at a time
infoSvc *info.Service // supplies Prosper scores to trim
}
// New returns a harvester Server that writes to db and ranks entries with
// infoSvc. The in-memory accumulator starts out empty.
func New(db *storage.DB, infoSvc *info.Service) *Server {
	srv := new(Server)
	srv.db = db
	srv.infoSvc = infoSvc
	srv.mem = map[string][]storage.IndexEntry{}
	return srv
}
// ingestPayload is the JSON body sent by the crawler.
// It is decoded by handleIngest from POST requests to "/l".
type ingestPayload struct {
// URL is the page the keywords belong to; handleIngest strips "\n" from
// it and rejects the request when the result is empty.
URL string `json:"url"`
// Keywords lists the words found on the page with their weights; each
// accepted pair becomes one storage.IndexEntry under the word.
Keywords []struct {
Word string `json:"word"`
Weight float32 `json:"weight"`
} `json:"keywords"`
}
// Handler returns the harvester's HTTP routes: "/l" is served by
// handleIngest.
func (s *Server) Handler() http.Handler {
	routes := http.NewServeMux()
	routes.Handle("/l", http.HandlerFunc(s.handleIngest))
	return routes
}
// handleIngest accepts a POSTed ingestPayload and adds its keywords to the
// in-memory accumulator.
//
// Non-POST requests get 405; undecodable JSON or an empty URL (after newline
// removal) get 400. For a keyword that already has more than 15 pending
// entries, a new entry is dropped when its weight is below lowThreshold.
// Once the approximate row count exceeds config.BigCleanThreshold a flush is
// started in the background. On success the body "ok" is written.
func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
		return
	}

	var payload ingestPayload
	err := json.NewDecoder(r.Body).Decode(&payload)
	if err != nil {
		http.Error(w, "bad json: "+err.Error(), http.StatusBadRequest)
		return
	}

	// Newlines are removed from the URL before it is stored.
	pageURL := strings.ReplaceAll(payload.URL, "\n", "")
	if pageURL == "" {
		http.Error(w, "empty url", http.StatusBadRequest)
		return
	}

	s.memMu.Lock()
	for i := range payload.Keywords {
		word, weight := payload.Keywords[i].Word, payload.Keywords[i].Weight
		pending := s.mem[word]

		// Early discard: only consulted once the word is already busy.
		if len(pending) > 15 && float64(weight) < s.lowThreshold(word) {
			continue
		}

		s.mem[word] = append(pending, storage.IndexEntry{Weight: weight, URL: pageURL})
		atomic.AddInt64(&s.rowCount, 1)
	}
	s.memMu.Unlock()

	if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold) {
		go s.flush()
	}

	w.Write([]byte("ok"))
}
// lowThreshold returns the minimum weight a new entry needs to be worth
// buffering for key.
//
// While the stored index for key holds fewer than config.MaxURLsPerKey
// entries the result is -1. Otherwise it is 5% of the weight found at rank
// config.MaxURLsPerKey (0-based index MaxURLsPerKey-1) among the stored
// entries. The error from GetIndex is ignored.
func (s *Server) lowThreshold(key string) float64 {
	stored, _ := s.db.GetIndex(key)
	if len(stored) < config.MaxURLsPerKey {
		return -1
	}

	weights := make([]float64, 0, len(stored))
	for _, entry := range stored {
		weights = append(weights, float64(entry.Weight))
	}

	cutoff := nthLargest(weights, config.MaxURLsPerKey-1)
	return cutoff * 0.05
}
// flush merges the in-memory accumulator into the persistent index.
//
// Only one flush runs at a time; a call that finds another flush in progress
// returns immediately. The accumulator is swapped for an empty map under
// memMu, then every key is merged with its stored entries by mergeKey (at
// most 8 merges run concurrently) and the results are written with a single
// BatchSetIndex call. If that write fails the error is logged and the
// snapshot is dropped; the success message is only logged after a successful
// write.
func (s *Server) flush() {
	if !s.flushMu.TryLock() {
		return // another flush is running
	}
	defer s.flushMu.Unlock()

	// Take ownership of the accumulated rows; ingestion continues into a
	// fresh map while the merge runs.
	s.memMu.Lock()
	snapshot := s.mem
	s.mem = make(map[string][]storage.IndexEntry)
	atomic.StoreInt64(&s.rowCount, 0)
	s.memMu.Unlock()

	log.Printf("[harvester] flushing %d keys", len(snapshot))

	type keyed struct {
		key     string
		entries []storage.IndexEntry
	}

	items := make([]keyed, 0, len(snapshot))
	for k, v := range snapshot {
		items = append(items, keyed{k, v})
	}
	rand.Shuffle(len(items), func(i, j int) { items[i], items[j] = items[j], items[i] })

	// Parallel merge. results is buffered for every item, so workers never
	// block on send; sem bounds the number of merges in flight.
	results := make(chan keyed, len(items))
	sem := make(chan struct{}, 8)
	for _, item := range items {
		sem <- struct{}{}
		go func(k string, newEntries []storage.IndexEntry) {
			defer func() { <-sem }()
			results <- keyed{k, s.mergeKey(k, newEntries)}
		}(item.key, item.entries)
	}

	// Collect exactly one result per item.
	batch := make(map[string][]storage.IndexEntry, len(items))
	for range items {
		r := <-results
		batch[r.key] = r.entries
	}

	if err := s.db.BatchSetIndex(batch); err != nil {
		log.Printf("[harvester] flush write error: %v", err)
		return
	}
	log.Printf("[harvester] flush done, %d keys written", len(batch))
}
// mergeKey combines newEntries with the entries already stored for key and
// returns the list that should be written back.
//
// A key with no stored entries and fewer than config.MinURLsForNewKey new
// ones is discarded (nil). Otherwise the lists are concatenated (new entries
// first) and de-duplicated by URL. With 2% probability a normalised-URL
// de-duplication is applied as well. The result is trimmed when it exceeds
// config.MaxURLsPerKey by more than 10%, or otherwise with 2% probability.
//
// NOTE(review): the GetIndex error is ignored, so a failed read looks like
// "no stored entries" — confirm that this cannot overwrite existing data.
func (s *Server) mergeKey(key string, newEntries []storage.IndexEntry) []storage.IndexEntry {
	stored, _ := s.db.GetIndex(key)

	isNewKey := len(stored) == 0
	if isNewKey && len(newEntries) < config.MinURLsForNewKey {
		return nil
	}

	combined := dedup(append(newEntries, stored...))

	if rand.Float64() < 0.02 {
		combined = dedupNormalised(combined)
	}

	overLimit := float64(len(combined)) > float64(config.MaxURLsPerKey)*1.1
	if overLimit || rand.Float64() < 0.02 {
		combined = trim(combined, s.infoSvc, config.MaxURLsPerKey, config.MaxSameDomainPerKey)
	}
	return combined
}
// ---- helpers ----
// dedup removes entries whose URL already occurred earlier in the slice,
// keeping the first occurrence of each URL and the original order.
// The input slice is not modified.
func dedup(entries []storage.IndexEntry) []storage.IndexEntry {
	known := make(map[string]struct{}, len(entries))
	unique := make([]storage.IndexEntry, 0, len(entries))
	for i := range entries {
		if _, dup := known[entries[i].URL]; dup {
			continue
		}
		known[entries[i].URL] = struct{}{}
		unique = append(unique, entries[i])
	}
	return unique
}
// dedupNormalised removes entries that point to the same page once the scheme
// and trailing slashes are ignored (see normaliseURL).
//
// Entries are ordered by URL length, longest first, and the first entry seen
// for each normalised URL is kept. The sort is stable, so among URLs of equal
// length the one that came first in the input wins. The input slice is not
// modified; the result is in longest-URL-first order.
func dedupNormalised(entries []storage.IndexEntry) []storage.IndexEntry {
	sorted := make([]storage.IndexEntry, len(entries))
	copy(sorted, entries)
	sort.SliceStable(sorted, func(i, j int) bool {
		return len(sorted[i].URL) > len(sorted[j].URL)
	})

	seen := make(map[string]bool, len(sorted))
	out := make([]storage.IndexEntry, 0, len(sorted))
	for _, e := range sorted {
		k := normaliseURL(e.URL)
		if seen[k] {
			continue
		}
		seen[k] = true
		out = append(out, e)
	}
	return out
}
// normaliseURL reduces u to a comparison key: one leading "https://" or
// "http://" is removed, then all trailing slashes are stripped.
func normaliseURL(u string) string {
	for _, scheme := range []string{"https://", "http://"} {
		if strings.HasPrefix(u, scheme) {
			u = u[len(scheme):]
			break // only a single scheme prefix is removed
		}
	}
	return strings.TrimRight(u, "/")
}
// trim reduces entries to the best-scoring ones: at most limit entries in
// total and at most sameDomainLimit per host.
//
// The effective score of an entry is weight * (1 + infoSvc.Prosper(url)).
// Scores are computed once per entry and the entries are sorted by score,
// highest first (stable, so equal scores keep their input order). Entries are
// then taken in that order; an entry is skipped when its host (lower-cased,
// or the whole URL when no host can be extracted) has already reached
// sameDomainLimit, unless the URL is a homepage. The input slice is not
// modified.
func trim(entries []storage.IndexEntry, infoSvc *info.Service, limit, sameDomainLimit int) []storage.IndexEntry {
	type ranked struct {
		entry storage.IndexEntry
		score float64
	}

	// Prosper takes a lock and does real work, so call it once per entry
	// instead of once per comparison.
	ranking := make([]ranked, len(entries))
	for i, e := range entries {
		ranking[i] = ranked{entry: e, score: float64(e.Weight) * (1 + infoSvc.Prosper(e.URL))}
	}
	sort.SliceStable(ranking, func(i, j int) bool {
		return ranking[i].score > ranking[j].score
	})

	// Capacity hint only; clamp it so an odd limit cannot panic in make.
	capHint := limit
	if capHint > len(ranking) {
		capHint = len(ranking)
	}
	if capHint < 0 {
		capHint = 0
	}

	domainCount := make(map[string]int)
	out := make([]storage.IndexEntry, 0, capHint)
	for _, r := range ranking {
		e := r.entry
		host := netloc(e.URL)
		if host == "" {
			host = e.URL
		}
		host = strings.ToLower(host)

		// Homepage URLs are admitted regardless of the per-domain cap.
		if !isHomepage(e.URL) && domainCount[host] >= sameDomainLimit {
			continue
		}
		domainCount[host]++
		out = append(out, e)
		if len(out) >= limit {
			break
		}
	}
	return out
}
// isHomepage reports whether u has no path component: after removing a
// leading "https://" and then a leading "http://", and stripping trailing
// slashes, no "/" remains.
func isHomepage(u string) bool {
	rest := strings.TrimPrefix(strings.TrimPrefix(u, "https://"), "http://")
	return !strings.Contains(strings.TrimRight(rest, "/"), "/")
}
// netloc returns the host part of an http:// or https:// URL — the text
// between the scheme separator and the next "/". Any other input yields "".
func netloc(rawURL string) string {
	var rest string
	switch {
	case strings.HasPrefix(rawURL, "https://"):
		rest = rawURL[len("https://"):]
	case strings.HasPrefix(rawURL, "http://"):
		rest = rawURL[len("http://"):]
	default:
		return ""
	}
	if slash := strings.IndexByte(rest, '/'); slash >= 0 {
		return rest[:slash]
	}
	return rest
}
// nthLargest returns the n-th largest value in values (0-indexed, so n == 0
// is the maximum). It returns 0 when n is negative or not smaller than
// len(values). The input slice is not modified.
func nthLargest(values []float64, n int) float64 {
	if n < 0 || n >= len(values) {
		return 0
	}
	// Sort a copy in descending order; O(len log len) regardless of n.
	ordered := make([]float64, len(values))
	copy(ordered, values)
	sort.Sort(sort.Reverse(sort.Float64Slice(ordered)))
	return ordered[n]
}
// ListenAndServe logs the listen address and serves the harvester's Handler
// on addr. It blocks until the server stops and returns the server's error.
func (s *Server) ListenAndServe(addr string) error {
	log.Printf("[harvester] listening on %s", addr)
	server := &http.Server{Addr: addr, Handler: s.Handler()}
	return server.ListenAndServe()
}
+206
View File
@@ -0,0 +1,206 @@
// Package info loads and serves auxiliary data: backlink scores, adjustment
// table, and blocked query words.
package info
import (
"encoding/json"
"math"
"os"
"path/filepath"
"strings"
"sync"
)
// Service loads the prosperity map, adjustment table, and blocked words.
// All three tables are replaced together by Reload; mu guards them.
type Service struct {
mu sync.RWMutex // guards the three tables below
prosperMap map[string]float64 // normalised backlink scores
adjustTable map[string]float64 // per-domain manual weight adjustments
blockedWords map[string]bool // set of words for which IsBlocked reports true
storagePath string // directory that contains prosper.json
}
// New returns a Service rooted at storagePath with all tables already loaded
// (it calls Reload once).
func New(storagePath string) *Service {
	svc := new(Service)
	svc.storagePath = storagePath
	svc.Reload()
	return svc
}
// Reload re-reads all data files from disk and swaps in the new tables.
//
// The files are read and decoded before the lock is taken, so readers
// (Prosper, Adjust, IsBlocked, ...) are only blocked for the pointer swap and
// not for the duration of the disk I/O.
func (s *Service) Reload() {
	prosper := loadProsperMap(s.storagePath)
	adjust := loadAdjustTable()
	blocked := loadBlockedWords()

	s.mu.Lock()
	s.prosperMap = prosper
	s.adjustTable = adjust
	s.blockedWords = blocked
	s.mu.Unlock()
}
// Prosper returns the backlink-based prosperity score of rawURL, computed by
// prosperFor from the currently loaded prosperity map. It holds the read lock
// for the duration of the computation.
func (s *Service) Prosper(rawURL string) float64 {
	s.mu.RLock()
	defer s.mu.RUnlock()
	table := s.prosperMap
	return prosperFor(rawURL, table)
}
// ProsperMap returns a copy of the full prosperity map. The copy is taken
// under the read lock and is independent of later Reload calls.
func (s *Service) ProsperMap() map[string]float64 {
	s.mu.RLock()
	defer s.mu.RUnlock()

	source := s.prosperMap
	snapshot := make(map[string]float64, len(source))
	for key, score := range source {
		snapshot[key] = score
	}
	return snapshot
}
// Adjust returns the manual weight multiplier configured for host, or 1.0
// when the adjustment table has no entry for it.
func (s *Service) Adjust(host string) float64 {
	s.mu.RLock()
	defer s.mu.RUnlock()

	factor, found := s.adjustTable[host]
	if !found {
		return 1.0
	}
	return factor
}
// IsBlocked reports whether word is present in the blocked-word list.
func (s *Service) IsBlocked(word string) bool {
	s.mu.RLock()
	defer s.mu.RUnlock()

	blocked := s.blockedWords[word]
	return blocked
}
// ---- loaders ----
// backlinkBaseline is the target total used by normalise: raw scores are
// rescaled so that the scores of all keys that contain no "/" sum to this
// value.
const backlinkBaseline = 200000.0
// loadProsperMap reads "prosper.json" from storagePath, decodes it as a
// key → score object and returns the normalised scores. A missing or
// undecodable file yields an empty map.
func loadProsperMap(storagePath string) map[string]float64 {
	file, err := os.Open(filepath.Join(storagePath, "prosper.json"))
	if err != nil {
		return map[string]float64{}
	}
	defer file.Close()

	var scores map[string]float64
	if decodeErr := json.NewDecoder(file).Decode(&scores); decodeErr != nil {
		return map[string]float64{}
	}
	return normalise(scores)
}
// normalise rescales the raw scores in d and lifts parent-domain scores.
//
// All scores are multiplied by one factor chosen so that the scores of keys
// without a "/" sum to backlinkBaseline; when that sum is 0, d itself is
// returned unchanged. Afterwards each key's score is propagated to its
// dot-suffixes (for "a.b.c": "b.c", then "c"): an existing suffix entry is
// raised to the key's score if it is lower, and the walk stops at the first
// suffix that has no entry. d is not modified.
func normalise(d map[string]float64) map[string]float64 {
	var hostTotal float64
	for key, score := range d {
		if strings.Contains(key, "/") {
			continue
		}
		hostTotal += score
	}
	if hostTotal == 0 {
		return d
	}

	scale := backlinkBaseline / hostTotal
	scaled := make(map[string]float64, len(d))
	for key, score := range d {
		scaled[key] = score * scale
	}

	// Propagate the maximum score up the domain tree.
	for key, score := range scaled {
		parent := key
		for {
			dot := strings.Index(parent, ".")
			if dot < 0 {
				break
			}
			parent = parent[dot+1:]
			current, known := scaled[parent]
			if !known {
				break
			}
			if current < score {
				scaled[parent] = score
			}
		}
	}
	return scaled
}
// loadAdjustTable reads the per-domain weight multipliers from
// "data/adjust.json" (relative to the working directory).
//
// The result is never nil: a missing file, a file that cannot be decoded, or
// a JSON null all yield an empty table, mirroring loadProsperMap. A decode
// failure discards whatever was read so far rather than returning a partial
// table.
func loadAdjustTable() map[string]float64 {
	f, err := os.Open(filepath.Join("data", "adjust.json"))
	if err != nil {
		return map[string]float64{}
	}
	defer f.Close()

	var m map[string]float64
	if err := json.NewDecoder(f).Decode(&m); err != nil || m == nil {
		return map[string]float64{}
	}
	return m
}
// loadBlockedWords reads the blocked query words from
// "data/blocked_words.json" (a JSON array of strings, relative to the working
// directory) and returns them as a set.
//
// A missing file or a file that cannot be decoded yields an empty set; a
// decode failure discards any partially decoded words, mirroring
// loadProsperMap.
func loadBlockedWords() map[string]bool {
	f, err := os.Open(filepath.Join("data", "blocked_words.json"))
	if err != nil {
		return map[string]bool{}
	}
	defer f.Close()

	var words []string
	if err := json.NewDecoder(f).Decode(&words); err != nil {
		return map[string]bool{}
	}

	m := make(map[string]bool, len(words))
	for _, w := range words {
		m[w] = true
	}
	return m
}
// prosperFor computes the prosperity score of rawURL from the score table pm.
//
// The URL is decomposed into its cumulative prefixes (see decomposeURL). Each
// prefix with a positive table score t contributes the level
// log2(2+2t)-1; prefixes that are missing or non-positive contribute 0.
// The first level seeds the accumulator — if it is 0 the result is 0 — and
// every later level l replaces the accumulator s with l + ln((s-l)/2 + 1).
// A positive final accumulator is returned with 0.1 added; anything else
// yields 0.
//
// NOTE(review): when a later level exceeds the accumulator by more than 2 the
// logarithm's argument is negative, the accumulator becomes NaN and the
// function returns 0 — confirm whether that is intended.
func prosperFor(rawURL string, pm map[string]float64) float64 {
	acc := 0.0
	for _, prefix := range decomposeURL(rawURL) {
		level := 0.0
		if raw := pm[prefix]; raw > 0 {
			level = math.Log2(2+raw*2) - 1
		}

		switch {
		case acc != 0:
			acc = level + math.Log((acc-level)/2+1)
		case level == 0:
			return 0
		default:
			acc = level
		}
	}
	if acc > 0 {
		return 0.1 + acc
	}
	return 0
}
// decomposeURL splits an http(s) URL into its cumulative prefixes:
// "domain.tld", "domain.tld/path", "domain.tld/path/sub", ...
//
// The URL is lower-cased, the scheme is removed, "?" and "#" are treated as
// path separators and trailing slashes are dropped. nil is returned for
// non-http(s) URLs, for an empty remainder, and when the remainder starts
// with "/", "%" or a space.
func decomposeURL(rawURL string) []string {
	u := strings.ToLower(rawURL)
	switch {
	case strings.HasPrefix(u, "https://"):
		u = u[len("https://"):]
	case strings.HasPrefix(u, "http://"):
		u = u[len("http://"):]
	default:
		return nil
	}

	u = strings.NewReplacer("?", "/", "#", "/").Replace(u)
	u = strings.TrimRight(u, "/")
	if u == "" {
		return nil
	}
	switch u[0] {
	case '/', '%', ' ':
		return nil
	}

	// Every "/" ends one prefix; the full string is the last prefix.
	var prefixes []string
	for i := 0; i < len(u); i++ {
		if u[i] == '/' {
			prefixes = append(prefixes, u[:i])
		}
	}
	return append(prefixes, u)
}
+90
View File
@@ -0,0 +1,90 @@
// sese-engine — Go rewrite
//
// All modules (harvester, search server, crawler, backlink calculator) are
// launched as goroutines from this single binary. The binary blocks until
// interrupted (Ctrl-C / SIGTERM).
//
// Usage:
//
// cd golang && go run . [--storage ./savedata] [--entry https://zh.wikipedia.org/]
package main
import (
"flag"
"fmt"
"log"
"os"
"os/signal"
"syscall"
"sese-engine/analyzer"
"sese-engine/backlink"
"sese-engine/config"
"sese-engine/crawler"
"sese-engine/harvester"
"sese-engine/info"
"sese-engine/search"
"sese-engine/storage"
)
// main wires all modules together and blocks until the process receives
// SIGINT or SIGTERM.
//
// Start-up order: storage, info service, analyzer, harvester (":5000"),
// search server (config.SearchServerPort), backlink calculator, crawler.
// The servers, the backlink calculator and the crawler each run in their own
// goroutine. On a shutdown signal main returns, which runs the deferred
// Close calls for the analyzer and the database.
func main() {
	storageDir := flag.String("storage", config.StoragePath, "path to savedata directory")
	entryURL := flag.String("entry", config.EntryURL, "BFS crawl entry URL")
	stopWords := flag.String("stopwords", "../data/标点符号.json", "path to stop-words JSON")
	flag.Parse()

	log.SetFlags(log.LstdFlags | log.Lshortfile)
	log.Printf("sese-engine starting  storage=%s  entry=%s", *storageDir, *entryURL)

	// ---- 1. Storage ----
	db, err := storage.Open(*storageDir)
	if err != nil {
		log.Fatalf("failed to open storage: %v", err)
	}
	defer db.Close()

	// ---- 2. Info service ----
	infoSvc := info.New(*storageDir)

	// ---- 3. Analyzer ----
	// modelPath is unused (lingua-go uses built-in language models, no external file needed)
	anal, err := analyzer.New("", *stopWords)
	if err != nil {
		// log.Fatalf exits the process without running deferred calls, so
		// close the database explicitly before bailing out.
		db.Close()
		log.Fatalf("failed to init analyzer: %v", err)
	}
	defer anal.Close()

	// ---- 4. Harvester (index write server on :5000) ----
	harvSrv := harvester.New(db, infoSvc)
	go func() {
		if err := harvSrv.ListenAndServe(":5000"); err != nil {
			log.Fatalf("[harvester] fatal: %v", err)
		}
	}()

	// ---- 5. Search server ----
	searchSrv := search.New(db, infoSvc, anal)
	go func() {
		addr := fmt.Sprintf(":%d", config.SearchServerPort)
		if err := searchSrv.ListenAndServe(addr); err != nil {
			log.Fatalf("[search] fatal: %v", err)
		}
	}()

	// ---- 6. Backlink calculator (runs every 48 h) ----
	bl := backlink.New(db, *storageDir)
	go bl.Run()

	// ---- 7. Crawler ----
	// The crawler receives a one-off copy of the prosperity map.
	prosperMap := infoSvc.ProsperMap()
	crawl := crawler.New(db, anal, prosperMap)
	go crawl.Run(*entryURL, config.MaxEpoch)

	log.Println("all modules started — press Ctrl-C to stop")

	// ---- Graceful shutdown ----
	quit := make(chan os.Signal, 1)
	signal.Notify(quit, os.Interrupt, syscall.SIGTERM)
	<-quit
	log.Println("shutdown signal received, exiting...")
}
+153
View File
@@ -0,0 +1,153 @@
// Package parser extracts title, description, text content, and links from HTML.
package parser
import (
"path"
"regexp"
"strings"
"golang.org/x/net/html"
)
// wsRe matches a run of one or more whitespace characters; ParseHTML uses it
// to collapse such runs inside text nodes into a single space.
var wsRe = regexp.MustCompile(`\s+`)
// ParseHTML walks an HTML document and extracts four things:
//
//   - title: the text of the last non-empty text node directly inside a
//     <title> element;
//   - description: the content of the first <meta name="description"> whose
//     content is non-empty;
//   - text: all other text nodes, whitespace-collapsed and joined by spaces;
//   - hrefs: the href of every <a>, with the fragment removed and resolved
//     against baseURL (empty results are skipped).
//
// <script>, <style> and <svg> subtrees are ignored entirely. If the document
// cannot be parsed all results are left at their zero values.
func ParseHTML(body, baseURL string) (title, description, text string, hrefs []string) {
	origin := baseFromURL(baseURL)
	docPath := pathFromURL(baseURL)

	root, err := html.Parse(strings.NewReader(body))
	if err != nil {
		return
	}

	var chunks []string
	var walk func(node *html.Node)
	walk = func(node *html.Node) {
		switch node.Type {
		case html.ElementNode:
			switch strings.ToLower(node.Data) {
			case "script", "style", "svg":
				// Skip the whole subtree; its text is never visited.
				return
			case "meta":
				var name, content string
				for _, attr := range node.Attr {
					switch strings.ToLower(attr.Key) {
					case "name":
						name = strings.ToLower(attr.Val)
					case "content":
						content = attr.Val
					}
				}
				if name == "description" && description == "" {
					description = content
				}
			case "a":
				target, _, _ := strings.Cut(attrVal(node, "href"), "#")
				if target != "" {
					if abs := resolveURL(origin, docPath, target); abs != "" {
						hrefs = append(hrefs, abs)
					}
				}
			}
		case html.TextNode:
			if node.Parent != nil {
				parentTag := ""
				if node.Parent.Type == html.ElementNode {
					parentTag = strings.ToLower(node.Parent.Data)
				}
				collapsed := strings.TrimSpace(wsRe.ReplaceAllString(node.Data, " "))
				switch {
				case collapsed == "":
					// whitespace-only node: nothing to record
				case parentTag == "title":
					title = collapsed
				default:
					chunks = append(chunks, collapsed)
				}
			}
		}
		for child := node.FirstChild; child != nil; child = child.NextSibling {
			walk(child)
		}
	}
	walk(root)

	text = strings.Join(chunks, " ")
	return
}
// attrVal returns the value of the first attribute of n whose lower-cased
// name equals key, or "" when there is none. key is expected in lower case.
func attrVal(n *html.Node, key string) string {
	for i := range n.Attr {
		if strings.ToLower(n.Attr[i].Key) != key {
			continue
		}
		return n.Attr[i].Val
	}
	return ""
}
// baseFromURL returns the "scheme://host" part of rawURL, i.e. everything
// before the first "/" that follows "://". A URL without such a slash is
// returned whole; a string without "://" yields "".
func baseFromURL(rawURL string) string {
	scheme, rest, hasScheme := strings.Cut(rawURL, "://")
	if !hasScheme {
		return ""
	}
	host, _, hasPath := strings.Cut(rest, "/")
	if !hasPath {
		return rawURL
	}
	return scheme + "://" + host
}
// pathFromURL returns the path of rawURL: the part starting at the first "/"
// after "://", cut off at the first "?" or "#". It returns "/" when the URL
// has no "://" or no path.
func pathFromURL(rawURL string) string {
	_, rest, hasScheme := strings.Cut(rawURL, "://")
	if !hasScheme {
		return "/"
	}
	start := strings.IndexByte(rest, '/')
	if start < 0 {
		return "/"
	}
	p := rest[start:]
	// Drop the query string and fragment, whichever starts first.
	if end := strings.IndexAny(p, "?#"); end >= 0 {
		p = p[:end]
	}
	return p
}
// resolveURL turns the href found on a page into an absolute http(s) URL.
//
// base is the page's "scheme://host" and basePath its path (see baseFromURL
// and pathFromURL); href is expected without a fragment. The result is ""
// when the link cannot or should not be followed:
//
//   - http:// and https:// URLs (any letter case) are returned unchanged;
//   - "//host/..." takes its scheme from base;
//   - any other scheme (mailto:, javascript:, tel:, data:, ...) yields "";
//   - without a usable base, relative links yield "";
//   - "/path" is appended to base;
//   - "?query" keeps the page's own path;
//   - everything else is resolved against the directory of basePath, with
//     "." and ".." collapsed. The query string is left untouched and a
//     trailing slash on the reference is preserved.
func resolveURL(base, basePath, href string) string {
	// Absolute URL
	if hasFoldPrefix(href, "http://") || hasFoldPrefix(href, "https://") {
		return href
	}
	// Protocol-relative
	if strings.HasPrefix(href, "//") {
		idx := strings.Index(base, "://")
		if idx < 0 {
			return ""
		}
		return base[:idx+1] + href
	}
	// Non-web schemes are not crawlable and must not be mistaken for
	// relative paths.
	if hasURLScheme(href) {
		return ""
	}
	if base == "" {
		return ""
	}
	// Absolute path
	if strings.HasPrefix(href, "/") {
		return base + href
	}

	// Keep the query out of path cleaning so that "//" or ".." inside it
	// survive unchanged.
	ref, query := href, ""
	if i := strings.Index(href, "?"); i >= 0 {
		ref, query = href[:i], href[i:]
	}
	if ref == "" {
		// Query-only reference: same document, different query.
		return base + basePath + query
	}

	// Relative path
	resolved := path.Clean(path.Dir(basePath) + "/" + ref)
	if strings.HasSuffix(ref, "/") && !strings.HasSuffix(resolved, "/") {
		resolved += "/" // path.Clean drops the trailing slash
	}
	return base + resolved + query
}

// hasFoldPrefix reports whether s starts with prefix, ignoring letter case.
func hasFoldPrefix(s, prefix string) bool {
	return len(s) >= len(prefix) && strings.EqualFold(s[:len(prefix)], prefix)
}

// hasURLScheme reports whether ref begins with a URI scheme as defined by
// RFC 3986: a letter followed by letters, digits, "+", "-" or ".", and then
// a colon.
func hasURLScheme(ref string) bool {
	for i := 0; i < len(ref); i++ {
		c := ref[i]
		switch {
		case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z':
			// letters are valid anywhere in a scheme
		case i > 0 && ('0' <= c && c <= '9' || c == '+' || c == '-' || c == '.'):
			// valid after the first character
		case c == ':':
			return i > 0
		default:
			return false
		}
	}
	return false
}
+693
View File
@@ -0,0 +1,693 @@
// Package search implements the user-facing search HTTP server.
package search
import (
"container/heap"
"encoding/json"
"log"
"math"
"net/http"
"net/url"
"regexp"
"sort"
"strings"
"sync"
"time"
"sese-engine/analyzer"
"sese-engine/config"
"sese-engine/info"
"sese-engine/parser"
"sese-engine/storage"
)
// Server is the search HTTP server.
// It answers "/search" requests from the inverted index in db.
type Server struct {
db *storage.DB // inverted index, read through GetIndex
infoSvc *info.Service // blocked-word lookups (IsBlocked) during query parsing
analyzer *analyzer.Analyzer // splits query terms into tokens (Segment)
httpCli *http.Client // for online snippet fetching
}
// New returns a search Server backed by db, infoSvc and the analyzer a.
// Its HTTP client uses a timeout of config.OnlineSnippetTimeout seconds.
func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
	snippetTimeout := time.Duration(config.OnlineSnippetTimeout) * time.Second

	srv := &Server{db: db, infoSvc: infoSvc, analyzer: a}
	srv.httpCli = &http.Client{Timeout: snippetTimeout}
	return srv
}
// Handler returns the search server's HTTP routes: "/search" is served by
// handleSearch.
func (s *Server) Handler() http.Handler {
	routes := http.NewServeMux()
	routes.Handle("/search", http.HandlerFunc(s.handleSearch))
	return routes
}
// ListenAndServe logs the listen address and serves the search Handler on
// addr. It blocks until the server stops and returns the server's error.
func (s *Server) ListenAndServe(addr string) error {
	log.Printf("[search] listening on %s", addr)
	server := &http.Server{Addr: addr, Handler: s.Handler()}
	return server.ListenAndServe()
}
// ---- search handler ----
// searchResponse is the JSON document written by handleSearch.
type searchResponse struct {
Tokens []string `json:"tokens"` // query tokens after segmentation and blocked-word filtering (at most 20)
Counts map[string]int `json:"counts"` // per token: number of index entries stored for it
Results []searchResult `json:"results"` // first value returned by query for the requested slice
Total int `json:"total"` // second value returned by query
}
// searchResult is the JSON shape of a single search hit.
// Snippet and Factors are left out of the JSON when they are nil/empty.
type searchResult struct {
Score float64 `json:"score"`
URL string `json:"url"`
Snippet *snippetInfo `json:"snippet,omitempty"`
Relevance map[string]float64 `json:"relevance"`
DomainCount int `json:"domain_count"`
Factors map[string]float64 `json:"factors,omitempty"`
}
// snippetInfo is the optional page summary attached to a searchResult.
// NOTE(review): the three fields mirror the title, description and text
// values returned by parser.ParseHTML — presumably that is their source;
// confirm where the snippet is built.
type snippetInfo struct {
Title string `json:"title"`
Description string `json:"description"`
Text string `json:"text"`
}
// siteRe recognises a "site:<value>" query term and captures <value>;
// handleSearch uses the capture as the site filter passed to query.
var siteRe = regexp.MustCompile(`^site:(.+)$`)
// handleSearch serves GET /search.
//
// Query parameters:
//   - q:     the search text; whitespace-separated terms, where a term of the
//     form "site:<host>" restricts results to that host and its subdomains.
//   - qh:    used only when q is empty; an additionally percent-encoded query.
//   - slice: "from:to" selecting a page of the ranking. It is accepted only
//     when from >= 0, to > from and the page holds at most 20 results;
//     otherwise the default 0:10 is used.
//
// The response is a JSON-encoded searchResponse.
func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Access-Control-Allow-Origin", "*")
	w.Header().Set("Content-Type", "application/json; charset=utf-8")

	params := r.URL.Query()
	q := params.Get("q")
	if q == "" {
		if qh := params.Get("qh"); qh != "" {
			// An undecodable qh is treated like an empty query.
			if decoded, err := url.PathUnescape(qh); err == nil {
				q = decoded
			}
		}
	}

	// Parse slice param "0:10"
	sliceFrom, sliceTo := 0, 10
	if sliceStr := params.Get("slice"); sliceStr != "" {
		parts := strings.SplitN(sliceStr, ":", 2)
		if len(parts) == 2 {
			a := atoi(parts[0])
			b := atoi(parts[1])
			if a >= 0 && b > a && b-a <= 20 {
				sliceFrom, sliceTo = a, b
			}
		}
	}

	// Parse tokens and site filter
	var tokens []string
	var siteFilter string
	for _, part := range strings.Fields(q) {
		if m := siteRe.FindStringSubmatch(part); len(m) > 1 {
			siteFilter = m[1]
			continue
		}
		for _, t := range s.analyzer.Segment(part, false) {
			if !s.infoSvc.IsBlocked(t) {
				tokens = append(tokens, t)
			}
		}
	}
	if len(tokens) > 20 {
		tokens = tokens[:20]
	}

	results, total := s.query(tokens, sliceFrom, sliceTo, siteFilter)

	// Count per keyword. Best effort: a failed index read reports 0.
	counts := make(map[string]int, len(tokens))
	for _, t := range tokens {
		entries, _ := s.db.GetIndex(t)
		counts[t] = len(entries)
	}

	resp := searchResponse{
		Tokens:  tokens,
		Counts:  counts,
		Results: results,
		Total:   total,
	}
	// The status line has already been committed by the time encoding can
	// fail, so the only useful handling is to record the failure.
	if err := json.NewEncoder(w).Encode(resp); err != nil {
		log.Printf("[search] encode response: %v", err)
	}
}
// query executes the multi-keyword search and returns one page of ranked
// results together with the total number of candidate URLs.
//
// tokens are the already-segmented query terms. from and to select the
// half-open range [from, to) of the final ranking. A non-empty siteFilter
// restricts candidates to that host and its subdomains. With no tokens it
// returns (nil, 0).
//
// Ranking proceeds in stages, each re-sorting by the running score:
//  1. relevance x backlink x URL-quality x per-host adjustment for every URL;
//  2. language and freshness multipliers for the top 256;
//  3. duplicate-title penalty and adjacent-keyword bonus for the top 80;
//  4. domain interleaving (rerank) to pick the requested page.
func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]searchResult, int) {
	if len(tokens) == 0 {
		return nil, 0
	}

	// Load the inverted index for each token. defVal is the weight assumed
	// for a URL that is missing from this token's posting list.
	type tokenIndex struct {
		token   string
		entries []storage.IndexEntry
		defVal  float64
	}
	tokenIndexes := make([]tokenIndex, 0, len(tokens))
	for _, t := range tokens {
		// Best effort: a failed read is treated as an empty posting list.
		entries, _ := s.db.GetIndex(t)
		defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(config.MaxURLsPerKey)
		if len(entries) >= config.MaxURLsPerKey {
			// The list is full, so unlisted URLs may simply have been cut
			// off: assume half the weight of the MaxURLsPerKey-th heaviest
			// entry.
			weights := make([]float64, len(entries))
			for i, e := range entries {
				weights[i] = float64(e.Weight)
			}
			sort.Sort(sort.Reverse(sort.Float64Slice(weights)))
			defVal = math.Max(1.0/10000, weights[config.MaxURLsPerKey-1]/2)
		}
		tokenIndexes = append(tokenIndexes, tokenIndex{t, entries, defVal})
	}

	// Build URL → per-token weights map
	urlWeights := make(map[string]map[string]float64)
	for _, ti := range tokenIndexes {
		for _, e := range ti.entries {
			if urlWeights[e.URL] == nil {
				urlWeights[e.URL] = make(map[string]float64)
			}
			urlWeights[e.URL][ti.token] = float64(e.Weight)
		}
	}

	// Site filter
	total := len(urlWeights)
	if siteFilter != "" {
		filtered := make(map[string]map[string]float64)
		for u, vs := range urlWeights {
			if matchSite(netloc(u), siteFilter) {
				filtered[u] = vs
			}
		}
		urlWeights = filtered
		total = len(urlWeights)
	}

	// Build default value map
	defVals := make(map[string]float64, len(tokenIndexes))
	for _, ti := range tokenIndexes {
		defVals[ti.token] = ti.defVal
	}

	// Compute relevance + initial score for each URL
	candidates := make([]candidate, 0, len(urlWeights))
	for u, vs := range urlWeights {
		rel := 1.0
		for _, ti := range tokenIndexes {
			vp := vs[ti.token]
			if vp == 0 {
				vp = defVals[ti.token]
			}
			if vp > 0.06 {
				// Dampen large weights logarithmically above 0.06.
				vp = math.Log((vp-0.06)*40+1)/40 + 0.06
			}
			rel *= vp
		}
		prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight
		bad := badURL(u)
		adjust := s.infoSvc.Adjust(netloc(u))
		score := rel * prosper * (1 - bad) * adjust * 0.1

		var vec [12]float64
		vec[0] = score
		vec[1] = rel
		vec[2] = prosper
		vec[3] = 1 - bad
		vec[4] = 1 // language multiplier placeholder
		vec[5] = 1 // repetition placeholder
		vec[6] = adjust
		vec[7] = 1 // time multiplier placeholder
		vec[8] = 1 // consecutive keyword placeholder
		vec[9] = 1 // keyword content placeholder
		vec[10] = 1 // URL time placeholder
		vec[11] = 0.1
		candidates = append(candidates, candidate{u, rel, vec})
	}

	// Order by the initial score so the costlier factors below are applied
	// only to the most promising candidates.
	sort.Slice(candidates, func(i, j int) bool {
		return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
	})

	// Apply site info factors to top 256
	now := time.Now().Unix()
	limit256 := 256
	if len(candidates) < 256 {
		limit256 = len(candidates)
	}
	var wg sync.WaitGroup
	for i := 0; i < limit256; i++ {
		wg.Add(1)
		go func(idx int) {
			defer wg.Done()
			// Each goroutine touches only its own element, so no lock is needed.
			c := &candidates[idx]
			siteInfo, _ := s.db.GetSiteInfo(netloc(c.url))

			langMul := languageMultiplier(siteInfo)
			siteTimeMul := timeMul(siteInfo, now)
			pageTimeMul := urlTimeMul(s.db, c.url, now)

			// The factor 10 cancels the provisional 0.1 applied above.
			c.scoreVec[0] = c.scoreVec[0] * 10 * langMul * siteTimeMul * pageTimeMul
			c.scoreVec[4] = langMul
			c.scoreVec[7] = siteTimeMul
			c.scoreVec[10] = pageTimeMul
		}(i)
	}
	wg.Wait()

	sort.Slice(candidates, func(i, j int) bool {
		return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
	})

	// Apply consecutive-keyword and repetition bonuses to top 80
	limit80 := 80
	if len(candidates) < 80 {
		limit80 = len(candidates)
	}
	titles := make([]string, limit80)
	for i := 0; i < limit80; i++ {
		if snippet, err := s.db.GetSnippet(candidates[i].url); err == nil {
			titles[i] = snippet.Title
		}
	}

	// Repetition penaliser
	for i := 0; i < limit80; i++ {
		h := repetitionSimilarity(titles, i)
		consecutive := consecutiveCount(titles[i], tokens)

		repMul := 1.0
		if h > 0.5 {
			repMul = 1 - (h - 0.5)
		}
		consMul := math.Pow(config.ConsecutiveKeyWeight, float64(consecutive))

		candidates[i].scoreVec[0] *= repMul * consMul
		candidates[i].scoreVec[5] = repMul
		candidates[i].scoreVec[8] = consMul
	}

	sort.Slice(candidates, func(i, j int) bool {
		return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
	})

	// Re-rank: interleave domains
	reranked := rerank(candidates, from, to)

	// Fetch snippets concurrently. Every goroutine writes to its own slot,
	// which keeps the ranking order without a lock or a follow-up sort. (A
	// sort keyed on the output URL would be wrong: the output URL is
	// unescaped and may differ from the indexed one.)
	results := make([]searchResult, len(reranked))
	var snippetWg sync.WaitGroup
	for i, c := range reranked {
		snippetWg.Add(1)
		go func(pos int, cand candidate) {
			defer snippetWg.Done()
			r := searchResult{
				Score:       cand.scoreVec[0],
				URL:         unescapeURL(cand.url),
				Snippet:     s.getSnippet(cand.url),
				Relevance:   make(map[string]float64, len(tokenIndexes)),
				DomainCount: 0,
				Factors: map[string]float64{
					"relevance":   cand.scoreVec[1],
					"backlink":    cand.scoreVec[2],
					"url_quality": cand.scoreVec[3],
					"language":    cand.scoreVec[4],
					"repetition":  cand.scoreVec[5],
					"adjust":      cand.scoreVec[6],
					"site_time":   cand.scoreVec[7],
					"consecutive": cand.scoreVec[8],
					"url_time":    cand.scoreVec[10],
				},
			}
			for _, ti := range tokenIndexes {
				r.Relevance[ti.token] = urlWeights[cand.url][ti.token]
			}
			results[pos] = r
		}(i, c)
	}
	snippetWg.Wait()

	return results, total
}
// getSnippet returns the snippet for rawURL, or nil when none is available.
//
// The snippet cache is consulted first. On a miss, and only when
// config.UseOnlineSnippet is set, the page is fetched (at most 60000 bytes of
// a 200 text/html response), parsed, stored in the cache and returned.
func (s *Server) getSnippet(rawURL string) *snippetInfo {
	// Try cache first
	if entry, err := s.db.GetSnippet(rawURL); err == nil {
		return buildSnippet(entry)
	}
	if !config.UseOnlineSnippet {
		return nil
	}

	// Fetch online with a simple HTTP client (no robots.txt check for search snippets)
	req, err := http.NewRequest("GET", rawURL, nil)
	if err != nil {
		return nil
	}
	req.Header.Set("User-Agent", config.SpiderName)
	resp, err := s.httpCli.Do(req)
	if err != nil {
		return nil
	}
	// Close on every path from here on, including non-200 responses;
	// otherwise the connection is leaked.
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil
	}
	if ct := resp.Header.Get("Content-Type"); !strings.Contains(ct, "text/html") {
		return nil
	}

	body := readBodyLimited(resp, 60000)
	title, desc, text, _ := parser.ParseHTML(body, resp.Request.URL.String())
	entry := &storage.SnippetEntry{
		Title:       title,
		Description: truncate(desc, 256),
		Text:        truncate(text, 256),
		Timestamp:   time.Now().Unix(),
	}
	// A failed cache write must not fail the search; just record it.
	if err := s.db.SetSnippet(rawURL, entry); err != nil {
		log.Printf("[search] cache snippet for %q: %v", rawURL, err)
	}
	return buildSnippet(entry)
}
// buildSnippet converts a cached snippet entry into its API form. It returns
// nil for a nil entry or one whose title, description and text are all empty.
func buildSnippet(entry *storage.SnippetEntry) *snippetInfo {
	if entry == nil {
		return nil
	}
	if entry.Title == "" && entry.Description == "" && entry.Text == "" {
		return nil
	}
	out := new(snippetInfo)
	out.Title = entry.Title
	out.Description = entry.Description
	out.Text = entry.Text
	return out
}
// ---- scoring helpers ----
// languageMultiplier scores a site by the languages recorded for it: the
// share of "zh" raises the multiplier and the share of anything other than
// "zh", "en" or "ja" lowers it, both scaled by config.LanguageWeight.
// It returns 1.0 when nothing usable is recorded.
func languageMultiplier(si *storage.SiteInfo) float64 {
	if si == nil || len(si.Languages) == 0 {
		return 1.0
	}
	total := 0.0
	for _, v := range si.Languages {
		total += v
	}
	// Entries that sum to zero would otherwise divide by zero and poison the
	// score (and every sort that compares it) with NaN.
	if total <= 0 {
		return 1.0
	}
	chinese := si.Languages["zh"] / total
	weird := (total - si.Languages["zh"] - si.Languages["en"] - si.Languages["ja"]) / total
	return 1 + chinese*config.LanguageWeight - weird*config.LanguageWeight
}
// timeMul returns the freshness multiplier for a site:
// config.WeightDailyDecay raised to the number of whole days since the
// site's last visit. The day count is clamped to [0, 180] and then reduced
// by one when positive. A zero LastVisitTime is replaced by the fixed
// timestamp 1648000000. A nil site yields 1.0. now is in Unix seconds.
func timeMul(si *storage.SiteInfo, now int64) float64 {
	if si == nil {
		return 1.0
	}
	const secondsPerDay = 3600 * 24

	last := si.LastVisitTime
	if last == 0 {
		last = 1648000000
	}

	age := (now - last) / secondsPerDay
	switch {
	case age < 0:
		age = 0
	case age > 180:
		age = 180
	}
	if age > 0 {
		age--
	}
	return math.Pow(config.WeightDailyDecay, float64(age))
}
// urlTimeMul returns the freshness multiplier for a single page, based on
// the timestamp of its cached snippet. Pages without a cached snippet, or
// whose snippet is at most 30 days old, get 1.0; older ones get
// ((2+config.WeightDailyDecay)/3) raised to the age in days.
//
// NOTE(review): the exponent is the full age, not the age beyond 30 days, so
// the multiplier jumps at the 30-day boundary — confirm this is intended.
func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 {
	cached, err := db.GetSnippet(rawURL)
	if err != nil || cached == nil {
		return 1.0
	}
	ageDays := (now - cached.Timestamp) / (3600 * 24)
	if ageDays > 30 {
		return math.Pow((2+config.WeightDailyDecay)/3, float64(ageDays))
	}
	return 1.0
}
// badURL estimates how undesirable a URL looks, as a penalty capped at 0.9.
// The penalty starts from the length beyond 30 bytes (divided by 200) and is
// then pushed towards 1 for ".htm"/".php" URLs, for URLs with more than two
// slashes (ignoring trailing ones), and for URLs shorter than five bytes or
// with ':' as their fifth byte (e.g. "http:").
func badURL(u string) float64 {
	penalty := math.Max(0, float64(len(u)-30)/200.0)

	// raise moves the penalty the given fraction of the remaining way to 1.
	raise := func(fraction float64) {
		penalty += (1 - penalty) * fraction
	}

	if strings.Contains(u, ".htm") || strings.Contains(u, ".php") {
		raise(0.3)
	}
	if trimmed := strings.TrimRight(u, "/"); strings.Count(trimmed, "/") > 2 {
		raise(0.1)
	}
	if len(u) < 5 || u[4] == ':' {
		raise(0.3)
	}
	return math.Min(penalty, 0.9)
}
// netloc returns the host part of an http:// or https:// URL (everything
// between the scheme and the next '/'). Any other input is returned
// unchanged.
func netloc(rawURL string) string {
	var rest string
	switch {
	case strings.HasPrefix(rawURL, "http://"):
		rest = rawURL[len("http://"):]
	case strings.HasPrefix(rawURL, "https://"):
		rest = rawURL[len("https://"):]
	default:
		return rawURL
	}
	if slash := strings.IndexByte(rest, '/'); slash >= 0 {
		return rest[:slash]
	}
	return rest
}
// matchSite reports whether host equals pattern or is a subdomain of it.
func matchSite(host, pattern string) bool {
	return host == pattern || strings.HasSuffix(host, "."+pattern)
}
// consecutiveCount counts the adjacent token pairs whose concatenation
// occurs in title.
func consecutiveCount(title string, tokens []string) int {
	hits := 0
	for i := 1; i < len(tokens); i++ {
		pair := tokens[i-1] + tokens[i]
		if strings.Contains(title, pair) {
			hits++
		}
	}
	return hits
}
// repetitionSimilarity returns how closely titles[idx] resembles any earlier
// title, as the best value of 1 - editDistance/longerLength over
// titles[:idx]. The result is in [0, 1]; it is 0 for the first title and for
// an empty title, and empty earlier titles are ignored.
//
// Lengths are counted in runes, matching levenshtein, which measures
// distance in runes. Normalising by byte length would make unrelated
// multi-byte (e.g. Chinese) titles look similar.
func repetitionSimilarity(titles []string, idx int) float64 {
	if idx <= 0 {
		return 0
	}
	t := titles[idx]
	if t == "" {
		return 0
	}
	tLen := len([]rune(t))
	best := 0.0
	for _, prev := range titles[:idx] {
		if prev == "" {
			continue
		}
		longer := len([]rune(prev))
		if tLen > longer {
			longer = tLen
		}
		sim := 1 - float64(levenshtein(t, prev))/float64(longer)
		if sim > best {
			best = sim
		}
	}
	return best
}
// levenshtein returns the edit distance between a and b, counted in runes
// (insertions, deletions and substitutions each cost 1).
func levenshtein(a, b string) int {
	src, dst := []rune(a), []rune(b)
	if len(src) == 0 {
		return len(dst)
	}
	if len(dst) == 0 {
		return len(src)
	}

	// Two rolling rows of the classic dynamic-programming table.
	above := make([]int, len(dst)+1)
	row := make([]int, len(dst)+1)
	for j := range above {
		above[j] = j
	}

	for i, sr := range src {
		row[0] = i + 1
		for j, dr := range dst {
			best := above[j] // substitution (free when the runes match)
			if sr != dr {
				best++
			}
			if ins := row[j] + 1; ins < best {
				best = ins
			}
			if del := above[j+1] + 1; del < best {
				best = del
			}
			row[j+1] = best
		}
		above, row = row, above
	}
	return above[len(dst)]
}
// min3 returns the smallest of its three arguments.
func min3(a, b, c int) int {
	smallest := a
	if b < smallest {
		smallest = b
	}
	if c < smallest {
		smallest = c
	}
	return smallest
}
// domainHeap is a max-heap of rerankItems ordered by score*domainMul. It
// implements heap.Interface and is used by rerank to interleave results from
// different domains.
type domainHeap []rerankItem

// rerankItem is one heap entry: a URL, its score, the multiplier of its
// domain at the time it was pushed, and its full score vector.
type rerankItem struct {
	score     float64
	url       string
	domainMul float64
	vec       [12]float64
}

// Len implements heap.Interface.
func (h domainHeap) Len() int {
	return len(h)
}

// Less implements heap.Interface; the item with the larger effective score
// sorts first.
func (h domainHeap) Less(i, j int) bool {
	left := h[i].score * h[i].domainMul
	right := h[j].score * h[j].domainMul
	return left > right
}

// Swap implements heap.Interface.
func (h domainHeap) Swap(i, j int) {
	h[j], h[i] = h[i], h[j]
}

// Push implements heap.Interface; x must be a rerankItem.
func (h *domainHeap) Push(x interface{}) {
	item := x.(rerankItem)
	*h = append(*h, item)
}

// Pop implements heap.Interface, removing and returning the last element.
func (h *domainHeap) Pop() interface{} {
	items := *h
	last := len(items) - 1
	popped := items[last]
	*h = items[:last]
	return popped
}
// candidate is a URL being ranked, together with its score breakdown.
type candidate struct {
url string
// relevance is the product of the per-token weights.
relevance float64
// scoreVec[0] is the running total score; the other slots record the
// individual factors (see where query fills the vector).
scoreVec [12]float64
}
// rerank interleaves results from different domains and returns positions
// [from, to) of the interleaved ranking.
//
// Each domain offers its best remaining candidate. The heap picks the one
// with the highest score x domain multiplier. Every time a domain is picked
// its multiplier is divided by 8, so further results from that domain are
// pushed down the list.
//
// The returned candidates carry only url and scoreVec.
func rerank(candidates []candidate, from, to int) []candidate {
	// Group by domain.
	domainItems := make(map[string][]candidate)
	for _, c := range candidates {
		host := netloc(c.url)
		domainItems[host] = append(domainItems[host], c)
	}

	h := &domainHeap{}
	heap.Init(h)
	domainMul := make(map[string]float64, len(domainItems))
	for domain, items := range domainItems {
		domainMul[domain] = 1.0
		// Ascending order, so the best remaining item is always the last one.
		sort.Slice(items, func(i, j int) bool {
			return items[i].scoreVec[0] < items[j].scoreVec[0]
		})
		top := items[len(items)-1]
		domainItems[domain] = items[:len(items)-1]
		heap.Push(h, rerankItem{top.scoreVec[0], top.url, domainMul[domain], top.scoreVec})
	}

	var result []candidate
	// pos is the rank of the item being popped. It must advance for skipped
	// items too, otherwise a page with from > 0 would never start.
	for pos := 0; h.Len() > 0 && pos < to; pos++ {
		item := heap.Pop(h).(rerankItem)
		if pos >= from {
			result = append(result, candidate{url: item.url, scoreVec: item.vec})
		}

		domain := netloc(item.url)
		domainMul[domain] /= 8
		if remaining := domainItems[domain]; len(remaining) > 0 {
			next := remaining[len(remaining)-1]
			domainItems[domain] = remaining[:len(remaining)-1]
			heap.Push(h, rerankItem{next.scoreVec[0], next.url, domainMul[domain], next.scoreVec})
		}
	}
	return result
}
// ---- misc ----
// readBodyLimited reads at most limit bytes from resp.Body and returns them
// as a string. Reading stops at the limit, at end of body, or on the first
// read error; whatever was read until then is returned. A non-positive limit
// yields "". The caller remains responsible for closing resp.Body.
func readBodyLimited(resp *http.Response, limit int64) string {
	if limit <= 0 {
		return ""
	}
	data := make([]byte, 0, limit)
	buf := make([]byte, 4096)
	for int64(len(data)) < limit {
		n, err := resp.Body.Read(buf)
		if n > 0 {
			// Trim the final chunk so the result never exceeds limit.
			if room := limit - int64(len(data)); int64(n) > room {
				n = int(room)
			}
			data = append(data, buf[:n]...)
		}
		if err != nil {
			break
		}
	}
	return string(data)
}
// truncate shortens s to at most n bytes without splitting a UTF-8 encoded
// character: when the cut would land inside a multi-byte sequence it is
// moved back to the start of that character. A non-positive n yields "".
func truncate(s string, n int) string {
	if n <= 0 {
		return ""
	}
	if len(s) <= n {
		return s
	}
	// A byte of the form 10xxxxxx is a UTF-8 continuation byte. A character
	// has at most three of them, so three steps back always suffice for
	// valid input.
	cut := n
	for steps := 0; steps < 3 && cut > 0 && s[cut]&0xC0 == 0x80; steps++ {
		cut--
	}
	return s[:cut]
}
// unescapeURL decodes percent-escapes in u for display. Input that cannot
// be decoded is returned unchanged.
func unescapeURL(u string) string {
	if decoded, err := url.PathUnescape(u); err == nil {
		return decoded
	}
	return u
}
// atoi parses the leading run of ASCII digits in s as a decimal number and
// ignores everything from the first non-digit on. Input without leading
// digits (including a leading sign) yields 0. Overflow is not detected.
func atoi(s string) int {
	value := 0
	for i := 0; i < len(s); i++ {
		ch := s[i]
		if ch < '0' || ch > '9' {
			break
		}
		value = value*10 + int(ch-'0')
	}
	return value
}
// max returns the larger of a and b.
func max(a, b int) int {
	if b > a {
		return b
	}
	return a
}
// min returns the smaller of a and b.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}
+300
View File
@@ -0,0 +1,300 @@
// Package storage provides the persistent index and site-info storage backed by bbolt.
//
// Index space → a single bbolt bucket "index" where key = keyword (string),
// value = brotli-compressed JSON array of IndexEntry objects ({"w": weight, "u": url}).
//
// Gate (门) → a bbolt bucket "gate" where key = URL (string),
// value = brotli-compressed JSON object of SnippetEntry (title, desc, text, ts).
//
// SiteGate (网站之门) → a bbolt bucket "site_gate" where key = hostname (string),
// value = brotli-compressed JSON of SiteInfo struct.
//
// The Python version used a custom hash-bucket scheme; here bbolt handles it natively.
package storage
import (
"encoding/json"
"fmt"
"io"
"os"
"path/filepath"
"github.com/andybalholm/brotli"
bolt "go.etcd.io/bbolt"
)
// IndexEntry is a single entry in the inverted index: one URL and its weight
// for the keyword it is stored under. It is serialised as {"w": ..., "u": ...}.
type IndexEntry struct {
Weight float32 `json:"w"`
URL string `json:"u"`
}
// SnippetEntry is cached snippet data for a URL.
type SnippetEntry struct {
Title string `json:"title"`
Description string `json:"desc"`
Text string `json:"text"`
// Timestamp is when the snippet was produced, in Unix seconds.
Timestamp int64 `json:"ts"`
}
// Names of the bbolt buckets; Open creates all three if they are missing.
var (
// bucketIndex holds keyword → index entries.
bucketIndex = []byte("index")
// bucketGate holds URL → cached snippet.
bucketGate = []byte("gate")
// bucketSiteGate holds hostname → site metadata.
bucketSiteGate = []byte("site_gate")
)
// DB wraps a bbolt database and exposes typed access methods for the index,
// snippet and site-metadata buckets.
// bbolt handles its own locking internally.
type DB struct {
db *bolt.DB
}
// Open creates or opens the bbolt database file "sese.db" inside dir,
// creating dir and the three buckets if they do not exist yet.
//
// On failure no database handle is left open.
func Open(dir string) (*DB, error) {
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return nil, fmt.Errorf("storage.Open mkdir: %w", err)
	}
	path := filepath.Join(dir, "sese.db")
	db, err := bolt.Open(path, 0o600, nil)
	if err != nil {
		return nil, fmt.Errorf("storage.Open bolt: %w", err)
	}

	// Ensure buckets exist
	err = db.Update(func(tx *bolt.Tx) error {
		for _, name := range [][]byte{bucketIndex, bucketGate, bucketSiteGate} {
			if _, err := tx.CreateBucketIfNotExists(name); err != nil {
				return err
			}
		}
		return nil
	})
	if err != nil {
		// Release the handle and its file lock. The bucket error is the one
		// worth reporting, so a secondary Close error is dropped.
		_ = db.Close()
		return nil, fmt.Errorf("storage.Open create buckets: %w", err)
	}
	return &DB{db: db}, nil
}
// Close closes the underlying bbolt database and returns the error, if any,
// reported by bbolt.
func (d *DB) Close() error {
return d.db.Close()
}
// ---- helpers ----
func compress(data []byte) ([]byte, error) {
buf := make([]byte, 0, len(data))
w := brotli.NewWriterLevel((*appendWriter)(&buf), 6)
if _, err := w.Write(data); err != nil {
return nil, err
}
if err := w.Close(); err != nil {
return nil, err
}
return buf, nil
}
func decompress(data []byte) ([]byte, error) {
r := brotli.NewReader(
(*byteReader)(&data),
)
out := make([]byte, 0, len(data)*3)
tmp := make([]byte, 4096)
for {
n, err := r.Read(tmp)
out = append(out, tmp[:n]...)
if err != nil {
if err == io.EOF {
break
}
return out, err
}
}
return out, nil
}
// appendWriter is a byte slice that implements io.Writer by appending every
// write to itself.
type appendWriter []byte

// Write appends p and never fails.
func (a *appendWriter) Write(p []byte) (int, error) {
	grown := append(*a, p...)
	*a = grown
	return len(p), nil
}
// byteReader is a byte slice that implements io.Reader by consuming itself
// from the front.
type byteReader []byte

// Read copies as much as fits into p and drops it from the reader. Once the
// reader is empty it returns io.EOF.
func (b *byteReader) Read(p []byte) (int, error) {
	rest := *b
	if len(rest) == 0 {
		return 0, io.EOF
	}
	copied := copy(p, rest)
	*b = rest[copied:]
	return copied, nil
}
// marshalCompress encodes v as JSON and compresses the result.
func marshalCompress(v any) ([]byte, error) {
	encoded, err := json.Marshal(v)
	if err == nil {
		return compress(encoded)
	}
	return nil, err
}
// decompressUnmarshal decompresses data and decodes the resulting JSON into v.
func decompressUnmarshal(data []byte, v any) error {
	plain, err := decompress(data)
	if err == nil {
		err = json.Unmarshal(plain, v)
	}
	return err
}
// ---- Index (inverted index) ----
// GetIndex returns the index entries stored for keyword. A keyword with no
// stored value yields a nil slice and a nil error.
func (d *DB) GetIndex(keyword string) ([]IndexEntry, error) {
	var result []IndexEntry
	read := func(tx *bolt.Tx) error {
		raw := tx.Bucket(bucketIndex).Get([]byte(keyword))
		if raw != nil {
			return decompressUnmarshal(raw, &result)
		}
		return nil
	}
	err := d.db.View(read)
	return result, err
}
// SetIndex replaces the index entries stored for keyword.
func (d *DB) SetIndex(keyword string, entries []IndexEntry) error {
	payload, err := marshalCompress(entries)
	if err != nil {
		return err
	}
	write := func(tx *bolt.Tx) error {
		bucket := tx.Bucket(bucketIndex)
		return bucket.Put([]byte(keyword), payload)
	}
	return d.db.Update(write)
}
// BatchSetIndex stores every keyword→entries pair of batch in a single
// transaction. The first encoding or write error aborts the batch; the
// error is returned from inside the Update callback.
func (d *DB) BatchSetIndex(batch map[string][]IndexEntry) error {
	writeAll := func(tx *bolt.Tx) error {
		bucket := tx.Bucket(bucketIndex)
		for keyword, entries := range batch {
			payload, err := marshalCompress(entries)
			if err != nil {
				return err
			}
			err = bucket.Put([]byte(keyword), payload)
			if err != nil {
				return err
			}
		}
		return nil
	}
	return d.db.Update(writeAll)
}
// ForEachIndex calls fn with every keyword and its decoded entries. Values
// that cannot be decoded are skipped silently. A non-nil error from fn stops
// the iteration and is returned.
func (d *DB) ForEachIndex(fn func(keyword string, entries []IndexEntry) error) error {
	visit := func(k, v []byte) error {
		var decoded []IndexEntry
		if decompressUnmarshal(v, &decoded) != nil {
			// skip corrupted entries
			return nil
		}
		return fn(string(k), decoded)
	}
	return d.db.View(func(tx *bolt.Tx) error {
		bucket := tx.Bucket(bucketIndex)
		return bucket.ForEach(visit)
	})
}
// ---- Gate (URL snippet cache) ----
// GetSnippet returns the cached snippet for url. It returns a nil entry and
// a non-nil error when nothing is cached ("not found") or the stored value
// cannot be decoded.
func (d *DB) GetSnippet(url string) (*SnippetEntry, error) {
	entry := new(SnippetEntry)
	lookup := func(tx *bolt.Tx) error {
		raw := tx.Bucket(bucketGate).Get([]byte(url))
		if raw == nil {
			return fmt.Errorf("not found")
		}
		return decompressUnmarshal(raw, entry)
	}
	if err := d.db.View(lookup); err != nil {
		return nil, err
	}
	return entry, nil
}
// SetSnippet stores entry as the cached snippet for url, replacing any
// previous value.
func (d *DB) SetSnippet(url string, entry *SnippetEntry) error {
	payload, err := marshalCompress(entry)
	if err != nil {
		return err
	}
	write := func(tx *bolt.Tx) error {
		bucket := tx.Bucket(bucketGate)
		return bucket.Put([]byte(url), payload)
	}
	return d.db.Update(write)
}
// ---- SiteGate (site metadata) ----
// SiteInfo is the metadata stored per hostname. It mirrors the Python
// version's 网站 ("website") dataclass.
//
// Fields tagged omitempty are left out of the stored JSON when unset; the
// pointer-typed fields distinguish "unset" from a zero value.
type SiteInfo struct {
VisitCount int `json:"visit_count"`
// LastVisitTime is compared against Unix-second timestamps by the search
// ranking; 0 means no visit time is recorded.
LastVisitTime int64 `json:"last_visit_time"`
Fingerprint any `json:"fingerprint,omitempty"`
SuccessRate *float64 `json:"success_rate,omitempty"`
HTMLStructure string `json:"html_structure,omitempty"`
IPs []string `json:"ips,omitempty"`
Quality *float64 `json:"quality,omitempty"`
HTTPSAvailable *bool `json:"https_available,omitempty"`
Keywords []string `json:"keywords,omitempty"`
OutLinks []string `json:"out_links,omitempty"`
// Languages maps a language code (the ranking reads "zh", "en", "ja") to
// a weight; the ranking uses each weight's share of the total.
Languages map[string]float64 `json:"languages,omitempty"`
Redirects map[string]string `json:"redirects,omitempty"`
ServerTypes []string `json:"server_types,omitempty"`
}
// GetSiteInfo returns the metadata stored for host.
//
// It never fails: when nothing is stored, or the stored value cannot be
// decoded, an empty SiteInfo is returned instead and the error is always
// nil. The Languages and Redirects maps of the result are always non-nil.
func (d *DB) GetSiteInfo(host string) (*SiteInfo, error) {
	info := &SiteInfo{}
	lookup := func(tx *bolt.Tx) error {
		raw := tx.Bucket(bucketSiteGate).Get([]byte(host))
		if raw == nil {
			return fmt.Errorf("not found")
		}
		return decompressUnmarshal(raw, info)
	}
	if d.db.View(lookup) != nil {
		// Start over so a partially decoded value is not handed out.
		info = &SiteInfo{}
	}
	if info.Languages == nil {
		info.Languages = make(map[string]float64)
	}
	if info.Redirects == nil {
		info.Redirects = make(map[string]string)
	}
	return info, nil
}
// SetSiteInfo stores info as the metadata for host, replacing any previous
// value.
func (d *DB) SetSiteInfo(host string, info *SiteInfo) error {
	payload, err := marshalCompress(info)
	if err != nil {
		return err
	}
	write := func(tx *bolt.Tx) error {
		bucket := tx.Bucket(bucketSiteGate)
		return bucket.Put([]byte(host), payload)
	}
	return d.db.Update(write)
}
// ForEachSite calls fn with every stored hostname and its decoded metadata.
// Values that cannot be decoded are skipped silently. A non-nil error from
// fn stops the iteration and is returned.
//
// As with GetSiteInfo, the Languages and Redirects maps handed to fn are
// never nil, so fn may write to them directly.
func (d *DB) ForEachSite(fn func(host string, info *SiteInfo) error) error {
	return d.db.View(func(tx *bolt.Tx) error {
		return tx.Bucket(bucketSiteGate).ForEach(func(k, v []byte) error {
			var info SiteInfo
			if err := decompressUnmarshal(v, &info); err != nil {
				return nil // skip corrupted entries
			}
			if info.Languages == nil {
				info.Languages = make(map[string]float64)
			}
			if info.Redirects == nil {
				info.Redirects = make(map[string]string)
			}
			return fn(string(k), &info)
		})
	})
}