Signed-off-by: 吴文峰 <kevin@lmve.net>
This commit is contained in:
@@ -0,0 +1 @@
|
||||
savedata
|
||||
@@ -0,0 +1,89 @@
|
||||
# sese-engine Go 重构版
|
||||
|
||||
Python 原版的 Go 语言重构,使用标准英文命名,单二进制部署。
|
||||
|
||||
## 目录结构
|
||||
|
||||
```
|
||||
golang/
|
||||
├── main.go # 主入口,goroutine 启动所有模块
|
||||
├── go.mod
|
||||
├── config/
|
||||
│ └── config.go # 全局配置参数(对应 配置.py)
|
||||
├── storage/
|
||||
│ └── storage.go # bbolt 持久化层(对应 存储.py,替换 rimo_storage)
|
||||
├── crawler/
|
||||
│ ├── crawler.go # BFS 爬虫调度(对应 上网.py)
|
||||
│ └── fetcher.go # HTTP 获取 + robots.txt + 限流(对应 虫.py)
|
||||
├── parser/
|
||||
│ └── parser.go # HTML 解析(对应 文.py)
|
||||
├── analyzer/
|
||||
│ └── analyzer.go # 分词 + 关键词权重(对应 分析.py + utils.py 分词部分)
|
||||
│ 使用 gojieba(中文)+ gofasttext(语言检测)
|
||||
├── harvester/
|
||||
│ └── harvester.go # 索引写入服务,监听 :5000(对应 收获服务器.py)
|
||||
├── search/
|
||||
│ └── server.go # 搜索 API,监听 :80(对应 人服务器.py)
|
||||
├── backlink/
|
||||
│ └── backlink.go # 反向链接计算,每 48h 运行(对应 回.py)
|
||||
└── info/
|
||||
└── info.go # 繁荣表 / 调整表 / 屏蔽词加载(对应 信息.py)
|
||||
```
|
||||
|
||||
## 依赖项
|
||||
|
||||
| Go 包 | 替代 Python 包 | 用途 |
|
||||
|-------|--------------|------|
|
||||
| `github.com/yanyiwu/gojieba` | `jieba` | 中文分词 |
|
||||
| `github.com/nicholasgasior/gofasttext` | `fasttext` | 语言检测 |
|
||||
| `go.etcd.io/bbolt` | `rimo_storage` | KV 存储 / 倒排索引 |
|
||||
| `github.com/andybalholm/brotli` | `brotli` | 压缩 |
|
||||
| `golang.org/x/net/html` | `lxml` | HTML 解析 |
|
||||
| `golang.org/x/net/html/charset` | chardet | 编码检测 |
|
||||
|
||||
## 构建与运行
|
||||
|
||||
```bash
|
||||
cd golang
|
||||
|
||||
# 下载依赖(需要 CGo 编译器,用于 gojieba / gofasttext)
|
||||
go mod tidy
|
||||
|
||||
# 构建
|
||||
go build -o sese-engine .
|
||||
|
||||
# 运行(在 sese-engine 项目根目录下)
|
||||
cd ..
|
||||
./golang/sese-engine \
|
||||
--storage ./savedata \
|
||||
--entry https://zh.wikipedia.org/ \
|
||||
--fasttext ./lid.176.ftz \
|
||||
--stopwords ./data/标点符号.json
|
||||
```
|
||||
|
||||
一个进程启动所有模块:
|
||||
- `:5000` — 收获服务器(爬虫推送关键词)
|
||||
- `:80` — 搜索 API(`GET /search?q=关键词`)
|
||||
- 后台 goroutine — BFS 爬虫
|
||||
- 后台 goroutine — 每 48 小时反向链接计算
|
||||
|
||||
## 与 Python 版的主要差异
|
||||
|
||||
| 方面 | Python 版 | Go 版 |
|
||||
|------|---------|-------|
|
||||
| 并发 | GIL + 线程池(假并发) | goroutine 真并发 |
|
||||
| 存储 | rimo_storage(自研)| bbolt(嵌入式 KV) |
|
||||
| 部署 | 需要 Python 环境 | 单二进制,无运行时依赖 |
|
||||
| 命名 | 全中文 | 标准英文 |
|
||||
| 进程数 | 3~4 个进程 | 1 个进程多 goroutine |
|
||||
| 编码检测 | requests 自动检测 | `golang.org/x/net/html/charset` |
|
||||
| Prometheus | 可选 | 暂未集成(可后续添加) |
|
||||
|
||||
## 注意事项
|
||||
|
||||
1. **CGo 依赖**:gojieba 和 gofasttext 均需要 C/C++ 编译器(gcc/clang)。
|
||||
Windows 下建议使用 MinGW 或 WSL。
|
||||
2. **fasttext 模型**:`lid.176.ftz` 需要与 Python 版共用,路径通过 `--fasttext` 指定。
|
||||
3. **数据迁移**:存储格式(bbolt JSON)与 Python 版(rimo_storage 二进制)不兼容,
|
||||
需要全新爬取,或编写迁移脚本。
|
||||
4. **stop words 文件**:复用 Python 版的 `data/标点符号.json`。
|
||||
@@ -0,0 +1,250 @@
|
||||
// Package analyzer provides keyword extraction and language detection.
|
||||
//
|
||||
// Keyword extraction uses gojieba for Chinese segmentation and simple token
|
||||
// splitting for ASCII words. Language detection uses lingua-go (pure Go, no CGo).
|
||||
package analyzer
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"math"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"unicode"
|
||||
|
||||
"github.com/pemistahl/lingua-go"
|
||||
"github.com/yanyiwu/gojieba"
|
||||
)
|
||||
|
||||
// Keyword holds a (word, weight) pair.
|
||||
type Keyword struct {
|
||||
Word string `json:"word"`
|
||||
Weight float32 `json:"weight"`
|
||||
}
|
||||
|
||||
// Analyzer wraps jieba and lingua into a thread-safe analysis pipeline.
|
||||
type Analyzer struct {
|
||||
jieba *gojieba.Jieba
|
||||
detector lingua.LanguageDetector
|
||||
stopWords map[string]bool
|
||||
mu sync.Mutex // gojieba is not goroutine-safe
|
||||
}
|
||||
|
||||
// New creates an Analyzer.
|
||||
// stopWordsPath is the JSON file with punctuation/stop words (may be empty string).
|
||||
// modelPath is ignored (kept for API compatibility; lingua-go uses built-in data).
|
||||
func New(modelPath, stopWordsPath string) (*Analyzer, error) {
|
||||
j := gojieba.NewJieba()
|
||||
|
||||
// Build a lingua detector that covers the languages we care about.
|
||||
// AllLanguages() covers 75 languages including Chinese, Japanese, Korean, etc.
|
||||
detector := lingua.NewLanguageDetectorBuilder().
|
||||
FromAllLanguages().
|
||||
WithMinimumRelativeDistance(0.15).
|
||||
Build()
|
||||
|
||||
stopWords := loadStopWords(stopWordsPath)
|
||||
|
||||
return &Analyzer{
|
||||
jieba: j,
|
||||
detector: detector,
|
||||
stopWords: stopWords,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Close releases resources held by the analyzer.
|
||||
func (a *Analyzer) Close() {
|
||||
a.jieba.Free()
|
||||
}
|
||||
|
||||
// loadStopWords reads a JSON array of stop-word strings.
|
||||
func loadStopWords(path string) map[string]bool {
|
||||
if path == "" {
|
||||
return map[string]bool{}
|
||||
}
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return map[string]bool{}
|
||||
}
|
||||
defer f.Close()
|
||||
var words []string
|
||||
if err := json.NewDecoder(f).Decode(&words); err != nil {
|
||||
return map[string]bool{}
|
||||
}
|
||||
m := make(map[string]bool, len(words))
|
||||
for _, w := range words {
|
||||
m[strings.ToLower(w)] = true
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// Tokenize segments a string into tokens using jieba for CJK and space-split for ASCII.
|
||||
func (a *Analyzer) Tokenize(s string, searchMode bool) []string {
|
||||
if len(s) > 10000 {
|
||||
s = s[:10000]
|
||||
}
|
||||
// Sanitize: replace invalid UTF-8 sequences so gojieba (C++) never sees decode errors.
|
||||
s = strings.ToValidUTF8(s, "")
|
||||
var result []string
|
||||
for _, part := range strings.Fields(s) {
|
||||
if isASCIIAlnum(part) {
|
||||
result = append(result, part)
|
||||
} else {
|
||||
a.mu.Lock()
|
||||
var tokens []string
|
||||
if searchMode {
|
||||
tokens = a.jieba.CutForSearch(part, true)
|
||||
} else {
|
||||
tokens = a.jieba.Cut(part, true)
|
||||
}
|
||||
a.mu.Unlock()
|
||||
result = append(result, tokens...)
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// Normalize strips non-alphanumeric, non-CJK characters and lowercases.
|
||||
func Normalize(s string) string {
|
||||
var b strings.Builder
|
||||
for _, r := range s {
|
||||
if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || (r >= 0x4e00 && r <= 0x9fa5) {
|
||||
if r >= 'A' && r <= 'Z' {
|
||||
b.WriteRune(unicode.ToLower(r))
|
||||
} else {
|
||||
b.WriteRune(r)
|
||||
}
|
||||
}
|
||||
}
|
||||
return b.String()
|
||||
}
|
||||
|
||||
// weightedTokens builds a map of token→weight from a text with an optional weight multiplier.
|
||||
func (a *Analyzer) weightedTokens(text string, w float32) map[string]float32 {
|
||||
tokens := a.Tokenize(text, false)
|
||||
d := make(map[string]float32)
|
||||
n := math.Max(8, float64(len(tokens)))
|
||||
counts := make(map[string]int)
|
||||
for _, t := range tokens {
|
||||
t = Normalize(t)
|
||||
if t == "" || a.stopWords[t] || len(t) > 32 {
|
||||
continue
|
||||
}
|
||||
counts[t]++
|
||||
}
|
||||
for k, v := range counts {
|
||||
d[k] = float32(math.Min(0.2, float64(v)/n)) * w
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
// Analyze extracts weighted keywords from title, description, and body text.
|
||||
// Returns a slice sorted by weight descending.
|
||||
func (a *Analyzer) Analyze(title, description, text string) []Keyword {
|
||||
maps := []map[string]float32{
|
||||
a.weightedTokens(title, 1.0),
|
||||
a.weightedTokens(description, 0.5),
|
||||
a.weightedTokens(text, 1.0),
|
||||
}
|
||||
|
||||
combined := make(map[string]float32)
|
||||
for _, m := range maps {
|
||||
for k := range m {
|
||||
combined[k] = 0
|
||||
}
|
||||
}
|
||||
for k := range combined {
|
||||
for _, m := range maps {
|
||||
combined[k] += m[k]
|
||||
}
|
||||
}
|
||||
|
||||
result := make([]Keyword, 0, len(combined))
|
||||
for k, v := range combined {
|
||||
result = append(result, Keyword{Word: k, Weight: v})
|
||||
}
|
||||
|
||||
sortKeywords(result)
|
||||
return result
|
||||
}
|
||||
|
||||
// Segment returns search-mode tokens for a query string.
|
||||
func (a *Analyzer) Segment(query string, searchMode bool) []string {
|
||||
tokens := a.Tokenize(query, searchMode)
|
||||
var result []string
|
||||
for _, t := range tokens {
|
||||
t = Normalize(t)
|
||||
if t == "" || a.stopWords[t] || len(t) > 32 {
|
||||
continue
|
||||
}
|
||||
result = append(result, t)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// linguaToISO639 maps lingua.Language to the ISO 639-1 code used by the rest of the engine.
|
||||
// Returns "" for unknown or unsupported languages.
|
||||
var linguaToISO639 = map[lingua.Language]string{
|
||||
lingua.Chinese: "zh",
|
||||
lingua.English: "en",
|
||||
lingua.Japanese: "ja",
|
||||
lingua.Korean: "ko",
|
||||
lingua.French: "fr",
|
||||
lingua.German: "de",
|
||||
lingua.Spanish: "es",
|
||||
lingua.Portuguese: "pt",
|
||||
lingua.Italian: "it",
|
||||
lingua.Russian: "ru",
|
||||
lingua.Arabic: "ar",
|
||||
lingua.Hindi: "hi",
|
||||
lingua.Dutch: "nl",
|
||||
lingua.Polish: "pl",
|
||||
lingua.Swedish: "sv",
|
||||
lingua.Turkish: "tr",
|
||||
lingua.Vietnamese: "vi",
|
||||
lingua.Thai: "th",
|
||||
lingua.Indonesian: "id",
|
||||
lingua.Malay: "ms",
|
||||
}
|
||||
|
||||
// DetectLanguage returns the ISO 639-1 language code for the text, or "".
|
||||
func (a *Analyzer) DetectLanguage(text string) string {
|
||||
text = strings.ReplaceAll(text, "\n", " ")
|
||||
if len(text) > 2000 {
|
||||
text = text[:2000]
|
||||
}
|
||||
if text == "" {
|
||||
return ""
|
||||
}
|
||||
lang, exists := a.detector.DetectLanguageOf(text)
|
||||
if !exists {
|
||||
return ""
|
||||
}
|
||||
if code, ok := linguaToISO639[lang]; ok {
|
||||
return code
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// ---- sorting ----
|
||||
|
||||
func sortKeywords(kws []Keyword) {
|
||||
for i := 1; i < len(kws); i++ {
|
||||
key := kws[i]
|
||||
j := i - 1
|
||||
for j >= 0 && kws[j].Weight < key.Weight {
|
||||
kws[j+1] = kws[j]
|
||||
j--
|
||||
}
|
||||
kws[j+1] = key
|
||||
}
|
||||
}
|
||||
|
||||
func isASCIIAlnum(s string) bool {
|
||||
for _, r := range s {
|
||||
if !((r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9')) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return len(s) > 0
|
||||
}
|
||||
@@ -0,0 +1,533 @@
|
||||
// Package backlink computes backlink (prosperity) scores for all known domains,
|
||||
// using a PageRank-like algorithm over the site-level link graph.
|
||||
//
|
||||
// It runs every 48 hours and writes savedata/prosper.json.
|
||||
package backlink
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log"
|
||||
"math"
|
||||
"math/rand"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"sese-engine/storage"
|
||||
)
|
||||
|
||||
// Runner runs the backlink calculation loop.
|
||||
type Runner struct {
|
||||
db *storage.DB
|
||||
storagePath string
|
||||
}
|
||||
|
||||
// New creates a Runner.
|
||||
func New(db *storage.DB, storagePath string) *Runner {
|
||||
return &Runner{db: db, storagePath: storagePath}
|
||||
}
|
||||
|
||||
// Run loops forever, recalculating every 48 hours.
|
||||
func (r *Runner) Run() {
|
||||
for {
|
||||
// Sleep until next scheduled run (aligned to 2am)
|
||||
now := time.Now()
|
||||
target := time.Date(now.Year(), now.Month(), now.Day(), 2, 0, 0, 0, now.Location())
|
||||
if !target.After(now) {
|
||||
target = target.Add(48 * time.Hour)
|
||||
}
|
||||
sleep := target.Sub(now)
|
||||
log.Printf("[backlink] next run at %v (in %v)", target.Format(time.RFC3339), sleep.Round(time.Minute))
|
||||
time.Sleep(sleep)
|
||||
|
||||
log.Printf("[backlink] starting computation at %v", time.Now().Format(time.RFC3339))
|
||||
if err := r.compute(); err != nil {
|
||||
log.Printf("[backlink] error: %v", err)
|
||||
} else {
|
||||
log.Printf("[backlink] done")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// RunNow runs one computation cycle immediately (for testing / manual trigger).
|
||||
func (r *Runner) RunNow() error {
|
||||
return r.compute()
|
||||
}
|
||||
|
||||
// ---- computation ----
|
||||
|
||||
type siteStats struct {
|
||||
subdomainCount map[string]int // superDomain → count
|
||||
templateCount map[string]int // htmlStructure → count
|
||||
sameIPCount map[string]int // ipPrefix → count
|
||||
serverCount map[string]int // serverType → count
|
||||
}
|
||||
|
||||
func (r *Runner) compute() error {
|
||||
stats := r.collectStats()
|
||||
|
||||
// Phase 1: HTTPS sites
|
||||
d1 := r.aggregate(func(info *storage.SiteInfo) bool {
|
||||
return info.HTTPSAvailable != nil && *info.HTTPSAvailable
|
||||
}, stats, "https_backlink")
|
||||
|
||||
// Phase 1a: second pass (echo) using d1 scores
|
||||
d1a := r.aggregateWithScores(d1, stats, "echo")
|
||||
|
||||
// Phase 2: HTTP-only sites
|
||||
d2 := r.aggregate(func(info *storage.SiteInfo) bool {
|
||||
return info.HTTPSAvailable == nil || !*info.HTTPSAvailable
|
||||
}, stats, "http_backlink")
|
||||
|
||||
// Merge
|
||||
merged := make(map[string]float64)
|
||||
for k := range union(d1, d2, d1a) {
|
||||
v := d1[k] + d1a[k] + math.Min(d1[k]*0.5+d2[k]*0.1, d2[k])
|
||||
if v > 0.16 {
|
||||
merged[k] = v
|
||||
}
|
||||
}
|
||||
|
||||
// Save
|
||||
path := filepath.Join(r.storagePath, "prosper.json")
|
||||
if err := writeJSON(path, merged); err != nil {
|
||||
return err
|
||||
}
|
||||
log.Printf("[backlink] wrote %d entries to %s", len(merged), path)
|
||||
return nil
|
||||
}
|
||||
|
||||
// collectStats builds statistics about the site graph.
|
||||
func (r *Runner) collectStats() *siteStats {
|
||||
stats := &siteStats{
|
||||
subdomainCount: make(map[string]int),
|
||||
templateCount: make(map[string]int),
|
||||
sameIPCount: make(map[string]int),
|
||||
serverCount: make(map[string]int),
|
||||
}
|
||||
|
||||
_ = r.db.ForEachSite(func(host string, info *storage.SiteInfo) error {
|
||||
super := superDomain(host)
|
||||
stats.subdomainCount[super]++
|
||||
|
||||
if info.HTMLStructure != "" {
|
||||
stats.templateCount[info.HTMLStructure]++
|
||||
}
|
||||
if len(info.IPs) > 0 {
|
||||
ipStr := ipPrefix(info.IPs)
|
||||
stats.sameIPCount[ipStr]++
|
||||
}
|
||||
if len(info.ServerTypes) > 0 {
|
||||
s := strings.Join(sortedStrings(info.ServerTypes), ",")
|
||||
stats.serverCount[s]++
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
// Prune counts below threshold
|
||||
for k, v := range stats.subdomainCount {
|
||||
if v < 4 {
|
||||
delete(stats.subdomainCount, k)
|
||||
}
|
||||
}
|
||||
for k, v := range stats.templateCount {
|
||||
if v < 4 {
|
||||
delete(stats.templateCount, k)
|
||||
}
|
||||
}
|
||||
for k, v := range stats.sameIPCount {
|
||||
if v < 4 {
|
||||
delete(stats.sameIPCount, k)
|
||||
}
|
||||
}
|
||||
return stats
|
||||
}
|
||||
|
||||
// aggregate computes a backlink score map for sites matching the filter.
|
||||
func (r *Runner) aggregate(filter func(*storage.SiteInfo) bool, stats *siteStats, desc string) map[string]float64 {
|
||||
log.Printf("[backlink] aggregating: %s", desc)
|
||||
d := make(map[string]float64)
|
||||
ipSource := make(map[string]float64)
|
||||
|
||||
// Build server type index (top 63 most common)
|
||||
serverTable := buildServerTable(stats.serverCount)
|
||||
|
||||
type vectorEntry struct {
|
||||
domain string
|
||||
vec []float32
|
||||
}
|
||||
vectors := make(map[string][]float32)
|
||||
|
||||
pruneThreshold := 0.02
|
||||
i := 0
|
||||
|
||||
_ = r.db.ForEachSite(func(host string, info *storage.SiteInfo) error {
|
||||
if filter != nil && !filter(info) {
|
||||
return nil
|
||||
}
|
||||
mul := computeMul(host, info, stats)
|
||||
if mul == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
n := len(info.OutLinks)
|
||||
if n == 0 {
|
||||
return nil
|
||||
}
|
||||
|
||||
w := 1.0 / math.Max(float64(n), 50)
|
||||
xd := make(map[string]float64)
|
||||
for _, link := range info.OutLinks {
|
||||
for _, seg := range decomposeURL(link) {
|
||||
if _, exists := xd[seg]; !exists {
|
||||
xd[seg] = w
|
||||
} else {
|
||||
xd[seg] += w
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ipStr := ipPrefix(info.IPs)
|
||||
serverType := ""
|
||||
if len(info.ServerTypes) > 0 {
|
||||
serverType = info.ServerTypes[0]
|
||||
}
|
||||
serverID := serverTable[serverType]
|
||||
|
||||
for seg, segW := range xd {
|
||||
fw := math.Min(segW, 0.15) * mul
|
||||
prev := d[seg]
|
||||
d[seg] = prev + fw
|
||||
|
||||
if prev > 0.2 {
|
||||
if _, sameIP := stats.sameIPCount[ipStr]; ipStr != "" && sameIP {
|
||||
key := seg + "-" + ipStr
|
||||
if ipSource[key] > 0.4 {
|
||||
continue
|
||||
}
|
||||
ipSource[key] += fw
|
||||
}
|
||||
}
|
||||
|
||||
if prev > 0.21 && !strings.Contains(seg, "/") && serverType != "" {
|
||||
if vectors[seg] == nil {
|
||||
vectors[seg] = make([]float32, 64)
|
||||
}
|
||||
vectors[seg][serverID] += float32(fw)
|
||||
}
|
||||
}
|
||||
|
||||
i++
|
||||
if i%200000 == 0 {
|
||||
// Prune low-score entries
|
||||
for k, v := range d {
|
||||
if v < pruneThreshold {
|
||||
delete(d, k)
|
||||
}
|
||||
}
|
||||
pruneThreshold *= 1.1
|
||||
}
|
||||
if i%400000 == 0 {
|
||||
for k, v := range ipSource {
|
||||
if v < 0.04 {
|
||||
delete(ipSource, k)
|
||||
}
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
// Vectorised cosine filtering
|
||||
d = vectorFilter(d, vectors, desc)
|
||||
|
||||
// Prune
|
||||
for k, v := range d {
|
||||
if v <= 0.16 {
|
||||
delete(d, k)
|
||||
}
|
||||
}
|
||||
|
||||
log.Printf("[backlink] %s: %d entries", desc, len(d))
|
||||
return d
|
||||
}
|
||||
|
||||
// aggregateWithScores does a second pass weighted by existing scores.
|
||||
func (r *Runner) aggregateWithScores(scores map[string]float64, stats *siteStats, desc string) map[string]float64 {
|
||||
log.Printf("[backlink] aggregating with scores: %s", desc)
|
||||
d := make(map[string]float64)
|
||||
serverTable := buildServerTable(stats.serverCount)
|
||||
vectors := make(map[string][]float32)
|
||||
|
||||
_ = r.db.ForEachSite(func(host string, info *storage.SiteInfo) error {
|
||||
score, ok := scores[host]
|
||||
if !ok || strings.Contains(host, "/") {
|
||||
return nil
|
||||
}
|
||||
mul := computeMul(host, info, stats)
|
||||
if mul == 0 {
|
||||
return nil
|
||||
}
|
||||
trueMul := math.Min(2, mul*math.Log2(2+score))
|
||||
|
||||
n := len(info.OutLinks)
|
||||
if n == 0 {
|
||||
return nil
|
||||
}
|
||||
w := 1.0 / math.Max(float64(n), 50)
|
||||
xd := make(map[string]float64)
|
||||
for _, link := range info.OutLinks {
|
||||
for _, seg := range decomposeURL(link) {
|
||||
xd[seg] += w
|
||||
}
|
||||
}
|
||||
serverType := ""
|
||||
if len(info.ServerTypes) > 0 {
|
||||
serverType = info.ServerTypes[0]
|
||||
}
|
||||
serverID := serverTable[serverType]
|
||||
|
||||
for seg, segW := range xd {
|
||||
fw := math.Min(segW, 0.15) * trueMul
|
||||
d[seg] += fw
|
||||
if d[seg] > 0.21 && !strings.Contains(seg, "/") && serverType != "" {
|
||||
if vectors[seg] == nil {
|
||||
vectors[seg] = make([]float32, 64)
|
||||
}
|
||||
vectors[seg][serverID] += float32(fw)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
|
||||
d = vectorFilter(d, vectors, desc)
|
||||
for k, v := range d {
|
||||
if v <= 0.16 {
|
||||
delete(d, k)
|
||||
}
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
// ---- vector cosine filtering ----
|
||||
|
||||
func vectorFilter(d map[string]float64, vectors map[string][]float32, desc string) map[string]float64 {
|
||||
// Compute core vector (sum of all)
|
||||
core := make([]float64, 64)
|
||||
for _, vec := range vectors {
|
||||
for j, v := range vec {
|
||||
core[j] += float64(v)
|
||||
}
|
||||
}
|
||||
coreNorm := norm64(core)
|
||||
if coreNorm == 0 {
|
||||
return d
|
||||
}
|
||||
|
||||
newD := make(map[string]float64, len(d))
|
||||
for k, v := range d {
|
||||
baseK := strings.Split(k, "/")[0]
|
||||
if v > 0.21 && vectors[baseK] != nil {
|
||||
vec := vectors[baseK]
|
||||
vecNorm := float64(norm32(vec))
|
||||
if vecNorm == 0 {
|
||||
newD[k] = v
|
||||
continue
|
||||
}
|
||||
cos := dot32_64(vec, core) / (vecNorm * coreNorm)
|
||||
if cos > 1.01 {
|
||||
cos = 1.01
|
||||
}
|
||||
newV := math.Max(v*(0.25+cos*0.75), 0.21)
|
||||
newD[k] = newV
|
||||
} else {
|
||||
newD[k] = v
|
||||
}
|
||||
}
|
||||
|
||||
// Save cos map for diagnostics
|
||||
cosMap := make(map[string]float64)
|
||||
for k, vec := range vectors {
|
||||
vn := float64(norm32(vec))
|
||||
if vn > 0 {
|
||||
cosMap[k] = dot32_64(vec, core) / (vn * coreNorm)
|
||||
}
|
||||
}
|
||||
_ = writeJSON(desc+"_cos.json", cosMap)
|
||||
|
||||
return newD
|
||||
}
|
||||
|
||||
// ---- helpers ----
|
||||
|
||||
func computeMul(host string, info *storage.SiteInfo, stats *siteStats) float64 {
|
||||
if len(info.OutLinks) == 0 {
|
||||
return 0
|
||||
}
|
||||
t := info.LastVisitTime
|
||||
if t == 0 {
|
||||
t = 1640000000
|
||||
}
|
||||
days := (time.Now().Unix() - t) / (3600 * 24)
|
||||
if days > 180 {
|
||||
return 0
|
||||
}
|
||||
timeMul := math.Pow(0.99, float64(days))
|
||||
|
||||
super := superDomain(host)
|
||||
subCount := max(stats.subdomainCount[super], 1)
|
||||
tplCount := 1
|
||||
if info.HTMLStructure != "" {
|
||||
tplCount = max(stats.templateCount[info.HTMLStructure], 1)
|
||||
}
|
||||
count := max(subCount, int(float64(tplCount)*1.5))
|
||||
if count > 1000 {
|
||||
if rand.Float64() > 1000.0/float64(count) {
|
||||
return 0
|
||||
}
|
||||
count = 1000
|
||||
}
|
||||
domainMul := 1.0 / math.Pow(math.Max(float64(count), 5)/5, 0.6)
|
||||
return timeMul * domainMul
|
||||
}
|
||||
|
||||
func superDomain(host string) string {
|
||||
parts := strings.Split(host, ".")
|
||||
if len(parts) >= 2 {
|
||||
return strings.Join(parts[len(parts)-2:], ".")
|
||||
}
|
||||
return host
|
||||
}
|
||||
|
||||
func ipPrefix(ips []string) string {
|
||||
if len(ips) == 0 {
|
||||
return ""
|
||||
}
|
||||
sorted := sortedStrings(ips)
|
||||
parts := make([]string, len(sorted))
|
||||
for i, ip := range sorted {
|
||||
idx := strings.LastIndex(ip, ".")
|
||||
if idx > 0 {
|
||||
parts[i] = ip[:idx]
|
||||
} else {
|
||||
parts[i] = ip
|
||||
}
|
||||
}
|
||||
return strings.Join(parts, ",")
|
||||
}
|
||||
|
||||
func decomposeURL(rawURL string) []string {
|
||||
u := strings.ToLower(rawURL)
|
||||
if strings.HasPrefix(u, "https://") {
|
||||
u = u[8:]
|
||||
} else if strings.HasPrefix(u, "http://") {
|
||||
u = u[7:]
|
||||
} else {
|
||||
return nil
|
||||
}
|
||||
u = strings.ReplaceAll(u, "?", "/")
|
||||
u = strings.ReplaceAll(u, "#", "/")
|
||||
u = strings.TrimRight(u, "/")
|
||||
if u == "" || u[0] == '/' || u[0] == '%' {
|
||||
return nil
|
||||
}
|
||||
parts := strings.Split(u, "/")
|
||||
var out []string
|
||||
current := parts[0]
|
||||
out = append(out, current)
|
||||
for _, p := range parts[1:] {
|
||||
current = current + "/" + p
|
||||
out = append(out, current)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func buildServerTable(serverCount map[string]int) map[string]int {
|
||||
type kv struct {
|
||||
k string
|
||||
v int
|
||||
}
|
||||
var sorted []kv
|
||||
for k, v := range serverCount {
|
||||
sorted = append(sorted, kv{k, v})
|
||||
}
|
||||
for i := 0; i < len(sorted)-1; i++ {
|
||||
for j := i + 1; j < len(sorted); j++ {
|
||||
if sorted[j].v > sorted[i].v {
|
||||
sorted[i], sorted[j] = sorted[j], sorted[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
table := make(map[string]int, 63)
|
||||
limit := 63
|
||||
if len(sorted) < limit {
|
||||
limit = len(sorted)
|
||||
}
|
||||
for i := 0; i < limit; i++ {
|
||||
table[sorted[i].k] = i + 1
|
||||
}
|
||||
return table
|
||||
}
|
||||
|
||||
func sortedStrings(s []string) []string {
|
||||
cp := make([]string, len(s))
|
||||
copy(cp, s)
|
||||
for i := 0; i < len(cp)-1; i++ {
|
||||
for j := i + 1; j < len(cp); j++ {
|
||||
if cp[j] < cp[i] {
|
||||
cp[i], cp[j] = cp[j], cp[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
return cp
|
||||
}
|
||||
|
||||
func norm64(v []float64) float64 {
|
||||
s := 0.0
|
||||
for _, x := range v {
|
||||
s += x * x
|
||||
}
|
||||
return math.Sqrt(s)
|
||||
}
|
||||
|
||||
func norm32(v []float32) float32 {
|
||||
s := float32(0)
|
||||
for _, x := range v {
|
||||
s += x * x
|
||||
}
|
||||
return float32(math.Sqrt(float64(s)))
|
||||
}
|
||||
|
||||
func dot32_64(a []float32, b []float64) float64 {
|
||||
s := 0.0
|
||||
for i := range a {
|
||||
s += float64(a[i]) * b[i]
|
||||
}
|
||||
return s
|
||||
}
|
||||
|
||||
func union(maps ...map[string]float64) map[string]bool {
|
||||
out := make(map[string]bool)
|
||||
for _, m := range maps {
|
||||
for k := range m {
|
||||
out[k] = true
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func writeJSON(path string, data interface{}) error {
|
||||
_ = os.MkdirAll(filepath.Dir(path), 0o755)
|
||||
b, err := json.MarshalIndent(data, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return os.WriteFile(path, b, 0o644)
|
||||
}
|
||||
|
||||
func max(a, b int) int {
|
||||
if a > b {
|
||||
return a
|
||||
}
|
||||
return b
|
||||
}
|
||||
@@ -0,0 +1,53 @@
|
||||
// Package config holds all global configuration parameters for sese-engine.
|
||||
package config
|
||||
|
||||
// Index / storage limits
|
||||
const (
|
||||
MaxURLsPerKey = 11000 // max URLs stored per index key
|
||||
MaxSameDomainPerKey = 20 // max URLs from the same domain per key
|
||||
BigCleanThreshold = 10000000 // flush in-memory index after this many rows
|
||||
MaxNewURLsPerKey = 10000 // cap on new URLs added per key per flush
|
||||
MinURLsForNewKey = 3 // discard new keys with fewer than this many URLs
|
||||
)
|
||||
|
||||
// Crawler settings
|
||||
const (
|
||||
SpiderName = "loli_spider"
|
||||
CrawlerCooldown = 3 // seconds between requests to the same host
|
||||
CrawlerWorkers = 22 // goroutine pool size for crawling
|
||||
CrawlFocus = 0.7 // concentration factor — higher = more focused on single domain
|
||||
MaxKeywordsPerPage = 250
|
||||
MaxEpoch = 100
|
||||
ExpectedProsperRatio = 0.6 // fraction of queue that should be "prosperous" (high backlink) domains
|
||||
EntryURL = "https://zh.wikipedia.org/"
|
||||
)
|
||||
|
||||
// Search / ranking weights
|
||||
const (
|
||||
UseOnlineSnippet = true
|
||||
OnlineSnippetTimeout = 3 // seconds
|
||||
WeightDailyDecay = 0.996
|
||||
LanguageWeight = 0.5
|
||||
ConsecutiveKeyWeight = 1.3
|
||||
BacklinkWeight = 1.0
|
||||
SearchServerPort = 80
|
||||
)
|
||||
|
||||
// Backlink computation
|
||||
const (
|
||||
BacklinkBaseline = 200000 // normalization divisor for backlink scores
|
||||
)
|
||||
|
||||
// Storage path (relative to process working directory)
|
||||
const StoragePath = "./savedata"
|
||||
|
||||
// Prometheus ports
|
||||
const (
|
||||
PromPortCrawler = 14950
|
||||
PromPortHarvester = 14951
|
||||
PromPortBacklink = 14952
|
||||
PromPortSearch = 14953
|
||||
)
|
||||
|
||||
// Harvester HTTP endpoint
|
||||
const HarvesterAddr = "http://127.0.0.1:5000"
|
||||
@@ -0,0 +1,588 @@
|
||||
// crawler.go — BFS crawl loop, URL scheduling, and site-info updating.
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"log"
|
||||
"math"
|
||||
"math/rand"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"sese-engine/analyzer"
|
||||
"sese-engine/config"
|
||||
"sese-engine/parser"
|
||||
"sese-engine/storage"
|
||||
)
|
||||
|
||||
|
||||
// Stats holds real-time crawl counters (read with atomic).
|
||||
type Stats struct {
|
||||
VisitedURLs int64
|
||||
SuccessURLs int64
|
||||
KeywordsFetched int64
|
||||
}
|
||||
|
||||
// Crawler orchestrates the BFS crawl.
|
||||
type Crawler struct {
|
||||
fetcher *Fetcher
|
||||
db *storage.DB
|
||||
analyzer *analyzer.Analyzer
|
||||
prosperMap map[string]float64 // domain → backlink score (loaded from info)
|
||||
stats Stats
|
||||
}
|
||||
|
||||
// New creates a Crawler.
|
||||
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||
return &Crawler{
|
||||
fetcher: NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second),
|
||||
db: db,
|
||||
analyzer: a,
|
||||
prosperMap: prosperMap,
|
||||
}
|
||||
}
|
||||
|
||||
// URLWeight pairs a URL with its discovery weight.
|
||||
type URLWeight struct {
|
||||
URL string
|
||||
Weight float64
|
||||
}
|
||||
|
||||
// Run starts the BFS crawl from entryURL, running for maxEpoch rounds.
|
||||
// It blocks until completion.
|
||||
func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
||||
visited := make(map[string]bool)
|
||||
queue := []string{entryURL}
|
||||
|
||||
for ep := 0; ep < maxEpoch; ep++ {
|
||||
log.Printf("[crawler] epoch %d/%d queue=%d", ep+1, maxEpoch, len(queue))
|
||||
for _, u := range queue {
|
||||
visited[u] = true
|
||||
}
|
||||
|
||||
var (
|
||||
newLinks []URLWeight
|
||||
mu sync.Mutex
|
||||
wg sync.WaitGroup
|
||||
)
|
||||
|
||||
sem := make(chan struct{}, config.CrawlerWorkers)
|
||||
for _, u := range queue {
|
||||
wg.Add(1)
|
||||
sem <- struct{}{}
|
||||
go func(rawURL string) {
|
||||
defer wg.Done()
|
||||
defer func() { <-sem }()
|
||||
hrefs := c.visitURL(rawURL)
|
||||
n := len(hrefs)
|
||||
if n > 0 {
|
||||
w := 1.0 / float64(n)
|
||||
mu.Lock()
|
||||
for _, h := range hrefs {
|
||||
if !visited[h] {
|
||||
newLinks = append(newLinks, URLWeight{URL: h, Weight: w})
|
||||
}
|
||||
}
|
||||
mu.Unlock()
|
||||
}
|
||||
}(u)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
if len(newLinks) == 0 {
|
||||
log.Println("[crawler] empty queue — stopping")
|
||||
return
|
||||
}
|
||||
|
||||
queue = c.schedule(newLinks)
|
||||
}
|
||||
}
|
||||
|
||||
// visitURL fetches a URL, stores keywords, updates site info, returns discovered hrefs.
|
||||
func (c *Crawler) visitURL(rawURL string) []string {
|
||||
atomic.AddInt64(&c.stats.VisitedURLs, 1)
|
||||
|
||||
res, err := c.fetcher.fetchWithHistory(rawURL, true, 10*time.Second, 0)
|
||||
if err != nil || res == nil {
|
||||
c.updateSiteFailure(rawURL)
|
||||
return nil
|
||||
}
|
||||
|
||||
atomic.AddInt64(&c.stats.SuccessURLs, 1)
|
||||
|
||||
title, desc, text, hrefs := parser.ParseHTML(res.Body, res.FinalURL)
|
||||
|
||||
// Cache snippet
|
||||
if len(res.FinalURL) < 250 {
|
||||
_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
|
||||
Title: title,
|
||||
Description: truncate(desc, 256),
|
||||
Text: truncate(text, 256),
|
||||
Timestamp: time.Now().Unix(),
|
||||
})
|
||||
}
|
||||
|
||||
// Keyword extraction → send to harvester
|
||||
kws := c.analyzer.Analyze(title, desc, text)
|
||||
if len(kws) > 0 {
|
||||
if len(kws) > config.MaxKeywordsPerPage {
|
||||
kws = kws[:config.MaxKeywordsPerPage]
|
||||
}
|
||||
atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
|
||||
go c.sendToHarvester(res.FinalURL, kws)
|
||||
}
|
||||
|
||||
// Update site info
|
||||
host := netloc(res.FinalURL)
|
||||
c.updateSiteSuccess(host, res, title, desc, text, hrefs)
|
||||
|
||||
// Handle permanent redirects in site info
|
||||
for from, to := range res.Redirects {
|
||||
fromHost := netloc(from)
|
||||
if fromHost == "" {
|
||||
continue
|
||||
}
|
||||
info, _ := c.db.GetSiteInfo(fromHost)
|
||||
if info.Redirects == nil {
|
||||
info.Redirects = make(map[string]string)
|
||||
}
|
||||
info.Redirects[from] = to
|
||||
if len(info.Redirects) > 50 {
|
||||
// keep most important (just truncate randomly for now)
|
||||
info.Redirects = truncateMap(info.Redirects, 40)
|
||||
}
|
||||
_ = c.db.SetSiteInfo(fromHost, info)
|
||||
}
|
||||
|
||||
// Trim hrefs
|
||||
if len(hrefs) > 100 {
|
||||
hrefs = sampleStrings(hrefs, 100)
|
||||
}
|
||||
return hrefs
|
||||
}
|
||||
|
||||
func (c *Crawler) updateSiteFailure(rawURL string) {
|
||||
host := netloc(rawURL)
|
||||
if host == "" {
|
||||
return
|
||||
}
|
||||
info, _ := c.db.GetSiteInfo(host)
|
||||
if info.SuccessRate == nil {
|
||||
zero := 0.0
|
||||
info.SuccessRate = &zero
|
||||
}
|
||||
*info.SuccessRate *= 0.99
|
||||
_ = c.db.SetSiteInfo(host, info)
|
||||
}
|
||||
|
||||
func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc, text string, hrefs []string) {
|
||||
info, _ := c.db.GetSiteInfo(host)
|
||||
|
||||
info.VisitCount++
|
||||
info.LastVisitTime = time.Now().Unix()
|
||||
|
||||
one := 1.0
|
||||
if info.SuccessRate == nil {
|
||||
info.SuccessRate = &one
|
||||
}
|
||||
*info.SuccessRate = *info.SuccessRate*0.99 + 0.01
|
||||
|
||||
if strings.HasPrefix(res.FinalURL, "https://") {
|
||||
t := true
|
||||
info.HTTPSAvailable = &t
|
||||
}
|
||||
|
||||
if res.ServerType != "" {
|
||||
found := false
|
||||
for _, s := range info.ServerTypes {
|
||||
if s == res.ServerType {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
info.ServerTypes = append(info.ServerTypes, res.ServerType)
|
||||
if len(info.ServerTypes) > 5 {
|
||||
info.ServerTypes = info.ServerTypes[len(info.ServerTypes)-5:]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Language detection — sample 10% or first 10 visits
|
||||
if info.VisitCount < 10 || rand.Float64() < 0.1 {
|
||||
lang := c.analyzer.DetectLanguage(title + " " + desc + " " + text)
|
||||
if lang != "" {
|
||||
if info.Languages == nil {
|
||||
info.Languages = make(map[string]float64)
|
||||
}
|
||||
intensity := math.Min(0.2, 1/math.Sqrt(float64(info.VisitCount+1)))
|
||||
for k := range info.Languages {
|
||||
info.Languages[k] *= (1 - intensity)
|
||||
}
|
||||
info.Languages[lang] += intensity
|
||||
}
|
||||
// Collect external links
|
||||
superHost := superNetloc(res.FinalURL)
|
||||
var external []string
|
||||
for _, h := range hrefs {
|
||||
if superNetloc(h) != superHost {
|
||||
external = append(external, h)
|
||||
}
|
||||
}
|
||||
sampled := sampleStrings(external, 10)
|
||||
info.OutLinks = append(info.OutLinks, sampled...)
|
||||
if len(info.OutLinks) > 250 {
|
||||
info.OutLinks = sampleStrings(info.OutLinks, 200)
|
||||
}
|
||||
}
|
||||
|
||||
_ = c.db.SetSiteInfo(host, info)
|
||||
}
|
||||
|
||||
// sendToHarvester POSTs keyword data to the harvester service.
|
||||
func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
|
||||
type payload struct {
|
||||
URL string `json:"url"`
|
||||
Keywords []analyzer.Keyword `json:"keywords"`
|
||||
}
|
||||
p := payload{URL: finalURL, Keywords: kws}
|
||||
data, err := json.Marshal(p)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
resp, err := http.Post(config.HarvesterAddr+"/l", "application/json", bytes.NewReader(data))
|
||||
if err != nil {
|
||||
log.Printf("[crawler] harvester post failed: %v", err)
|
||||
return
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
// schedule selects and prioritises the next BFS queue from raw discovered links.
|
||||
func (c *Crawler) schedule(links []URLWeight) []string {
|
||||
if len(links) > 100000 {
|
||||
links = sampleURLWeights(links, 100000)
|
||||
}
|
||||
|
||||
// Pre-fetch site info for all involved domains
|
||||
domains := make(map[string]bool)
|
||||
for _, lw := range links {
|
||||
if h := netloc(lw.URL); h != "" {
|
||||
domains[h] = true
|
||||
}
|
||||
if h := superNetloc(lw.URL); h != "" {
|
||||
domains[h] = true
|
||||
}
|
||||
}
|
||||
siteCache := make(map[string]*storage.SiteInfo, len(domains))
|
||||
var mu sync.Mutex
|
||||
var wg sync.WaitGroup
|
||||
for d := range domains {
|
||||
wg.Add(1)
|
||||
go func(host string) {
|
||||
defer wg.Done()
|
||||
info, _ := c.db.GetSiteInfo(host)
|
||||
mu.Lock()
|
||||
siteCache[host] = info
|
||||
mu.Unlock()
|
||||
}(d)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
// Score each URL
|
||||
scored_list := make([]scoredURL, len(links))
|
||||
for i, lw := range links {
|
||||
scored_list[i] = scoredURL{url: lw.URL, score: c.scoreURL(lw, siteCache)}
|
||||
}
|
||||
|
||||
// Weighted random sample (45000 or 1/3+250 whichever smaller)
|
||||
k := min(45000, len(scored_list)/3+250)
|
||||
selected := weightedSample(scored_list, k)
|
||||
|
||||
// Domain concentration filtering
|
||||
selected = concentrationFilter(selected, config.CrawlFocus)
|
||||
|
||||
// Separate https/http, cap http at 1/4 of https count
|
||||
var httpsURLs, httpURLs []string
|
||||
for _, s := range selected {
|
||||
if strings.HasPrefix(s, "https://") {
|
||||
httpsURLs = append(httpsURLs, s)
|
||||
} else {
|
||||
httpURLs = append(httpURLs, s)
|
||||
}
|
||||
}
|
||||
maxHTTP := len(httpsURLs) / 4
|
||||
if len(httpURLs) > maxHTTP {
|
||||
httpURLs = sampleStrings(httpURLs, maxHTTP)
|
||||
}
|
||||
|
||||
// Separate prosperous / non-prosperous
|
||||
var prosperURLs, otherURLs []string
|
||||
for _, u := range append(httpsURLs, httpURLs...) {
|
||||
if c.prosperMap[netloc(u)] > 0 {
|
||||
prosperURLs = append(prosperURLs, u)
|
||||
} else {
|
||||
otherURLs = append(otherURLs, u)
|
||||
}
|
||||
}
|
||||
n := int(float64(len(prosperURLs)) * (1-config.ExpectedProsperRatio) / config.ExpectedProsperRatio)
|
||||
if len(otherURLs) > n {
|
||||
keep := max(len(otherURLs)-len(selected)/10, n)
|
||||
if keep < len(otherURLs) {
|
||||
otherURLs = sampleStrings(otherURLs, keep)
|
||||
}
|
||||
}
|
||||
|
||||
result := append(prosperURLs, otherURLs...)
|
||||
rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] })
|
||||
return result
|
||||
}
|
||||
|
||||
// scoreURL computes the scheduling priority for a URL.
|
||||
func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo) float64 {
|
||||
host := netloc(lw.URL)
|
||||
super := superNetloc(lw.URL)
|
||||
|
||||
info := siteCache[host]
|
||||
if info == nil {
|
||||
info = &storage.SiteInfo{}
|
||||
}
|
||||
|
||||
// Chinese-ness
|
||||
var chineseness float64 = 0.5
|
||||
if len(info.Languages) > 0 {
|
||||
total := 0.0
|
||||
for _, v := range info.Languages {
|
||||
total += v
|
||||
}
|
||||
if total > 0 {
|
||||
chineseness = info.Languages["zh"] / total
|
||||
}
|
||||
}
|
||||
|
||||
// Interest decay based on visit count
|
||||
prosper := math.Min(62, c.prosperMap[host])
|
||||
limit := prosper*500 + 50
|
||||
b := math.Pow(0.1, 1/limit)
|
||||
interest := math.Pow(b, float64(info.VisitCount))
|
||||
|
||||
var interest2 float64 = 1.0
|
||||
if super != host {
|
||||
superInfo := siteCache[super]
|
||||
if superInfo != nil {
|
||||
limit2 := math.Min(62, c.prosperMap[super])*500 + 50
|
||||
b2 := math.Pow(0.1, 1/limit2)
|
||||
interest2 = math.Pow(b2, float64(superInfo.VisitCount))
|
||||
}
|
||||
}
|
||||
|
||||
quality := 1.0
|
||||
if info.Quality != nil {
|
||||
quality = *info.Quality
|
||||
}
|
||||
|
||||
prosperity := prosper
|
||||
if prosperity > 0 {
|
||||
prosperity += 0.5
|
||||
}
|
||||
prosperity = math.Log2(2+prosperity) + 1
|
||||
|
||||
bad := badURL(lw.URL)
|
||||
return (0.1 + chineseness) * math.Min(0.05+interest, 0.05+interest2) * quality * (1 - bad) * lw.Weight * prosperity
|
||||
}
|
||||
|
||||
// ---- helper functions ----
|
||||
|
||||
func netloc(rawURL string) string {
|
||||
parts := strings.SplitN(rawURL, "/", 4)
|
||||
if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" {
|
||||
return parts[2]
|
||||
}
|
||||
u, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return ""
|
||||
}
|
||||
return u.Host
|
||||
}
|
||||
|
||||
// superNetloc returns "domain.tld" (strips subdomains).
|
||||
func superNetloc(rawURL string) string {
|
||||
host := netloc(rawURL)
|
||||
parts := strings.Split(host, ".")
|
||||
if len(parts) >= 2 {
|
||||
return strings.Join(parts[len(parts)-2:], ".")
|
||||
}
|
||||
return host
|
||||
}
|
||||
|
||||
func badURL(u string) float64 {
|
||||
s := math.Max(0, float64(len(u)-30)/200.0)
|
||||
if strings.Contains(u, ".htm") || strings.Contains(u, ".php") {
|
||||
s += (1 - s) * 0.3
|
||||
}
|
||||
if strings.Count(strings.TrimRight(u, "/"), "/") > 2 {
|
||||
s += (1 - s) * 0.1
|
||||
}
|
||||
if len(u) < 5 || u[4] == ':' {
|
||||
s += (1 - s) * 0.3
|
||||
}
|
||||
return math.Min(s, 0.9)
|
||||
}
|
||||
|
||||
func truncate(s string, n int) string {
|
||||
if len(s) <= n {
|
||||
return s
|
||||
}
|
||||
return s[:n]
|
||||
}
|
||||
|
||||
func sampleStrings(s []string, n int) []string {
|
||||
if len(s) <= n {
|
||||
return s
|
||||
}
|
||||
perm := rand.Perm(len(s))
|
||||
out := make([]string, n)
|
||||
for i := range out {
|
||||
out[i] = s[perm[i]]
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func sampleURLWeights(s []URLWeight, n int) []URLWeight {
|
||||
if len(s) <= n {
|
||||
return s
|
||||
}
|
||||
perm := rand.Perm(len(s))
|
||||
out := make([]URLWeight, n)
|
||||
for i := range out {
|
||||
out[i] = s[perm[i]]
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
type scoredURL struct {
|
||||
url string
|
||||
score float64
|
||||
}
|
||||
|
||||
func weightedSample(items []scoredURL, k int) []string {
|
||||
if k >= len(items) {
|
||||
out := make([]string, len(items))
|
||||
for i, s := range items {
|
||||
out[i] = s.url
|
||||
}
|
||||
return out
|
||||
}
|
||||
// Simple weighted sampling without replacement using alias method approximation
|
||||
totalWeight := 0.0
|
||||
for _, s := range items {
|
||||
totalWeight += s.score
|
||||
}
|
||||
selected := make(map[int]bool)
|
||||
out := make([]string, 0, k)
|
||||
for len(out) < k && len(selected) < len(items) {
|
||||
r := rand.Float64() * totalWeight
|
||||
cum := 0.0
|
||||
for i, s := range items {
|
||||
if selected[i] {
|
||||
continue
|
||||
}
|
||||
cum += s.score
|
||||
if cum >= r {
|
||||
selected[i] = true
|
||||
out = append(out, s.url)
|
||||
totalWeight -= s.score
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func concentrationFilter(urls []string, k float64) []string {
|
||||
domainGroups := make(map[string][]string)
|
||||
shuffled := make([]string, len(urls))
|
||||
copy(shuffled, urls)
|
||||
rand.Shuffle(len(shuffled), func(i, j int) { shuffled[i], shuffled[j] = shuffled[j], shuffled[i] })
|
||||
|
||||
for _, u := range shuffled {
|
||||
d := superNetloc(u)
|
||||
domainGroups[d] = append(domainGroups[d], u)
|
||||
}
|
||||
|
||||
limit := 10
|
||||
if len(domainGroups) > 1 {
|
||||
sizes := make([]int, 0, len(domainGroups))
|
||||
for _, g := range domainGroups {
|
||||
sizes = append(sizes, int(math.Pow(float64(len(g)), k)))
|
||||
}
|
||||
// sort sizes ascending, drop last (largest)
|
||||
for i := 0; i < len(sizes)-1; i++ {
|
||||
for j := i + 1; j < len(sizes)-1; j++ {
|
||||
if sizes[j] < sizes[i] {
|
||||
sizes[i], sizes[j] = sizes[j], sizes[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
total := 0
|
||||
for _, s := range sizes[:len(sizes)-1] {
|
||||
total += s
|
||||
}
|
||||
limit = max(10, int(float64(total)*0.6))
|
||||
}
|
||||
|
||||
var result []string
|
||||
for _, g := range domainGroups {
|
||||
sn := 1 + min(limit, int(math.Pow(float64(len(g)), k)))
|
||||
if sn > len(g) {
|
||||
sn = len(g)
|
||||
}
|
||||
result = append(result, g[:sn]...)
|
||||
}
|
||||
rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] })
|
||||
return result
|
||||
}
|
||||
|
||||
func truncateMap(m map[string]string, n int) map[string]string {
|
||||
if len(m) <= n {
|
||||
return m
|
||||
}
|
||||
out := make(map[string]string, n)
|
||||
i := 0
|
||||
for k, v := range m {
|
||||
if i >= n {
|
||||
break
|
||||
}
|
||||
out[k] = v
|
||||
i++
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func min(a, b int) int {
|
||||
if a < b {
|
||||
return a
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
func max(a, b int) int {
|
||||
if a > b {
|
||||
return a
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
// Expose Stats for monitoring.
|
||||
func (c *Crawler) GetStats() Stats {
|
||||
return Stats{
|
||||
VisitedURLs: atomic.LoadInt64(&c.stats.VisitedURLs),
|
||||
SuccessURLs: atomic.LoadInt64(&c.stats.SuccessURLs),
|
||||
KeywordsFetched: atomic.LoadInt64(&c.stats.KeywordsFetched),
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,313 @@
|
||||
// Package crawler implements the HTTP fetching layer with robots.txt compliance,
|
||||
// per-host rate limiting, redirect tracking, and encoding detection.
|
||||
package crawler
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"golang.org/x/net/html/charset"
|
||||
)
|
||||
|
||||
// ErrCrawl is returned for expected crawl failures (404, disallowed, wrong content type…).
|
||||
type ErrCrawl struct {
|
||||
Msg string
|
||||
}
|
||||
|
||||
func (e *ErrCrawl) Error() string { return e.Msg }
|
||||
|
||||
// FetchResult bundles the result of a successful fetch.
|
||||
type FetchResult struct {
|
||||
Body string // decoded HTML body
|
||||
FinalURL string // URL after redirects
|
||||
Redirects map[string]string // permanent redirects: from → to
|
||||
ServerType string
|
||||
}
|
||||
|
||||
// Fetcher is a reusable HTTP client with robots.txt awareness and rate limiting.
|
||||
type Fetcher struct {
|
||||
client *http.Client
|
||||
userAgent string
|
||||
cooldown time.Duration
|
||||
|
||||
rateMu sync.Mutex
|
||||
lastHit map[string]time.Time // host → last request time
|
||||
|
||||
robotsMu sync.Mutex
|
||||
robots map[string]*robotsEntry // host → parsed robots
|
||||
}
|
||||
|
||||
type robotsEntry struct {
|
||||
rules []robotsRule
|
||||
fetchedAt time.Time
|
||||
}
|
||||
|
||||
type robotsRule struct {
|
||||
userAgent string
|
||||
disallow []string
|
||||
allow []string
|
||||
}
|
||||
|
||||
// NewFetcher creates a Fetcher with the given user-agent and per-host cooldown.
|
||||
func NewFetcher(userAgent string, cooldown time.Duration) *Fetcher {
|
||||
return &Fetcher{
|
||||
client: &http.Client{
|
||||
Timeout: 30 * time.Second,
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if len(via) >= 10 {
|
||||
return fmt.Errorf("too many redirects")
|
||||
}
|
||||
return nil
|
||||
},
|
||||
},
|
||||
userAgent: userAgent,
|
||||
cooldown: cooldown,
|
||||
lastHit: make(map[string]time.Time),
|
||||
robots: make(map[string]*robotsEntry),
|
||||
}
|
||||
}
|
||||
|
||||
// Fetch fetches url, respecting robots.txt and rate limits.
|
||||
// polite=false skips both checks (used by search server snippet fetcher).
|
||||
func (f *Fetcher) Fetch(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
|
||||
return f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
|
||||
}
|
||||
|
||||
// FetchSafe wraps Fetch and returns (nil, nil) on expected errors.
|
||||
func (f *Fetcher) FetchSafe(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
|
||||
res, err := f.fetchWithHistory(rawURL, polite, timeout, sizeLimit)
|
||||
if _, ok := err.(*ErrCrawl); ok {
|
||||
return nil, nil
|
||||
}
|
||||
return res, err
|
||||
}
|
||||
|
||||
// fetchWithHistory does the actual request and populates redirect history.
|
||||
func (f *Fetcher) fetchWithHistory(rawURL string, polite bool, timeout time.Duration, sizeLimit int) (*FetchResult, error) {
|
||||
parsed, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return nil, &ErrCrawl{Msg: "invalid url: " + err.Error()}
|
||||
}
|
||||
host := parsed.Host
|
||||
|
||||
if polite {
|
||||
f.rateLimit(host)
|
||||
if !f.robotsAllowed(rawURL, host) {
|
||||
return nil, &ErrCrawl{Msg: "disallowed by robots.txt"}
|
||||
}
|
||||
}
|
||||
|
||||
redirects := make(map[string]string)
|
||||
client := &http.Client{
|
||||
Timeout: timeout,
|
||||
CheckRedirect: func(req *http.Request, via []*http.Request) error {
|
||||
if len(via) >= 10 {
|
||||
return fmt.Errorf("too many redirects")
|
||||
}
|
||||
if req.Response != nil && (req.Response.StatusCode == 301 || req.Response.StatusCode == 308) {
|
||||
from := via[len(via)-1].URL.String()
|
||||
to := req.URL.String()
|
||||
redirects[from] = to
|
||||
}
|
||||
return nil
|
||||
},
|
||||
}
|
||||
|
||||
req, _ := http.NewRequest("GET", rawURL, nil)
|
||||
req.Header.Set("User-Agent", f.userAgent)
|
||||
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode == 404 {
|
||||
return nil, &ErrCrawl{Msg: "404 not found"}
|
||||
}
|
||||
if resp.StatusCode >= 400 {
|
||||
return nil, &ErrCrawl{Msg: fmt.Sprintf("HTTP %d", resp.StatusCode)}
|
||||
}
|
||||
ct := resp.Header.Get("Content-Type")
|
||||
if !strings.Contains(ct, "text/html") {
|
||||
return nil, &ErrCrawl{Msg: "not html: " + ct}
|
||||
}
|
||||
|
||||
body, err := decodeBody(resp.Body, ct, sizeLimit)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &FetchResult{
|
||||
Body: body,
|
||||
FinalURL: resp.Request.URL.String(),
|
||||
Redirects: redirects,
|
||||
ServerType: resp.Header.Get("Server"),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// rateLimit sleeps if the last request to host was too recent.
|
||||
func (f *Fetcher) rateLimit(host string) {
|
||||
f.rateMu.Lock()
|
||||
last, ok := f.lastHit[host]
|
||||
now := time.Now()
|
||||
f.lastHit[host] = now
|
||||
// Periodically prune the map
|
||||
if len(f.lastHit) > 10000 {
|
||||
cutoff := now.Add(-f.cooldown * 2)
|
||||
for k, v := range f.lastHit {
|
||||
if v.Before(cutoff) {
|
||||
delete(f.lastHit, k)
|
||||
}
|
||||
}
|
||||
}
|
||||
f.rateMu.Unlock()
|
||||
|
||||
if ok {
|
||||
elapsed := now.Sub(last)
|
||||
if elapsed < f.cooldown {
|
||||
time.Sleep(f.cooldown - elapsed)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// robotsAllowed returns true if rawURL is crawlable.
|
||||
func (f *Fetcher) robotsAllowed(rawURL, host string) bool {
|
||||
f.robotsMu.Lock()
|
||||
entry, ok := f.robots[host]
|
||||
f.robotsMu.Unlock()
|
||||
|
||||
if !ok || time.Since(entry.fetchedAt) > 24*time.Hour {
|
||||
entry = f.fetchRobots(host, rawURL)
|
||||
f.robotsMu.Lock()
|
||||
f.robots[host] = entry
|
||||
f.robotsMu.Unlock()
|
||||
}
|
||||
|
||||
parsed, err := url.Parse(rawURL)
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
path := parsed.Path
|
||||
if path == "" {
|
||||
path = "/"
|
||||
}
|
||||
|
||||
for _, rule := range entry.rules {
|
||||
if rule.userAgent != "*" && !strings.EqualFold(rule.userAgent, f.userAgent) {
|
||||
continue
|
||||
}
|
||||
// Check allow first (higher priority)
|
||||
for _, a := range rule.allow {
|
||||
if strings.HasPrefix(path, a) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
for _, dis := range rule.disallow {
|
||||
if dis != "" && strings.HasPrefix(path, dis) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
// fetchRobots downloads and parses robots.txt for a host.
|
||||
func (f *Fetcher) fetchRobots(host, exampleURL string) *robotsEntry {
|
||||
entry := &robotsEntry{fetchedAt: time.Now()}
|
||||
scheme := "https"
|
||||
if strings.HasPrefix(exampleURL, "http://") {
|
||||
scheme = "http"
|
||||
}
|
||||
robotsURL := fmt.Sprintf("%s://%s/robots.txt", scheme, host)
|
||||
|
||||
client := &http.Client{Timeout: 5 * time.Second}
|
||||
req, _ := http.NewRequest("GET", robotsURL, nil)
|
||||
req.Header.Set("User-Agent", f.userAgent)
|
||||
resp, err := client.Do(req)
|
||||
if err != nil || resp.StatusCode != 200 {
|
||||
return entry // allow all if robots.txt unavailable
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(io.LimitReader(resp.Body, 256*1024))
|
||||
if err != nil {
|
||||
return entry
|
||||
}
|
||||
entry.rules = parseRobots(string(body))
|
||||
return entry
|
||||
}
|
||||
|
||||
// parseRobots is a minimal robots.txt parser.
|
||||
func parseRobots(content string) []robotsRule {
|
||||
var rules []robotsRule
|
||||
var current *robotsRule
|
||||
for _, line := range strings.Split(content, "\n") {
|
||||
line = strings.TrimSpace(line)
|
||||
if idx := strings.Index(line, "#"); idx >= 0 {
|
||||
line = line[:idx]
|
||||
}
|
||||
if line == "" {
|
||||
if current != nil {
|
||||
rules = append(rules, *current)
|
||||
current = nil
|
||||
}
|
||||
continue
|
||||
}
|
||||
parts := strings.SplitN(line, ":", 2)
|
||||
if len(parts) != 2 {
|
||||
continue
|
||||
}
|
||||
key := strings.TrimSpace(strings.ToLower(parts[0]))
|
||||
val := strings.TrimSpace(parts[1])
|
||||
switch key {
|
||||
case "user-agent":
|
||||
if current == nil {
|
||||
current = &robotsRule{userAgent: val}
|
||||
} else {
|
||||
current.userAgent = val
|
||||
}
|
||||
case "disallow":
|
||||
if current != nil {
|
||||
current.disallow = append(current.disallow, val)
|
||||
}
|
||||
case "allow":
|
||||
if current != nil {
|
||||
current.allow = append(current.allow, val)
|
||||
}
|
||||
}
|
||||
}
|
||||
if current != nil {
|
||||
rules = append(rules, *current)
|
||||
}
|
||||
return rules
|
||||
}
|
||||
|
||||
// decodeBody reads at most sizeLimit bytes from r, auto-detecting charset.
|
||||
func decodeBody(r io.Reader, contentType string, sizeLimit int) (string, error) {
|
||||
var reader io.Reader = r
|
||||
if sizeLimit > 0 {
|
||||
reader = io.LimitReader(r, int64(sizeLimit))
|
||||
}
|
||||
|
||||
// Use golang.org/x/net/html/charset for auto-detection
|
||||
utf8Reader, err := charset.NewReader(reader, contentType)
|
||||
if err != nil {
|
||||
// Fall back to reading raw and hoping for UTF-8
|
||||
data, readErr := io.ReadAll(reader)
|
||||
if readErr != nil {
|
||||
return "", readErr
|
||||
}
|
||||
return string(data), nil
|
||||
}
|
||||
data, err := io.ReadAll(utf8Reader)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
return string(data), nil
|
||||
}
|
||||
@@ -0,0 +1,19 @@
|
||||
module sese-engine
|
||||
|
||||
go 1.21
|
||||
|
||||
require (
|
||||
github.com/andybalholm/brotli v1.1.0
|
||||
github.com/pemistahl/lingua-go v1.4.0
|
||||
github.com/yanyiwu/gojieba v1.4.4
|
||||
go.etcd.io/bbolt v1.3.9
|
||||
golang.org/x/net v0.23.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/shopspring/decimal v1.3.1 // indirect
|
||||
golang.org/x/exp v0.0.0-20221106115401-f9659909a136 // indirect
|
||||
golang.org/x/sys v0.18.0 // indirect
|
||||
golang.org/x/text v0.14.0 // indirect
|
||||
google.golang.org/protobuf v1.31.0 // indirect
|
||||
)
|
||||
@@ -0,0 +1,36 @@
|
||||
github.com/andybalholm/brotli v1.1.0 h1:eLKJA0d02Lf0mVpIDgYnqXcUn0GqVmEFny3VuID1U3M=
|
||||
github.com/andybalholm/brotli v1.1.0/go.mod h1:sms7XGricyQI9K10gOSf56VKKWS4oLer58Q+mhRPtnY=
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk=
|
||||
github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE=
|
||||
github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg=
|
||||
github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
|
||||
github.com/pemistahl/lingua-go v1.4.0 h1:ifYhthrlW7iO4icdubwlduYnmwU37V1sbNrwhKBR4rM=
|
||||
github.com/pemistahl/lingua-go v1.4.0/go.mod h1:ECuM1Hp/3hvyh7k8aWSqNCPlTxLemFZsRjocUf3KgME=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/shopspring/decimal v1.3.1 h1:2Usl1nmF/WZucqkFZhnfFYxxxu8LG21F6nPQBE5gKV8=
|
||||
github.com/shopspring/decimal v1.3.1/go.mod h1:DKyhrW/HYNuLGql+MJL6WCR6knT2jwCFRcu2hWCYk4o=
|
||||
github.com/stretchr/testify v1.8.4 h1:CcVxjf3Q8PM0mHUKJCdn+eZZtm5yQwehR5yeSVQQcUk=
|
||||
github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo=
|
||||
github.com/yanyiwu/gojieba v1.4.4 h1:Iukkf8WlIfqAKtsGZjUhGR1ArKa7DtLDNmW8bvUI8JI=
|
||||
github.com/yanyiwu/gojieba v1.4.4/go.mod h1:JUq4DddFVGdHXJHxxepxRmhrKlDpaBxR8O28v6fKYLY=
|
||||
go.etcd.io/bbolt v1.3.9 h1:8x7aARPEXiXbHmtUwAIv7eV2fQFHrLLavdiJ3uzJXoI=
|
||||
go.etcd.io/bbolt v1.3.9/go.mod h1:zaO32+Ti0PK1ivdPtgMESzuzL2VPoIG1PCQNvOdo/dE=
|
||||
golang.org/x/exp v0.0.0-20221106115401-f9659909a136 h1:Fq7F/w7MAa1KJ5bt2aJ62ihqp9HDcRuyILskkpIAurw=
|
||||
golang.org/x/exp v0.0.0-20221106115401-f9659909a136/go.mod h1:CxIveKay+FTh1D0yPZemJVgC/95VzuuOLq5Qi4xnoYc=
|
||||
golang.org/x/net v0.23.0 h1:7EYJ93RZ9vYSZAIb2x3lnuvqO5zneoD6IvWjuhfxjTs=
|
||||
golang.org/x/net v0.23.0/go.mod h1:JKghWKKOSdJwpW2GEx0Ja7fmaKnMsbu+MWVZTokSYmg=
|
||||
golang.org/x/sync v0.5.0 h1:60k92dhOjHxJkrqnwsfl8KuaHbn/5dl0lUPUklKo3qE=
|
||||
golang.org/x/sync v0.5.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk=
|
||||
golang.org/x/sys v0.18.0 h1:DBdB3niSjOA/O0blCZBqDefyWNYveAYMNF1Wum0DYQ4=
|
||||
golang.org/x/sys v0.18.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
|
||||
golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ=
|
||||
golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU=
|
||||
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
|
||||
google.golang.org/protobuf v1.31.0 h1:g0LDEJHgrBl9N9r17Ru3sqWhkIx2NB67okBHPwC7hs8=
|
||||
google.golang.org/protobuf v1.31.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
||||
@@ -0,0 +1,327 @@
|
||||
// Package harvester implements the index-writing server (port 5000).
|
||||
//
|
||||
// It receives (url, keywords) payloads from the crawler, accumulates them in
|
||||
// memory, then flushes to the persistent inverted index when the in-memory
|
||||
// row count exceeds the configured threshold.
|
||||
package harvester
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"log"
|
||||
"math/rand"
|
||||
"net/http"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
|
||||
"sese-engine/config"
|
||||
"sese-engine/info"
|
||||
"sese-engine/storage"
|
||||
)
|
||||
|
||||
// Server is the harvester HTTP server.
|
||||
type Server struct {
|
||||
db *storage.DB
|
||||
|
||||
// in-memory accumulator: keyword → [(weight, url)]
|
||||
mem map[string][]storage.IndexEntry
|
||||
memMu sync.Mutex
|
||||
|
||||
rowCount int64 // approximate total in-memory rows
|
||||
flushMu sync.Mutex // only one flush at a time
|
||||
|
||||
infoSvc *info.Service
|
||||
}
|
||||
|
||||
// New creates a harvester Server.
|
||||
func New(db *storage.DB, infoSvc *info.Service) *Server {
|
||||
return &Server{
|
||||
db: db,
|
||||
mem: make(map[string][]storage.IndexEntry),
|
||||
infoSvc: infoSvc,
|
||||
}
|
||||
}
|
||||
|
||||
// ingestPayload is the JSON body sent by the crawler.
|
||||
type ingestPayload struct {
|
||||
URL string `json:"url"`
|
||||
Keywords []struct {
|
||||
Word string `json:"word"`
|
||||
Weight float32 `json:"weight"`
|
||||
} `json:"keywords"`
|
||||
}
|
||||
|
||||
// Handler returns the http.Handler for the harvester.
|
||||
func (s *Server) Handler() http.Handler {
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/l", s.handleIngest)
|
||||
return mux
|
||||
}
|
||||
|
||||
func (s *Server) handleIngest(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method != http.MethodPost {
|
||||
http.Error(w, "method not allowed", http.StatusMethodNotAllowed)
|
||||
return
|
||||
}
|
||||
var payload ingestPayload
|
||||
if err := json.NewDecoder(r.Body).Decode(&payload); err != nil {
|
||||
http.Error(w, "bad json: "+err.Error(), http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
// Sanitise URL
|
||||
payload.URL = strings.ReplaceAll(payload.URL, "\n", "")
|
||||
if payload.URL == "" {
|
||||
http.Error(w, "empty url", http.StatusBadRequest)
|
||||
return
|
||||
}
|
||||
|
||||
s.memMu.Lock()
|
||||
for _, kw := range payload.Keywords {
|
||||
key := kw.Word
|
||||
entries := s.mem[key]
|
||||
|
||||
// Threshold-based early discard
|
||||
if len(entries) > 15 {
|
||||
low := s.lowThreshold(key)
|
||||
if float64(kw.Weight) < low {
|
||||
continue
|
||||
}
|
||||
}
|
||||
s.mem[key] = append(entries, storage.IndexEntry{
|
||||
Weight: kw.Weight,
|
||||
URL: payload.URL,
|
||||
})
|
||||
atomic.AddInt64(&s.rowCount, 1)
|
||||
}
|
||||
s.memMu.Unlock()
|
||||
|
||||
// Check if we should flush
|
||||
if atomic.LoadInt64(&s.rowCount) > int64(config.BigCleanThreshold) {
|
||||
go s.flush()
|
||||
}
|
||||
|
||||
w.Write([]byte("ok"))
|
||||
}
|
||||
|
||||
// lowThreshold returns the minimum weight needed to enter the index for key.
|
||||
func (s *Server) lowThreshold(key string) float64 {
|
||||
existing, _ := s.db.GetIndex(key)
|
||||
if len(existing) < config.MaxURLsPerKey {
|
||||
return -1
|
||||
}
|
||||
// Find the config.MaxURLsPerKey-th highest weight
|
||||
weights := make([]float64, len(existing))
|
||||
for i, e := range existing {
|
||||
weights[i] = float64(e.Weight)
|
||||
}
|
||||
// Partial sort: find threshold at position MaxURLsPerKey-1
|
||||
return nthLargest(weights, config.MaxURLsPerKey-1) * 0.05
|
||||
}
|
||||
|
||||
// flush merges the in-memory accumulator into the persistent index.
|
||||
func (s *Server) flush() {
|
||||
if !s.flushMu.TryLock() {
|
||||
return // another flush is running
|
||||
}
|
||||
defer s.flushMu.Unlock()
|
||||
|
||||
s.memMu.Lock()
|
||||
snapshot := s.mem
|
||||
s.mem = make(map[string][]storage.IndexEntry)
|
||||
atomic.StoreInt64(&s.rowCount, 0)
|
||||
s.memMu.Unlock()
|
||||
|
||||
log.Printf("[harvester] flushing %d keys", len(snapshot))
|
||||
|
||||
items := make([]struct {
|
||||
key string
|
||||
entries []storage.IndexEntry
|
||||
}, 0, len(snapshot))
|
||||
for k, v := range snapshot {
|
||||
items = append(items, struct {
|
||||
key string
|
||||
entries []storage.IndexEntry
|
||||
}{k, v})
|
||||
}
|
||||
rand.Shuffle(len(items), func(i, j int) { items[i], items[j] = items[j], items[i] })
|
||||
|
||||
// Parallel merge
|
||||
type result struct {
|
||||
key string
|
||||
entries []storage.IndexEntry
|
||||
}
|
||||
results := make(chan result, len(items))
|
||||
sem := make(chan struct{}, 8)
|
||||
|
||||
for _, item := range items {
|
||||
sem <- struct{}{}
|
||||
go func(k string, newEntries []storage.IndexEntry) {
|
||||
defer func() { <-sem }()
|
||||
merged := s.mergeKey(k, newEntries)
|
||||
results <- result{k, merged}
|
||||
}(item.key, item.entries)
|
||||
}
|
||||
|
||||
// Collect
|
||||
batch := make(map[string][]storage.IndexEntry, len(items))
|
||||
for range items {
|
||||
r := <-results
|
||||
batch[r.key] = r.entries
|
||||
}
|
||||
|
||||
if err := s.db.BatchSetIndex(batch); err != nil {
|
||||
log.Printf("[harvester] flush write error: %v", err)
|
||||
}
|
||||
log.Printf("[harvester] flush done, %d keys written", len(batch))
|
||||
}
|
||||
|
||||
// mergeKey merges new entries with existing index entries for a key.
|
||||
func (s *Server) mergeKey(key string, newEntries []storage.IndexEntry) []storage.IndexEntry {
|
||||
existing, _ := s.db.GetIndex(key)
|
||||
|
||||
// Discard new key if too few URLs
|
||||
if len(existing) == 0 && len(newEntries) < config.MinURLsForNewKey {
|
||||
return nil
|
||||
}
|
||||
|
||||
merged := dedup(append(newEntries, existing...))
|
||||
|
||||
// Occasional URL normalisation dedup
|
||||
if rand.Float64() < 0.02 {
|
||||
merged = dedupNormalised(merged)
|
||||
}
|
||||
|
||||
// Trim if over limit
|
||||
if float64(len(merged)) > float64(config.MaxURLsPerKey)*1.1 || rand.Float64() < 0.02 {
|
||||
merged = trim(merged, s.infoSvc, config.MaxURLsPerKey, config.MaxSameDomainPerKey)
|
||||
}
|
||||
|
||||
return merged
|
||||
}
|
||||
|
||||
// ---- helpers ----
|
||||
|
||||
func dedup(entries []storage.IndexEntry) []storage.IndexEntry {
|
||||
seen := make(map[string]bool, len(entries))
|
||||
out := make([]storage.IndexEntry, 0, len(entries))
|
||||
for _, e := range entries {
|
||||
if seen[e.URL] {
|
||||
continue
|
||||
}
|
||||
seen[e.URL] = true
|
||||
out = append(out, e)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func dedupNormalised(entries []storage.IndexEntry) []storage.IndexEntry {
|
||||
// Sort by URL length descending, then dedup by normalised URL (strip scheme, trailing slash)
|
||||
sorted := make([]storage.IndexEntry, len(entries))
|
||||
copy(sorted, entries)
|
||||
for i := 0; i < len(sorted)-1; i++ {
|
||||
for j := i + 1; j < len(sorted); j++ {
|
||||
if len(sorted[j].URL) > len(sorted[i].URL) {
|
||||
sorted[i], sorted[j] = sorted[j], sorted[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
seen := make(map[string]bool)
|
||||
out := make([]storage.IndexEntry, 0, len(sorted))
|
||||
for _, e := range sorted {
|
||||
k := normaliseURL(e.URL)
|
||||
if seen[k] {
|
||||
continue
|
||||
}
|
||||
seen[k] = true
|
||||
out = append(out, e)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func normaliseURL(u string) string {
|
||||
if strings.HasPrefix(u, "https://") {
|
||||
u = u[8:]
|
||||
} else if strings.HasPrefix(u, "http://") {
|
||||
u = u[7:]
|
||||
}
|
||||
return strings.TrimRight(u, "/")
|
||||
}
|
||||
|
||||
// trim reduces entries to at most limit, keeping at most sameDomainLimit per domain.
|
||||
func trim(entries []storage.IndexEntry, infoSvc *info.Service, limit, sameDomainLimit int) []storage.IndexEntry {
|
||||
// Sort by effective score: weight * (1 + backlink)
|
||||
scored := make([]storage.IndexEntry, len(entries))
|
||||
copy(scored, entries)
|
||||
for i := 0; i < len(scored)-1; i++ {
|
||||
for j := i + 1; j < len(scored); j++ {
|
||||
si := float64(scored[i].Weight) * (1 + infoSvc.Prosper(scored[i].URL))
|
||||
sj := float64(scored[j].Weight) * (1 + infoSvc.Prosper(scored[j].URL))
|
||||
if sj > si {
|
||||
scored[i], scored[j] = scored[j], scored[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Per-domain cap
|
||||
domainCount := make(map[string]int)
|
||||
out := make([]storage.IndexEntry, 0, limit)
|
||||
for _, e := range scored {
|
||||
host := netloc(e.URL)
|
||||
if host == "" {
|
||||
host = e.URL
|
||||
}
|
||||
host = strings.ToLower(host)
|
||||
// Allow homepage URLs regardless of limit
|
||||
isHome := isHomepage(e.URL)
|
||||
if !isHome && domainCount[host] >= sameDomainLimit {
|
||||
continue
|
||||
}
|
||||
domainCount[host]++
|
||||
out = append(out, e)
|
||||
if len(out) >= limit {
|
||||
break
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func isHomepage(u string) bool {
|
||||
u = strings.TrimPrefix(u, "https://")
|
||||
u = strings.TrimPrefix(u, "http://")
|
||||
return strings.Count(strings.TrimRight(u, "/"), "/") == 0
|
||||
}
|
||||
|
||||
func netloc(rawURL string) string {
|
||||
parts := strings.SplitN(rawURL, "/", 4)
|
||||
if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" {
|
||||
return parts[2]
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// nthLargest returns the n-th largest value in a slice (0-indexed).
|
||||
func nthLargest(values []float64, n int) float64 {
|
||||
if n >= len(values) {
|
||||
return 0
|
||||
}
|
||||
cp := make([]float64, len(values))
|
||||
copy(cp, values)
|
||||
// Partial sort descending
|
||||
for i := 0; i <= n; i++ {
|
||||
maxIdx := i
|
||||
for j := i + 1; j < len(cp); j++ {
|
||||
if cp[j] > cp[maxIdx] {
|
||||
maxIdx = j
|
||||
}
|
||||
}
|
||||
cp[i], cp[maxIdx] = cp[maxIdx], cp[i]
|
||||
}
|
||||
return cp[n]
|
||||
}
|
||||
|
||||
// ListenAndServe starts the harvester on the given address.
|
||||
func (s *Server) ListenAndServe(addr string) error {
|
||||
log.Printf("[harvester] listening on %s", addr)
|
||||
return http.ListenAndServe(addr, s.Handler())
|
||||
}
|
||||
+206
@@ -0,0 +1,206 @@
|
||||
// Package info loads and serves auxiliary data: backlink scores, adjustment
|
||||
// table, and blocked query words.
|
||||
package info
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
)
|
||||
|
||||
// Service loads the prosperity map, adjustment table, and blocked words.
|
||||
type Service struct {
|
||||
mu sync.RWMutex
|
||||
prosperMap map[string]float64 // normalised backlink scores
|
||||
adjustTable map[string]float64 // per-domain manual weight adjustments
|
||||
blockedWords map[string]bool
|
||||
storagePath string
|
||||
}
|
||||
|
||||
// New creates and loads the info service from storagePath.
|
||||
func New(storagePath string) *Service {
|
||||
s := &Service{storagePath: storagePath}
|
||||
s.Reload()
|
||||
return s
|
||||
}
|
||||
|
||||
// Reload re-reads all data files from disk.
|
||||
func (s *Service) Reload() {
|
||||
s.mu.Lock()
|
||||
defer s.mu.Unlock()
|
||||
s.prosperMap = loadProsperMap(s.storagePath)
|
||||
s.adjustTable = loadAdjustTable()
|
||||
s.blockedWords = loadBlockedWords()
|
||||
}
|
||||
|
||||
// Prosper returns the backlink score for a URL (sum of its path components).
|
||||
func (s *Service) Prosper(rawURL string) float64 {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
return prosperFor(rawURL, s.prosperMap)
|
||||
}
|
||||
|
||||
// ProsperMap returns the full prosperity map (read-only snapshot).
|
||||
func (s *Service) ProsperMap() map[string]float64 {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
out := make(map[string]float64, len(s.prosperMap))
|
||||
for k, v := range s.prosperMap {
|
||||
out[k] = v
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// Adjust returns the manual weight multiplier for a hostname (default 1.0).
|
||||
func (s *Service) Adjust(host string) float64 {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
if v, ok := s.adjustTable[host]; ok {
|
||||
return v
|
||||
}
|
||||
return 1.0
|
||||
}
|
||||
|
||||
// IsBlocked returns true if the word is in the blocked list.
|
||||
func (s *Service) IsBlocked(word string) bool {
|
||||
s.mu.RLock()
|
||||
defer s.mu.RUnlock()
|
||||
return s.blockedWords[word]
|
||||
}
|
||||
|
||||
// ---- loaders ----
|
||||
|
||||
const backlinkBaseline = 200000.0
|
||||
|
||||
func loadProsperMap(storagePath string) map[string]float64 {
|
||||
path := filepath.Join(storagePath, "prosper.json")
|
||||
f, err := os.Open(path)
|
||||
if err != nil {
|
||||
return map[string]float64{}
|
||||
}
|
||||
defer f.Close()
|
||||
var raw map[string]float64
|
||||
if err := json.NewDecoder(f).Decode(&raw); err != nil {
|
||||
return map[string]float64{}
|
||||
}
|
||||
return normalise(raw)
|
||||
}
|
||||
|
||||
func normalise(d map[string]float64) map[string]float64 {
|
||||
total := 0.0
|
||||
for k, v := range d {
|
||||
if !strings.Contains(k, "/") {
|
||||
total += v
|
||||
}
|
||||
}
|
||||
if total == 0 {
|
||||
return d
|
||||
}
|
||||
factor := backlinkBaseline / total
|
||||
out := make(map[string]float64, len(d))
|
||||
for k, v := range d {
|
||||
out[k] = v * factor
|
||||
}
|
||||
// Propagate max score up the domain tree
|
||||
for k, v := range out {
|
||||
now := k
|
||||
for {
|
||||
idx := strings.Index(now, ".")
|
||||
if idx < 0 {
|
||||
break
|
||||
}
|
||||
now = now[idx+1:]
|
||||
if cur, ok := out[now]; ok && cur < v {
|
||||
out[now] = v
|
||||
} else if !ok {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func loadAdjustTable() map[string]float64 {
|
||||
// Try loading from data/adjust.json — fallback if absent
|
||||
f, err := os.Open(filepath.Join("data", "adjust.json"))
|
||||
if err != nil {
|
||||
return map[string]float64{}
|
||||
}
|
||||
defer f.Close()
|
||||
var m map[string]float64
|
||||
json.NewDecoder(f).Decode(&m)
|
||||
return m
|
||||
}
|
||||
|
||||
func loadBlockedWords() map[string]bool {
|
||||
f, err := os.Open(filepath.Join("data", "blocked_words.json"))
|
||||
if err != nil {
|
||||
return map[string]bool{}
|
||||
}
|
||||
defer f.Close()
|
||||
var words []string
|
||||
json.NewDecoder(f).Decode(&words)
|
||||
m := make(map[string]bool, len(words))
|
||||
for _, w := range words {
|
||||
m[w] = true
|
||||
}
|
||||
return m
|
||||
}
|
||||
|
||||
// prosperFor computes the prosperity score for a URL by decomposing it.
|
||||
func prosperFor(rawURL string, pm map[string]float64) float64 {
|
||||
segments := decomposeURL(rawURL)
|
||||
s := 0.0
|
||||
for _, seg := range segments {
|
||||
t, ok := pm[seg]
|
||||
if !ok {
|
||||
t = 0
|
||||
}
|
||||
l := 0.0
|
||||
if t > 0 {
|
||||
l = math.Log2(2+t*2) - 1
|
||||
}
|
||||
if s == 0 {
|
||||
if l == 0 {
|
||||
return 0
|
||||
}
|
||||
s = l
|
||||
} else {
|
||||
s = l + math.Log((s-l)/2+1)
|
||||
}
|
||||
}
|
||||
if s > 0 {
|
||||
return 0.1 + s
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
||||
// decomposeURL yields "domain.tld", "domain.tld/path", "domain.tld/path/sub", ...
|
||||
func decomposeURL(rawURL string) []string {
|
||||
u := strings.ToLower(rawURL)
|
||||
if strings.HasPrefix(u, "https://") {
|
||||
u = u[8:]
|
||||
} else if strings.HasPrefix(u, "http://") {
|
||||
u = u[7:]
|
||||
} else {
|
||||
return nil
|
||||
}
|
||||
u = strings.ReplaceAll(u, "?", "/")
|
||||
u = strings.ReplaceAll(u, "#", "/")
|
||||
u = strings.TrimRight(u, "/")
|
||||
if u == "" || u[0] == '/' || u[0] == '%' || u[0] == ' ' {
|
||||
return nil
|
||||
}
|
||||
parts := strings.Split(u, "/")
|
||||
var out []string
|
||||
current := parts[0]
|
||||
out = append(out, current)
|
||||
for _, p := range parts[1:] {
|
||||
current = current + "/" + p
|
||||
out = append(out, current)
|
||||
}
|
||||
return out
|
||||
}
|
||||
@@ -0,0 +1,90 @@
|
||||
// sese-engine — Go rewrite
|
||||
//
|
||||
// All modules (harvester, search server, crawler, backlink calculator) are
|
||||
// launched as goroutines from this single binary. The binary blocks until
|
||||
// interrupted (Ctrl-C / SIGTERM).
|
||||
//
|
||||
// Usage:
|
||||
//
|
||||
// cd golang && go run . [--storage ./savedata] [--entry https://zh.wikipedia.org/]
|
||||
package main
|
||||
|
||||
import (
|
||||
"flag"
|
||||
"fmt"
|
||||
"log"
|
||||
"os"
|
||||
"os/signal"
|
||||
"syscall"
|
||||
|
||||
"sese-engine/analyzer"
|
||||
"sese-engine/backlink"
|
||||
"sese-engine/config"
|
||||
"sese-engine/crawler"
|
||||
"sese-engine/harvester"
|
||||
"sese-engine/info"
|
||||
"sese-engine/search"
|
||||
"sese-engine/storage"
|
||||
)
|
||||
|
||||
func main() {
|
||||
storageDir := flag.String("storage", config.StoragePath, "path to savedata directory")
|
||||
entryURL := flag.String("entry", config.EntryURL, "BFS crawl entry URL")
|
||||
stopWords := flag.String("stopwords", "../data/标点符号.json", "path to stop-words JSON")
|
||||
flag.Parse()
|
||||
|
||||
log.SetFlags(log.LstdFlags | log.Lshortfile)
|
||||
log.Printf("sese-engine starting storage=%s entry=%s", *storageDir, *entryURL)
|
||||
|
||||
// ---- 1. Storage ----
|
||||
db, err := storage.Open(*storageDir)
|
||||
if err != nil {
|
||||
log.Fatalf("failed to open storage: %v", err)
|
||||
}
|
||||
defer db.Close()
|
||||
|
||||
// ---- 2. Info service ----
|
||||
infoSvc := info.New(*storageDir)
|
||||
|
||||
// ---- 3. Analyzer ----
|
||||
// modelPath is unused (lingua-go uses built-in language models, no external file needed)
|
||||
anal, err := analyzer.New("", *stopWords)
|
||||
if err != nil {
|
||||
log.Fatalf("failed to init analyzer: %v", err)
|
||||
}
|
||||
defer anal.Close()
|
||||
|
||||
// ---- 4. Harvester (index write server on :5000) ----
|
||||
harvSrv := harvester.New(db, infoSvc)
|
||||
go func() {
|
||||
if err := harvSrv.ListenAndServe(":5000"); err != nil {
|
||||
log.Fatalf("[harvester] fatal: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
// ---- 5. Search server ----
|
||||
searchSrv := search.New(db, infoSvc, anal)
|
||||
go func() {
|
||||
addr := fmt.Sprintf(":%d", config.SearchServerPort)
|
||||
if err := searchSrv.ListenAndServe(addr); err != nil {
|
||||
log.Fatalf("[search] fatal: %v", err)
|
||||
}
|
||||
}()
|
||||
|
||||
// ---- 6. Backlink calculator (runs every 48 h) ----
|
||||
bl := backlink.New(db, *storageDir)
|
||||
go bl.Run()
|
||||
|
||||
// ---- 7. Crawler ----
|
||||
prosperMap := infoSvc.ProsperMap()
|
||||
crawl := crawler.New(db, anal, prosperMap)
|
||||
go crawl.Run(*entryURL, config.MaxEpoch)
|
||||
|
||||
log.Println("all modules started — press Ctrl-C to stop")
|
||||
|
||||
// ---- Graceful shutdown ----
|
||||
quit := make(chan os.Signal, 1)
|
||||
signal.Notify(quit, os.Interrupt, syscall.SIGTERM)
|
||||
<-quit
|
||||
log.Println("shutdown signal received, exiting...")
|
||||
}
|
||||
@@ -0,0 +1,153 @@
|
||||
// Package parser extracts title, description, text content, and links from HTML.
|
||||
package parser
|
||||
|
||||
import (
|
||||
"path"
|
||||
"regexp"
|
||||
"strings"
|
||||
|
||||
"golang.org/x/net/html"
|
||||
)
|
||||
|
||||
var wsRe = regexp.MustCompile(`\s+`)
|
||||
|
||||
// ParseHTML parses an HTML document and returns title, meta description, body text, and href list.
|
||||
func ParseHTML(body, baseURL string) (title, description, text string, hrefs []string) {
|
||||
// Determine base scheme+host
|
||||
base := baseFromURL(baseURL)
|
||||
basePath := pathFromURL(baseURL)
|
||||
|
||||
doc, err := html.Parse(strings.NewReader(body))
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
var textParts []string
|
||||
|
||||
var dfs func(n *html.Node)
|
||||
dfs = func(n *html.Node) {
|
||||
if n.Type == html.ElementNode {
|
||||
tag := strings.ToLower(n.Data)
|
||||
if tag == "script" || tag == "style" || tag == "svg" {
|
||||
return
|
||||
}
|
||||
if tag == "meta" {
|
||||
name := ""
|
||||
content := ""
|
||||
for _, a := range n.Attr {
|
||||
switch strings.ToLower(a.Key) {
|
||||
case "name":
|
||||
name = strings.ToLower(a.Val)
|
||||
case "content":
|
||||
content = a.Val
|
||||
}
|
||||
}
|
||||
if name == "description" && description == "" {
|
||||
description = content
|
||||
}
|
||||
}
|
||||
if tag == "a" {
|
||||
href := attrVal(n, "href")
|
||||
if href != "" {
|
||||
href = strings.SplitN(href, "#", 2)[0]
|
||||
if href != "" {
|
||||
href = resolveURL(base, basePath, href)
|
||||
if href != "" {
|
||||
hrefs = append(hrefs, href)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if n.Type == html.TextNode && n.Parent != nil {
|
||||
parentTag := ""
|
||||
if n.Parent.Type == html.ElementNode {
|
||||
parentTag = strings.ToLower(n.Parent.Data)
|
||||
}
|
||||
if parentTag == "script" || parentTag == "style" || parentTag == "svg" {
|
||||
goto children
|
||||
}
|
||||
s := wsRe.ReplaceAllString(n.Data, " ")
|
||||
s = strings.TrimSpace(s)
|
||||
if s != "" {
|
||||
if parentTag == "title" {
|
||||
title = s
|
||||
} else {
|
||||
textParts = append(textParts, s)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
children:
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
dfs(c)
|
||||
}
|
||||
}
|
||||
dfs(doc)
|
||||
|
||||
text = strings.Join(textParts, " ")
|
||||
return
|
||||
}
|
||||
|
||||
func attrVal(n *html.Node, key string) string {
|
||||
for _, a := range n.Attr {
|
||||
if strings.ToLower(a.Key) == key {
|
||||
return a.Val
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func baseFromURL(rawURL string) string {
|
||||
idx := strings.Index(rawURL, "://")
|
||||
if idx < 0 {
|
||||
return ""
|
||||
}
|
||||
rest := rawURL[idx+3:]
|
||||
slash := strings.Index(rest, "/")
|
||||
if slash < 0 {
|
||||
return rawURL
|
||||
}
|
||||
return rawURL[:idx+3+slash]
|
||||
}
|
||||
|
||||
func pathFromURL(rawURL string) string {
|
||||
idx := strings.Index(rawURL, "://")
|
||||
if idx < 0 {
|
||||
return "/"
|
||||
}
|
||||
rest := rawURL[idx+3:]
|
||||
slash := strings.Index(rest, "/")
|
||||
if slash < 0 {
|
||||
return "/"
|
||||
}
|
||||
p := rest[slash:]
|
||||
// strip query/fragment
|
||||
p = strings.SplitN(p, "?", 2)[0]
|
||||
p = strings.SplitN(p, "#", 2)[0]
|
||||
return p
|
||||
}
|
||||
|
||||
func resolveURL(base, basePath, href string) string {
|
||||
// Absolute URL
|
||||
if strings.HasPrefix(href, "http://") || strings.HasPrefix(href, "https://") {
|
||||
return href
|
||||
}
|
||||
// Protocol-relative
|
||||
if strings.HasPrefix(href, "//") {
|
||||
// extract scheme from base
|
||||
idx := strings.Index(base, "://")
|
||||
if idx < 0 {
|
||||
return ""
|
||||
}
|
||||
return base[:idx+1] + href
|
||||
}
|
||||
// Absolute path
|
||||
if strings.HasPrefix(href, "/") {
|
||||
return base + href
|
||||
}
|
||||
// Relative path
|
||||
dir := path.Dir(basePath)
|
||||
return base + path.Clean(dir+"/"+href)
|
||||
}
|
||||
@@ -0,0 +1,693 @@
|
||||
// Package search implements the user-facing search HTTP server.
|
||||
package search
|
||||
|
||||
import (
|
||||
"container/heap"
|
||||
"encoding/json"
|
||||
"log"
|
||||
"math"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"sese-engine/analyzer"
|
||||
"sese-engine/config"
|
||||
"sese-engine/info"
|
||||
"sese-engine/parser"
|
||||
"sese-engine/storage"
|
||||
)
|
||||
|
||||
// Server is the search HTTP server.
|
||||
type Server struct {
|
||||
db *storage.DB
|
||||
infoSvc *info.Service
|
||||
analyzer *analyzer.Analyzer
|
||||
httpCli *http.Client // for online snippet fetching
|
||||
}
|
||||
|
||||
// New creates a search Server.
|
||||
func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
|
||||
return &Server{
|
||||
db: db,
|
||||
infoSvc: infoSvc,
|
||||
analyzer: a,
|
||||
httpCli: &http.Client{
|
||||
Timeout: time.Duration(config.OnlineSnippetTimeout) * time.Second,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// Handler returns the http.Handler.
|
||||
func (s *Server) Handler() http.Handler {
|
||||
mux := http.NewServeMux()
|
||||
mux.HandleFunc("/search", s.handleSearch)
|
||||
return mux
|
||||
}
|
||||
|
||||
// ListenAndServe starts the search server.
|
||||
func (s *Server) ListenAndServe(addr string) error {
|
||||
log.Printf("[search] listening on %s", addr)
|
||||
return http.ListenAndServe(addr, s.Handler())
|
||||
}
|
||||
|
||||
// ---- search handler ----
|
||||
|
||||
type searchResponse struct {
|
||||
Tokens []string `json:"tokens"`
|
||||
Counts map[string]int `json:"counts"`
|
||||
Results []searchResult `json:"results"`
|
||||
Total int `json:"total"`
|
||||
}
|
||||
|
||||
type searchResult struct {
|
||||
Score float64 `json:"score"`
|
||||
URL string `json:"url"`
|
||||
Snippet *snippetInfo `json:"snippet,omitempty"`
|
||||
Relevance map[string]float64 `json:"relevance"`
|
||||
DomainCount int `json:"domain_count"`
|
||||
Factors map[string]float64 `json:"factors,omitempty"`
|
||||
}
|
||||
|
||||
type snippetInfo struct {
|
||||
Title string `json:"title"`
|
||||
Description string `json:"description"`
|
||||
Text string `json:"text"`
|
||||
}
|
||||
|
||||
var siteRe = regexp.MustCompile(`^site:(.+)$`)
|
||||
|
||||
func (s *Server) handleSearch(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Access-Control-Allow-Origin", "*")
|
||||
w.Header().Set("Content-Type", "application/json; charset=utf-8")
|
||||
|
||||
q := r.URL.Query().Get("q")
|
||||
if q == "" {
|
||||
if qh := r.URL.Query().Get("qh"); qh != "" {
|
||||
decoded, err := url.PathUnescape(qh)
|
||||
if err == nil {
|
||||
q = decoded
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Parse slice param "0:10"
|
||||
sliceStr := r.URL.Query().Get("slice")
|
||||
sliceFrom, sliceTo := 0, 10
|
||||
if sliceStr != "" {
|
||||
parts := strings.SplitN(sliceStr, ":", 2)
|
||||
if len(parts) == 2 {
|
||||
a := atoi(parts[0])
|
||||
b := atoi(parts[1])
|
||||
if a >= 0 && b > a && b-a <= 20 {
|
||||
sliceFrom, sliceTo = a, b
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Parse tokens and site filter
|
||||
var tokens []string
|
||||
var siteFilter string
|
||||
for _, part := range strings.Fields(q) {
|
||||
if m := siteRe.FindStringSubmatch(part); len(m) > 1 {
|
||||
siteFilter = m[1]
|
||||
} else {
|
||||
segs := s.analyzer.Segment(part, false)
|
||||
for _, t := range segs {
|
||||
if !s.infoSvc.IsBlocked(t) {
|
||||
tokens = append(tokens, t)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if len(tokens) > 20 {
|
||||
tokens = tokens[:20]
|
||||
}
|
||||
|
||||
results, total := s.query(tokens, sliceFrom, sliceTo, siteFilter)
|
||||
|
||||
// Count per keyword
|
||||
counts := make(map[string]int, len(tokens))
|
||||
for _, t := range tokens {
|
||||
entries, _ := s.db.GetIndex(t)
|
||||
counts[t] = len(entries)
|
||||
}
|
||||
|
||||
resp := searchResponse{
|
||||
Tokens: tokens,
|
||||
Counts: counts,
|
||||
Results: results,
|
||||
Total: total,
|
||||
}
|
||||
json.NewEncoder(w).Encode(resp)
|
||||
}
|
||||
|
||||
// query executes the multi-keyword search and returns ranked results.
|
||||
func (s *Server) query(tokens []string, from, to int, siteFilter string) ([]searchResult, int) {
|
||||
if len(tokens) == 0 {
|
||||
return nil, 0
|
||||
}
|
||||
|
||||
// Load inverted index for each token
|
||||
type tokenIndex struct {
|
||||
token string
|
||||
entries []storage.IndexEntry
|
||||
defVal float64
|
||||
}
|
||||
tokenIndexes := make([]tokenIndex, 0, len(tokens))
|
||||
for _, t := range tokens {
|
||||
entries, _ := s.db.GetIndex(t)
|
||||
defVal := 1.0 / 10000 * float64(max(100, len(entries))) / float64(config.MaxURLsPerKey)
|
||||
if len(entries) >= config.MaxURLsPerKey {
|
||||
weights := make([]float64, len(entries))
|
||||
for i, e := range entries {
|
||||
weights[i] = float64(e.Weight)
|
||||
}
|
||||
sort.Sort(sort.Reverse(sort.Float64Slice(weights)))
|
||||
defVal = math.Max(1.0/10000, weights[config.MaxURLsPerKey-1]/2)
|
||||
}
|
||||
tokenIndexes = append(tokenIndexes, tokenIndex{t, entries, defVal})
|
||||
}
|
||||
|
||||
// Build URL → per-token weights map
|
||||
urlWeights := make(map[string]map[string]float64)
|
||||
for _, ti := range tokenIndexes {
|
||||
for _, e := range ti.entries {
|
||||
if urlWeights[e.URL] == nil {
|
||||
urlWeights[e.URL] = make(map[string]float64)
|
||||
}
|
||||
urlWeights[e.URL][ti.token] = float64(e.Weight)
|
||||
}
|
||||
}
|
||||
|
||||
// Site filter
|
||||
total := len(urlWeights)
|
||||
if siteFilter != "" {
|
||||
filtered := make(map[string]map[string]float64)
|
||||
for u, vs := range urlWeights {
|
||||
h := netloc(u)
|
||||
if matchSite(h, siteFilter) {
|
||||
filtered[u] = vs
|
||||
}
|
||||
}
|
||||
urlWeights = filtered
|
||||
total = len(urlWeights)
|
||||
}
|
||||
|
||||
// Build default value map
|
||||
defVals := make(map[string]float64, len(tokenIndexes))
|
||||
for _, ti := range tokenIndexes {
|
||||
defVals[ti.token] = ti.defVal
|
||||
}
|
||||
|
||||
// Compute relevance + initial score for each URL
|
||||
candidates := make([]candidate, 0, len(urlWeights))
|
||||
for u, vs := range urlWeights {
|
||||
rel := 1.0
|
||||
for _, ti := range tokenIndexes {
|
||||
vp := vs[ti.token]
|
||||
if vp == 0 {
|
||||
vp = defVals[ti.token]
|
||||
}
|
||||
if vp > 0.06 {
|
||||
vp = math.Log((vp-0.06)*40+1)/40 + 0.06
|
||||
}
|
||||
rel *= vp
|
||||
}
|
||||
prosper := 1 + s.infoSvc.Prosper(u)*config.BacklinkWeight
|
||||
bad := badURL(u)
|
||||
adjust := s.infoSvc.Adjust(netloc(u))
|
||||
score := rel * prosper * (1 - bad) * adjust * 0.1
|
||||
|
||||
var vec [12]float64
|
||||
vec[0] = score
|
||||
vec[1] = rel
|
||||
vec[2] = prosper
|
||||
vec[3] = 1 - bad
|
||||
vec[4] = 1 // language multiplier placeholder
|
||||
vec[5] = 1 // repetition placeholder
|
||||
vec[6] = adjust
|
||||
vec[7] = 1 // time multiplier placeholder
|
||||
vec[8] = 1 // consecutive keyword placeholder
|
||||
vec[9] = 1 // keyword content placeholder
|
||||
vec[10] = 1 // URL time placeholder
|
||||
vec[11] = 0.1
|
||||
|
||||
candidates = append(candidates, candidate{u, rel, vec})
|
||||
}
|
||||
|
||||
// Early relevance threshold
|
||||
sort.Slice(candidates, func(i, j int) bool {
|
||||
return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
|
||||
})
|
||||
|
||||
// Apply site info factors to top 256
|
||||
now := time.Now().Unix()
|
||||
limit256 := 256
|
||||
if len(candidates) < 256 {
|
||||
limit256 = len(candidates)
|
||||
}
|
||||
|
||||
var wg sync.WaitGroup
|
||||
for i := 0; i < limit256; i++ {
|
||||
wg.Add(1)
|
||||
go func(idx int) {
|
||||
defer wg.Done()
|
||||
c := &candidates[idx]
|
||||
h := netloc(c.url)
|
||||
siteInfo, _ := s.db.GetSiteInfo(h)
|
||||
langMul := languageMultiplier(siteInfo)
|
||||
timeMul := timeMul(siteInfo, now)
|
||||
urlTimeMul := urlTimeMul(s.db, c.url, now)
|
||||
|
||||
c.scoreVec[0] = c.scoreVec[0] * 10 * langMul * timeMul * urlTimeMul
|
||||
c.scoreVec[4] = langMul
|
||||
c.scoreVec[7] = timeMul
|
||||
c.scoreVec[10] = urlTimeMul
|
||||
}(i)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
sort.Slice(candidates, func(i, j int) bool {
|
||||
return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
|
||||
})
|
||||
|
||||
// Apply consecutive-keyword and repetition bonuses to top 80
|
||||
limit80 := 80
|
||||
if len(candidates) < 80 {
|
||||
limit80 = len(candidates)
|
||||
}
|
||||
|
||||
titles := make([]string, limit80)
|
||||
for i := 0; i < limit80; i++ {
|
||||
if snippet, err := s.db.GetSnippet(candidates[i].url); err == nil {
|
||||
titles[i] = snippet.Title
|
||||
}
|
||||
}
|
||||
|
||||
// Repetition penaliser
|
||||
for i := 0; i < limit80; i++ {
|
||||
h := repetitionSimilarity(titles, i)
|
||||
consecutive := consecutiveCount(titles[i], tokens)
|
||||
repMul := 1.0
|
||||
if h > 0.5 {
|
||||
repMul = 1 - (h - 0.5)
|
||||
}
|
||||
consMul := math.Pow(config.ConsecutiveKeyWeight, float64(consecutive))
|
||||
candidates[i].scoreVec[0] *= repMul * consMul
|
||||
candidates[i].scoreVec[5] = repMul
|
||||
candidates[i].scoreVec[8] = consMul
|
||||
}
|
||||
|
||||
sort.Slice(candidates, func(i, j int) bool {
|
||||
return candidates[i].scoreVec[0] > candidates[j].scoreVec[0]
|
||||
})
|
||||
|
||||
// Re-rank: interleave domains
|
||||
reranked := rerank(candidates, from, to)
|
||||
|
||||
// Fetch snippets and build output
|
||||
results := make([]searchResult, 0, len(reranked))
|
||||
var snippetMu sync.Mutex
|
||||
var snippetWg sync.WaitGroup
|
||||
|
||||
for _, c := range reranked {
|
||||
snippetWg.Add(1)
|
||||
go func(cand candidate) {
|
||||
defer snippetWg.Done()
|
||||
snip := s.getSnippet(cand.url)
|
||||
r := searchResult{
|
||||
Score: cand.scoreVec[0],
|
||||
URL: unescapeURL(cand.url),
|
||||
Snippet: snip,
|
||||
Relevance: make(map[string]float64),
|
||||
DomainCount: 0,
|
||||
Factors: map[string]float64{
|
||||
"relevance": cand.scoreVec[1],
|
||||
"backlink": cand.scoreVec[2],
|
||||
"url_quality": cand.scoreVec[3],
|
||||
"language": cand.scoreVec[4],
|
||||
"repetition": cand.scoreVec[5],
|
||||
"adjust": cand.scoreVec[6],
|
||||
"site_time": cand.scoreVec[7],
|
||||
"consecutive": cand.scoreVec[8],
|
||||
"url_time": cand.scoreVec[10],
|
||||
},
|
||||
}
|
||||
for _, ti := range tokenIndexes {
|
||||
r.Relevance[ti.token] = urlWeights[cand.url][ti.token]
|
||||
}
|
||||
snippetMu.Lock()
|
||||
results = append(results, r)
|
||||
snippetMu.Unlock()
|
||||
}(c)
|
||||
}
|
||||
snippetWg.Wait()
|
||||
|
||||
// Preserve order (goroutines may reorder)
|
||||
urlOrder := make(map[string]int)
|
||||
for i, c := range reranked {
|
||||
urlOrder[c.url] = i
|
||||
}
|
||||
sort.Slice(results, func(i, j int) bool {
|
||||
return urlOrder[results[i].URL] < urlOrder[results[j].URL]
|
||||
})
|
||||
|
||||
return results, total
|
||||
}
|
||||
|
||||
// getSnippet fetches (or caches) a snippet for a URL.
|
||||
func (s *Server) getSnippet(rawURL string) *snippetInfo {
|
||||
// Try cache first
|
||||
if entry, err := s.db.GetSnippet(rawURL); err == nil {
|
||||
snip := buildSnippet(entry)
|
||||
return snip
|
||||
}
|
||||
if !config.UseOnlineSnippet {
|
||||
return nil
|
||||
}
|
||||
// Fetch online with a simple HTTP client (no robots.txt check for search snippets)
|
||||
req, err := http.NewRequest("GET", rawURL, nil)
|
||||
if err != nil {
|
||||
return nil
|
||||
}
|
||||
req.Header.Set("User-Agent", config.SpiderName)
|
||||
resp, err := s.httpCli.Do(req)
|
||||
if err != nil || resp.StatusCode != 200 {
|
||||
return nil
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
ct := resp.Header.Get("Content-Type")
|
||||
if !strings.Contains(ct, "text/html") {
|
||||
return nil
|
||||
}
|
||||
body := readBodyLimited(resp, 60000)
|
||||
title, desc, text, _ := parser.ParseHTML(body, resp.Request.URL.String())
|
||||
entry := &storage.SnippetEntry{
|
||||
Title: title,
|
||||
Description: truncate(desc, 256),
|
||||
Text: truncate(text, 256),
|
||||
Timestamp: time.Now().Unix(),
|
||||
}
|
||||
_ = s.db.SetSnippet(rawURL, entry)
|
||||
return buildSnippet(entry)
|
||||
}
|
||||
|
||||
func buildSnippet(entry *storage.SnippetEntry) *snippetInfo {
|
||||
if entry == nil || (entry.Title == "" && entry.Description == "" && entry.Text == "") {
|
||||
return nil
|
||||
}
|
||||
return &snippetInfo{
|
||||
Title: entry.Title,
|
||||
Description: entry.Description,
|
||||
Text: entry.Text,
|
||||
}
|
||||
}
|
||||
|
||||
// ---- scoring helpers ----
|
||||
|
||||
func languageMultiplier(si *storage.SiteInfo) float64 {
|
||||
if si == nil || len(si.Languages) == 0 {
|
||||
return 1.0
|
||||
}
|
||||
total := 0.0
|
||||
for _, v := range si.Languages {
|
||||
total += v
|
||||
}
|
||||
chinese := si.Languages["zh"] / total
|
||||
weird := (total - si.Languages["zh"] - si.Languages["en"] - si.Languages["ja"]) / total
|
||||
return 1 + chinese*config.LanguageWeight - weird*config.LanguageWeight
|
||||
}
|
||||
|
||||
func timeMul(si *storage.SiteInfo, now int64) float64 {
|
||||
if si == nil {
|
||||
return 1.0
|
||||
}
|
||||
t := si.LastVisitTime
|
||||
if t == 0 {
|
||||
t = 1648000000
|
||||
}
|
||||
days := (now - t) / (3600 * 24)
|
||||
if days < 0 {
|
||||
days = 0
|
||||
}
|
||||
if days > 180 {
|
||||
days = 180
|
||||
}
|
||||
if days > 0 {
|
||||
days--
|
||||
}
|
||||
return math.Pow(config.WeightDailyDecay, float64(days))
|
||||
}
|
||||
|
||||
func urlTimeMul(db *storage.DB, rawURL string, now int64) float64 {
|
||||
entry, err := db.GetSnippet(rawURL)
|
||||
if err != nil || entry == nil {
|
||||
return 1.0
|
||||
}
|
||||
days := (now - entry.Timestamp) / (3600 * 24)
|
||||
if days <= 30 {
|
||||
return 1.0
|
||||
}
|
||||
return math.Pow((2+config.WeightDailyDecay)/3, float64(days))
|
||||
}
|
||||
|
||||
func badURL(u string) float64 {
|
||||
s := math.Max(0, float64(len(u)-30)/200.0)
|
||||
if strings.Contains(u, ".htm") || strings.Contains(u, ".php") {
|
||||
s += (1 - s) * 0.3
|
||||
}
|
||||
if strings.Count(strings.TrimRight(u, "/"), "/") > 2 {
|
||||
s += (1 - s) * 0.1
|
||||
}
|
||||
if len(u) < 5 || u[4] == ':' {
|
||||
s += (1 - s) * 0.3
|
||||
}
|
||||
return math.Min(s, 0.9)
|
||||
}
|
||||
|
||||
func netloc(rawURL string) string {
|
||||
parts := strings.SplitN(rawURL, "/", 4)
|
||||
if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" {
|
||||
return parts[2]
|
||||
}
|
||||
return rawURL
|
||||
}
|
||||
|
||||
func matchSite(host, pattern string) bool {
|
||||
if host == pattern {
|
||||
return true
|
||||
}
|
||||
if strings.HasSuffix(host, "."+pattern) {
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func consecutiveCount(title string, tokens []string) int {
|
||||
c := 0
|
||||
for i := 0; i < len(tokens)-1; i++ {
|
||||
if strings.Contains(title, tokens[i]+tokens[i+1]) {
|
||||
c++
|
||||
}
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
func repetitionSimilarity(titles []string, idx int) float64 {
|
||||
if idx == 0 {
|
||||
return 0
|
||||
}
|
||||
t := titles[idx]
|
||||
if t == "" {
|
||||
return 0
|
||||
}
|
||||
best := 0.0
|
||||
for _, prev := range titles[:idx] {
|
||||
if prev == "" {
|
||||
continue
|
||||
}
|
||||
sim := 1 - float64(levenshtein(t, prev))/float64(max(len(t), len(prev)))
|
||||
if sim > best {
|
||||
best = sim
|
||||
}
|
||||
}
|
||||
return best
|
||||
}
|
||||
|
||||
func levenshtein(a, b string) int {
|
||||
ra := []rune(a)
|
||||
rb := []rune(b)
|
||||
la, lb := len(ra), len(rb)
|
||||
if la == 0 {
|
||||
return lb
|
||||
}
|
||||
if lb == 0 {
|
||||
return la
|
||||
}
|
||||
prev := make([]int, lb+1)
|
||||
curr := make([]int, lb+1)
|
||||
for j := 0; j <= lb; j++ {
|
||||
prev[j] = j
|
||||
}
|
||||
for i := 1; i <= la; i++ {
|
||||
curr[0] = i
|
||||
for j := 1; j <= lb; j++ {
|
||||
cost := 1
|
||||
if ra[i-1] == rb[j-1] {
|
||||
cost = 0
|
||||
}
|
||||
curr[j] = min3(curr[j-1]+1, prev[j]+1, prev[j-1]+cost)
|
||||
}
|
||||
prev, curr = curr, prev
|
||||
}
|
||||
return prev[lb]
|
||||
}
|
||||
|
||||
func min3(a, b, c int) int {
|
||||
if a < b {
|
||||
if a < c {
|
||||
return a
|
||||
}
|
||||
return c
|
||||
}
|
||||
if b < c {
|
||||
return b
|
||||
}
|
||||
return c
|
||||
}
|
||||
|
||||
// rerank interleaves results from different domains.
|
||||
type domainHeap []rerankItem
|
||||
|
||||
type rerankItem struct {
|
||||
score float64
|
||||
url string
|
||||
domainMul float64
|
||||
vec [12]float64
|
||||
}
|
||||
|
||||
func (h domainHeap) Len() int { return len(h) }
|
||||
func (h domainHeap) Less(i, j int) bool { return h[i].score*h[i].domainMul > h[j].score*h[j].domainMul }
|
||||
func (h domainHeap) Swap(i, j int) { h[i], h[j] = h[j], h[i] }
|
||||
func (h *domainHeap) Push(x interface{}) { *h = append(*h, x.(rerankItem)) }
|
||||
func (h *domainHeap) Pop() interface{} {
|
||||
old := *h
|
||||
n := len(old)
|
||||
x := old[n-1]
|
||||
*h = old[:n-1]
|
||||
return x
|
||||
}
|
||||
|
||||
type candidate struct {
|
||||
url string
|
||||
relevance float64
|
||||
scoreVec [12]float64
|
||||
}
|
||||
|
||||
func rerank(candidates []candidate, from, to int) []candidate {
|
||||
domainItems := make(map[string][]candidate)
|
||||
for _, c := range candidates {
|
||||
h := netloc(c.url)
|
||||
domainItems[h] = append(domainItems[h], c)
|
||||
}
|
||||
|
||||
h := &domainHeap{}
|
||||
heap.Init(h)
|
||||
domainMul := make(map[string]float64)
|
||||
|
||||
for domain, items := range domainItems {
|
||||
domainMul[domain] = 1.0
|
||||
// Sort items within domain
|
||||
sort.Slice(items, func(i, j int) bool {
|
||||
return items[i].scoreVec[0] < items[j].scoreVec[0]
|
||||
})
|
||||
top := items[len(items)-1]
|
||||
domainItems[domain] = items[:len(items)-1]
|
||||
heap.Push(h, rerankItem{top.scoreVec[0], top.url, domainMul[domain], top.scoreVec})
|
||||
}
|
||||
|
||||
var result []candidate
|
||||
for h.Len() > 0 && len(result) < to {
|
||||
item := heap.Pop(h).(rerankItem)
|
||||
if len(result) >= from {
|
||||
result = append(result, candidate{url: item.url, scoreVec: item.vec})
|
||||
}
|
||||
domain := netloc(item.url)
|
||||
domainMul[domain] /= 8
|
||||
remaining := domainItems[domain]
|
||||
if len(remaining) > 0 {
|
||||
next := remaining[len(remaining)-1]
|
||||
domainItems[domain] = remaining[:len(remaining)-1]
|
||||
heap.Push(h, rerankItem{next.scoreVec[0], next.url, domainMul[domain], next.scoreVec})
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// ---- misc ----
|
||||
|
||||
func readBodyLimited(resp *http.Response, limit int64) string {
|
||||
data := make([]byte, 0, limit)
|
||||
buf := make([]byte, 4096)
|
||||
var total int64
|
||||
for {
|
||||
n, err := resp.Body.Read(buf)
|
||||
if n > 0 {
|
||||
data = append(data, buf[:n]...)
|
||||
total += int64(n)
|
||||
if total >= limit {
|
||||
break
|
||||
}
|
||||
}
|
||||
if err != nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
return string(data)
|
||||
}
|
||||
|
||||
func truncate(s string, n int) string {
|
||||
if len(s) <= n {
|
||||
return s
|
||||
}
|
||||
return s[:n]
|
||||
}
|
||||
|
||||
func unescapeURL(u string) string {
|
||||
decoded, err := url.PathUnescape(u)
|
||||
if err != nil {
|
||||
return u
|
||||
}
|
||||
return decoded
|
||||
}
|
||||
|
||||
func atoi(s string) int {
|
||||
n := 0
|
||||
for _, c := range s {
|
||||
if c < '0' || c > '9' {
|
||||
return n
|
||||
}
|
||||
n = n*10 + int(c-'0')
|
||||
}
|
||||
return n
|
||||
}
|
||||
|
||||
func max(a, b int) int {
|
||||
if a > b {
|
||||
return a
|
||||
}
|
||||
return b
|
||||
}
|
||||
|
||||
func min(a, b int) int {
|
||||
if a < b {
|
||||
return a
|
||||
}
|
||||
return b
|
||||
}
|
||||
@@ -0,0 +1,300 @@
|
||||
// Package storage provides the persistent index and site-info storage backed by bbolt.
|
||||
//
|
||||
// Index space → a single bbolt bucket "index" where key = keyword (string),
|
||||
// value = brotli-compressed JSON array of [weight, url] pairs.
|
||||
//
|
||||
// Gate (门) → a bbolt bucket "gate" where key = URL (string),
|
||||
// value = brotli-compressed JSON array [title, desc, text, timestamp].
|
||||
//
|
||||
// SiteGate (网站之门) → a bbolt bucket "site_gate" where key = hostname (string),
|
||||
// value = brotli-compressed JSON of SiteInfo struct.
|
||||
//
|
||||
// The Python version used a custom hash-bucket scheme; here bbolt handles it natively.
|
||||
package storage
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/andybalholm/brotli"
|
||||
bolt "go.etcd.io/bbolt"
|
||||
)
|
||||
|
||||
// IndexEntry is a single entry in the inverted index.
|
||||
type IndexEntry struct {
|
||||
Weight float32 `json:"w"`
|
||||
URL string `json:"u"`
|
||||
}
|
||||
|
||||
// SnippetEntry is cached snippet data for a URL.
|
||||
type SnippetEntry struct {
|
||||
Title string `json:"title"`
|
||||
Description string `json:"desc"`
|
||||
Text string `json:"text"`
|
||||
Timestamp int64 `json:"ts"`
|
||||
}
|
||||
|
||||
var (
|
||||
bucketIndex = []byte("index")
|
||||
bucketGate = []byte("gate")
|
||||
bucketSiteGate = []byte("site_gate")
|
||||
)
|
||||
|
||||
// DB wraps a bbolt database and exposes typed access methods.
|
||||
// bbolt handles its own locking internally.
|
||||
type DB struct {
|
||||
db *bolt.DB
|
||||
}
|
||||
|
||||
// Open creates or opens the bbolt database at the given directory path.
|
||||
func Open(dir string) (*DB, error) {
|
||||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||||
return nil, fmt.Errorf("storage.Open mkdir: %w", err)
|
||||
}
|
||||
path := filepath.Join(dir, "sese.db")
|
||||
db, err := bolt.Open(path, 0o600, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("storage.Open bolt: %w", err)
|
||||
}
|
||||
// Ensure buckets exist
|
||||
err = db.Update(func(tx *bolt.Tx) error {
|
||||
for _, b := range [][]byte{bucketIndex, bucketGate, bucketSiteGate} {
|
||||
if _, err := tx.CreateBucketIfNotExists(b); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("storage.Open create buckets: %w", err)
|
||||
}
|
||||
return &DB{db: db}, nil
|
||||
}
|
||||
|
||||
// Close closes the underlying bbolt database.
|
||||
func (d *DB) Close() error {
|
||||
return d.db.Close()
|
||||
}
|
||||
|
||||
// ---- helpers ----
|
||||
|
||||
func compress(data []byte) ([]byte, error) {
|
||||
buf := make([]byte, 0, len(data))
|
||||
w := brotli.NewWriterLevel((*appendWriter)(&buf), 6)
|
||||
if _, err := w.Write(data); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
func decompress(data []byte) ([]byte, error) {
|
||||
r := brotli.NewReader(
|
||||
(*byteReader)(&data),
|
||||
)
|
||||
out := make([]byte, 0, len(data)*3)
|
||||
tmp := make([]byte, 4096)
|
||||
for {
|
||||
n, err := r.Read(tmp)
|
||||
out = append(out, tmp[:n]...)
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
return out, err
|
||||
}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// appendWriter implements io.Writer on top of a *[]byte.
|
||||
type appendWriter []byte
|
||||
|
||||
func (a *appendWriter) Write(p []byte) (int, error) {
|
||||
*a = append(*a, p...)
|
||||
return len(p), nil
|
||||
}
|
||||
|
||||
// byteReader wraps []byte as io.Reader.
|
||||
type byteReader []byte
|
||||
|
||||
func (b *byteReader) Read(p []byte) (int, error) {
|
||||
if len(*b) == 0 {
|
||||
return 0, io.EOF
|
||||
}
|
||||
n := copy(p, *b)
|
||||
*b = (*b)[n:]
|
||||
return n, nil
|
||||
}
|
||||
|
||||
func marshalCompress(v any) ([]byte, error) {
|
||||
raw, err := json.Marshal(v)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return compress(raw)
|
||||
}
|
||||
|
||||
func decompressUnmarshal(data []byte, v any) error {
|
||||
raw, err := decompress(data)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return json.Unmarshal(raw, v)
|
||||
}
|
||||
|
||||
// ---- Index (inverted index) ----
|
||||
|
||||
// GetIndex retrieves all IndexEntry values for a keyword.
|
||||
func (d *DB) GetIndex(keyword string) ([]IndexEntry, error) {
|
||||
var entries []IndexEntry
|
||||
err := d.db.View(func(tx *bolt.Tx) error {
|
||||
b := tx.Bucket(bucketIndex)
|
||||
v := b.Get([]byte(keyword))
|
||||
if v == nil {
|
||||
return nil
|
||||
}
|
||||
return decompressUnmarshal(v, &entries)
|
||||
})
|
||||
return entries, err
|
||||
}
|
||||
|
||||
// SetIndex overwrites the IndexEntry list for a keyword.
|
||||
func (d *DB) SetIndex(keyword string, entries []IndexEntry) error {
|
||||
data, err := marshalCompress(entries)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return d.db.Update(func(tx *bolt.Tx) error {
|
||||
return tx.Bucket(bucketIndex).Put([]byte(keyword), data)
|
||||
})
|
||||
}
|
||||
|
||||
// BatchSetIndex writes multiple keyword→entries pairs in one transaction.
|
||||
func (d *DB) BatchSetIndex(batch map[string][]IndexEntry) error {
|
||||
return d.db.Update(func(tx *bolt.Tx) error {
|
||||
b := tx.Bucket(bucketIndex)
|
||||
for keyword, entries := range batch {
|
||||
data, err := marshalCompress(entries)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := b.Put([]byte(keyword), data); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
// ForEachIndex iterates over all index entries. fn receives keyword and entries.
|
||||
func (d *DB) ForEachIndex(fn func(keyword string, entries []IndexEntry) error) error {
|
||||
return d.db.View(func(tx *bolt.Tx) error {
|
||||
return tx.Bucket(bucketIndex).ForEach(func(k, v []byte) error {
|
||||
var entries []IndexEntry
|
||||
if err := decompressUnmarshal(v, &entries); err != nil {
|
||||
return nil // skip corrupted entries
|
||||
}
|
||||
return fn(string(k), entries)
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// ---- Gate (URL snippet cache) ----
|
||||
|
||||
// GetSnippet retrieves the cached snippet for a URL.
|
||||
func (d *DB) GetSnippet(url string) (*SnippetEntry, error) {
|
||||
var entry SnippetEntry
|
||||
err := d.db.View(func(tx *bolt.Tx) error {
|
||||
v := tx.Bucket(bucketGate).Get([]byte(url))
|
||||
if v == nil {
|
||||
return fmt.Errorf("not found")
|
||||
}
|
||||
return decompressUnmarshal(v, &entry)
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &entry, nil
|
||||
}
|
||||
|
||||
// SetSnippet stores a cached snippet for a URL.
|
||||
func (d *DB) SetSnippet(url string, entry *SnippetEntry) error {
|
||||
data, err := marshalCompress(entry)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return d.db.Update(func(tx *bolt.Tx) error {
|
||||
return tx.Bucket(bucketGate).Put([]byte(url), data)
|
||||
})
|
||||
}
|
||||
|
||||
// ---- SiteGate (site metadata) ----
|
||||
|
||||
// SiteInfo mirrors the Python 网站 dataclass.
|
||||
type SiteInfo struct {
|
||||
VisitCount int `json:"visit_count"`
|
||||
LastVisitTime int64 `json:"last_visit_time"`
|
||||
Fingerprint any `json:"fingerprint,omitempty"`
|
||||
SuccessRate *float64 `json:"success_rate,omitempty"`
|
||||
HTMLStructure string `json:"html_structure,omitempty"`
|
||||
IPs []string `json:"ips,omitempty"`
|
||||
Quality *float64 `json:"quality,omitempty"`
|
||||
HTTPSAvailable *bool `json:"https_available,omitempty"`
|
||||
Keywords []string `json:"keywords,omitempty"`
|
||||
OutLinks []string `json:"out_links,omitempty"`
|
||||
Languages map[string]float64 `json:"languages,omitempty"`
|
||||
Redirects map[string]string `json:"redirects,omitempty"`
|
||||
ServerTypes []string `json:"server_types,omitempty"`
|
||||
}
|
||||
|
||||
// GetSiteInfo retrieves metadata for a hostname.
|
||||
func (d *DB) GetSiteInfo(host string) (*SiteInfo, error) {
|
||||
var info SiteInfo
|
||||
err := d.db.View(func(tx *bolt.Tx) error {
|
||||
v := tx.Bucket(bucketSiteGate).Get([]byte(host))
|
||||
if v == nil {
|
||||
return fmt.Errorf("not found")
|
||||
}
|
||||
return decompressUnmarshal(v, &info)
|
||||
})
|
||||
if err != nil {
|
||||
return &SiteInfo{Languages: make(map[string]float64), Redirects: make(map[string]string)}, nil
|
||||
}
|
||||
if info.Languages == nil {
|
||||
info.Languages = make(map[string]float64)
|
||||
}
|
||||
if info.Redirects == nil {
|
||||
info.Redirects = make(map[string]string)
|
||||
}
|
||||
return &info, nil
|
||||
}
|
||||
|
||||
// SetSiteInfo stores metadata for a hostname.
|
||||
func (d *DB) SetSiteInfo(host string, info *SiteInfo) error {
|
||||
data, err := marshalCompress(info)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return d.db.Update(func(tx *bolt.Tx) error {
|
||||
return tx.Bucket(bucketSiteGate).Put([]byte(host), data)
|
||||
})
|
||||
}
|
||||
|
||||
// ForEachSite iterates over all site metadata entries.
|
||||
func (d *DB) ForEachSite(fn func(host string, info *SiteInfo) error) error {
|
||||
return d.db.View(func(tx *bolt.Tx) error {
|
||||
return tx.Bucket(bucketSiteGate).ForEach(func(k, v []byte) error {
|
||||
var info SiteInfo
|
||||
if err := decompressUnmarshal(v, &info); err != nil {
|
||||
return nil
|
||||
}
|
||||
return fn(string(k), &info)
|
||||
})
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user