Signed-off-by: 吴文峰 <kevin@lmve.net>
This commit is contained in:
@@ -0,0 +1,588 @@
|
||||
// crawler.go — BFS crawl loop, URL scheduling, and site-info updating.
|
||||
package crawler
|
||||
|
||||
import (
	"bytes"
	"encoding/json"
	"log"
	"math"
	"math/rand"
	"net/http"
	"net/url"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"sese-engine/analyzer"
	"sese-engine/config"
	"sese-engine/parser"
	"sese-engine/storage"
)
|
||||
|
||||
|
||||
// Stats is a set of crawl counters. The crawler's live copy is updated with
// sync/atomic, so callers should obtain a consistent-per-field snapshot through
// (*Crawler).GetStats rather than reading the live fields directly.
//
// VisitedURLs counts every visit attempt, SuccessURLs counts the visits whose
// fetch succeeded, and KeywordsFetched sums the keywords kept per page.
type Stats struct {
	VisitedURLs, SuccessURLs, KeywordsFetched int64
}
|
||||
|
||||
// Crawler orchestrates the BFS crawl.
|
||||
type Crawler struct {
|
||||
fetcher *Fetcher
|
||||
db *storage.DB
|
||||
analyzer *analyzer.Analyzer
|
||||
prosperMap map[string]float64 // domain → backlink score (loaded from info)
|
||||
stats Stats
|
||||
}
|
||||
|
||||
// New creates a Crawler.
|
||||
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
|
||||
return &Crawler{
|
||||
fetcher: NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second),
|
||||
db: db,
|
||||
analyzer: a,
|
||||
prosperMap: prosperMap,
|
||||
}
|
||||
}
|
||||
|
||||
// URLWeight pairs a discovered URL with the weight it was discovered with.
type URLWeight struct {
	// URL is the link target as returned by the page parser.
	URL string
	// Weight is the share of the discovering page's importance given to this
	// link; Run assigns 1/(number of links found on that page).
	Weight float64
}
|
||||
|
||||
// Run starts the BFS crawl from entryURL, running for maxEpoch rounds.
|
||||
// It blocks until completion.
|
||||
func (c *Crawler) Run(entryURL string, maxEpoch int) {
|
||||
visited := make(map[string]bool)
|
||||
queue := []string{entryURL}
|
||||
|
||||
for ep := 0; ep < maxEpoch; ep++ {
|
||||
log.Printf("[crawler] epoch %d/%d queue=%d", ep+1, maxEpoch, len(queue))
|
||||
for _, u := range queue {
|
||||
visited[u] = true
|
||||
}
|
||||
|
||||
var (
|
||||
newLinks []URLWeight
|
||||
mu sync.Mutex
|
||||
wg sync.WaitGroup
|
||||
)
|
||||
|
||||
sem := make(chan struct{}, config.CrawlerWorkers)
|
||||
for _, u := range queue {
|
||||
wg.Add(1)
|
||||
sem <- struct{}{}
|
||||
go func(rawURL string) {
|
||||
defer wg.Done()
|
||||
defer func() { <-sem }()
|
||||
hrefs := c.visitURL(rawURL)
|
||||
n := len(hrefs)
|
||||
if n > 0 {
|
||||
w := 1.0 / float64(n)
|
||||
mu.Lock()
|
||||
for _, h := range hrefs {
|
||||
if !visited[h] {
|
||||
newLinks = append(newLinks, URLWeight{URL: h, Weight: w})
|
||||
}
|
||||
}
|
||||
mu.Unlock()
|
||||
}
|
||||
}(u)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
if len(newLinks) == 0 {
|
||||
log.Println("[crawler] empty queue — stopping")
|
||||
return
|
||||
}
|
||||
|
||||
queue = c.schedule(newLinks)
|
||||
}
|
||||
}
|
||||
|
||||
// visitURL fetches a URL, stores keywords, updates site info, returns discovered hrefs.
|
||||
func (c *Crawler) visitURL(rawURL string) []string {
|
||||
atomic.AddInt64(&c.stats.VisitedURLs, 1)
|
||||
|
||||
res, err := c.fetcher.fetchWithHistory(rawURL, true, 10*time.Second, 0)
|
||||
if err != nil || res == nil {
|
||||
c.updateSiteFailure(rawURL)
|
||||
return nil
|
||||
}
|
||||
|
||||
atomic.AddInt64(&c.stats.SuccessURLs, 1)
|
||||
|
||||
title, desc, text, hrefs := parser.ParseHTML(res.Body, res.FinalURL)
|
||||
|
||||
// Cache snippet
|
||||
if len(res.FinalURL) < 250 {
|
||||
_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
|
||||
Title: title,
|
||||
Description: truncate(desc, 256),
|
||||
Text: truncate(text, 256),
|
||||
Timestamp: time.Now().Unix(),
|
||||
})
|
||||
}
|
||||
|
||||
// Keyword extraction → send to harvester
|
||||
kws := c.analyzer.Analyze(title, desc, text)
|
||||
if len(kws) > 0 {
|
||||
if len(kws) > config.MaxKeywordsPerPage {
|
||||
kws = kws[:config.MaxKeywordsPerPage]
|
||||
}
|
||||
atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
|
||||
go c.sendToHarvester(res.FinalURL, kws)
|
||||
}
|
||||
|
||||
// Update site info
|
||||
host := netloc(res.FinalURL)
|
||||
c.updateSiteSuccess(host, res, title, desc, text, hrefs)
|
||||
|
||||
// Handle permanent redirects in site info
|
||||
for from, to := range res.Redirects {
|
||||
fromHost := netloc(from)
|
||||
if fromHost == "" {
|
||||
continue
|
||||
}
|
||||
info, _ := c.db.GetSiteInfo(fromHost)
|
||||
if info.Redirects == nil {
|
||||
info.Redirects = make(map[string]string)
|
||||
}
|
||||
info.Redirects[from] = to
|
||||
if len(info.Redirects) > 50 {
|
||||
// keep most important (just truncate randomly for now)
|
||||
info.Redirects = truncateMap(info.Redirects, 40)
|
||||
}
|
||||
_ = c.db.SetSiteInfo(fromHost, info)
|
||||
}
|
||||
|
||||
// Trim hrefs
|
||||
if len(hrefs) > 100 {
|
||||
hrefs = sampleStrings(hrefs, 100)
|
||||
}
|
||||
return hrefs
|
||||
}
|
||||
|
||||
func (c *Crawler) updateSiteFailure(rawURL string) {
|
||||
host := netloc(rawURL)
|
||||
if host == "" {
|
||||
return
|
||||
}
|
||||
info, _ := c.db.GetSiteInfo(host)
|
||||
if info.SuccessRate == nil {
|
||||
zero := 0.0
|
||||
info.SuccessRate = &zero
|
||||
}
|
||||
*info.SuccessRate *= 0.99
|
||||
_ = c.db.SetSiteInfo(host, info)
|
||||
}
|
||||
|
||||
func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc, text string, hrefs []string) {
|
||||
info, _ := c.db.GetSiteInfo(host)
|
||||
|
||||
info.VisitCount++
|
||||
info.LastVisitTime = time.Now().Unix()
|
||||
|
||||
one := 1.0
|
||||
if info.SuccessRate == nil {
|
||||
info.SuccessRate = &one
|
||||
}
|
||||
*info.SuccessRate = *info.SuccessRate*0.99 + 0.01
|
||||
|
||||
if strings.HasPrefix(res.FinalURL, "https://") {
|
||||
t := true
|
||||
info.HTTPSAvailable = &t
|
||||
}
|
||||
|
||||
if res.ServerType != "" {
|
||||
found := false
|
||||
for _, s := range info.ServerTypes {
|
||||
if s == res.ServerType {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
info.ServerTypes = append(info.ServerTypes, res.ServerType)
|
||||
if len(info.ServerTypes) > 5 {
|
||||
info.ServerTypes = info.ServerTypes[len(info.ServerTypes)-5:]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Language detection — sample 10% or first 10 visits
|
||||
if info.VisitCount < 10 || rand.Float64() < 0.1 {
|
||||
lang := c.analyzer.DetectLanguage(title + " " + desc + " " + text)
|
||||
if lang != "" {
|
||||
if info.Languages == nil {
|
||||
info.Languages = make(map[string]float64)
|
||||
}
|
||||
intensity := math.Min(0.2, 1/math.Sqrt(float64(info.VisitCount+1)))
|
||||
for k := range info.Languages {
|
||||
info.Languages[k] *= (1 - intensity)
|
||||
}
|
||||
info.Languages[lang] += intensity
|
||||
}
|
||||
// Collect external links
|
||||
superHost := superNetloc(res.FinalURL)
|
||||
var external []string
|
||||
for _, h := range hrefs {
|
||||
if superNetloc(h) != superHost {
|
||||
external = append(external, h)
|
||||
}
|
||||
}
|
||||
sampled := sampleStrings(external, 10)
|
||||
info.OutLinks = append(info.OutLinks, sampled...)
|
||||
if len(info.OutLinks) > 250 {
|
||||
info.OutLinks = sampleStrings(info.OutLinks, 200)
|
||||
}
|
||||
}
|
||||
|
||||
_ = c.db.SetSiteInfo(host, info)
|
||||
}
|
||||
|
||||
// sendToHarvester POSTs keyword data to the harvester service.
|
||||
func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
|
||||
type payload struct {
|
||||
URL string `json:"url"`
|
||||
Keywords []analyzer.Keyword `json:"keywords"`
|
||||
}
|
||||
p := payload{URL: finalURL, Keywords: kws}
|
||||
data, err := json.Marshal(p)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
resp, err := http.Post(config.HarvesterAddr+"/l", "application/json", bytes.NewReader(data))
|
||||
if err != nil {
|
||||
log.Printf("[crawler] harvester post failed: %v", err)
|
||||
return
|
||||
}
|
||||
resp.Body.Close()
|
||||
}
|
||||
|
||||
// schedule selects and prioritises the next BFS queue from raw discovered links.
|
||||
func (c *Crawler) schedule(links []URLWeight) []string {
|
||||
if len(links) > 100000 {
|
||||
links = sampleURLWeights(links, 100000)
|
||||
}
|
||||
|
||||
// Pre-fetch site info for all involved domains
|
||||
domains := make(map[string]bool)
|
||||
for _, lw := range links {
|
||||
if h := netloc(lw.URL); h != "" {
|
||||
domains[h] = true
|
||||
}
|
||||
if h := superNetloc(lw.URL); h != "" {
|
||||
domains[h] = true
|
||||
}
|
||||
}
|
||||
siteCache := make(map[string]*storage.SiteInfo, len(domains))
|
||||
var mu sync.Mutex
|
||||
var wg sync.WaitGroup
|
||||
for d := range domains {
|
||||
wg.Add(1)
|
||||
go func(host string) {
|
||||
defer wg.Done()
|
||||
info, _ := c.db.GetSiteInfo(host)
|
||||
mu.Lock()
|
||||
siteCache[host] = info
|
||||
mu.Unlock()
|
||||
}(d)
|
||||
}
|
||||
wg.Wait()
|
||||
|
||||
// Score each URL
|
||||
scored_list := make([]scoredURL, len(links))
|
||||
for i, lw := range links {
|
||||
scored_list[i] = scoredURL{url: lw.URL, score: c.scoreURL(lw, siteCache)}
|
||||
}
|
||||
|
||||
// Weighted random sample (45000 or 1/3+250 whichever smaller)
|
||||
k := min(45000, len(scored_list)/3+250)
|
||||
selected := weightedSample(scored_list, k)
|
||||
|
||||
// Domain concentration filtering
|
||||
selected = concentrationFilter(selected, config.CrawlFocus)
|
||||
|
||||
// Separate https/http, cap http at 1/4 of https count
|
||||
var httpsURLs, httpURLs []string
|
||||
for _, s := range selected {
|
||||
if strings.HasPrefix(s, "https://") {
|
||||
httpsURLs = append(httpsURLs, s)
|
||||
} else {
|
||||
httpURLs = append(httpURLs, s)
|
||||
}
|
||||
}
|
||||
maxHTTP := len(httpsURLs) / 4
|
||||
if len(httpURLs) > maxHTTP {
|
||||
httpURLs = sampleStrings(httpURLs, maxHTTP)
|
||||
}
|
||||
|
||||
// Separate prosperous / non-prosperous
|
||||
var prosperURLs, otherURLs []string
|
||||
for _, u := range append(httpsURLs, httpURLs...) {
|
||||
if c.prosperMap[netloc(u)] > 0 {
|
||||
prosperURLs = append(prosperURLs, u)
|
||||
} else {
|
||||
otherURLs = append(otherURLs, u)
|
||||
}
|
||||
}
|
||||
n := int(float64(len(prosperURLs)) * (1-config.ExpectedProsperRatio) / config.ExpectedProsperRatio)
|
||||
if len(otherURLs) > n {
|
||||
keep := max(len(otherURLs)-len(selected)/10, n)
|
||||
if keep < len(otherURLs) {
|
||||
otherURLs = sampleStrings(otherURLs, keep)
|
||||
}
|
||||
}
|
||||
|
||||
result := append(prosperURLs, otherURLs...)
|
||||
rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] })
|
||||
return result
|
||||
}
|
||||
|
||||
// scoreURL computes the scheduling priority for a URL.
|
||||
func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo) float64 {
|
||||
host := netloc(lw.URL)
|
||||
super := superNetloc(lw.URL)
|
||||
|
||||
info := siteCache[host]
|
||||
if info == nil {
|
||||
info = &storage.SiteInfo{}
|
||||
}
|
||||
|
||||
// Chinese-ness
|
||||
var chineseness float64 = 0.5
|
||||
if len(info.Languages) > 0 {
|
||||
total := 0.0
|
||||
for _, v := range info.Languages {
|
||||
total += v
|
||||
}
|
||||
if total > 0 {
|
||||
chineseness = info.Languages["zh"] / total
|
||||
}
|
||||
}
|
||||
|
||||
// Interest decay based on visit count
|
||||
prosper := math.Min(62, c.prosperMap[host])
|
||||
limit := prosper*500 + 50
|
||||
b := math.Pow(0.1, 1/limit)
|
||||
interest := math.Pow(b, float64(info.VisitCount))
|
||||
|
||||
var interest2 float64 = 1.0
|
||||
if super != host {
|
||||
superInfo := siteCache[super]
|
||||
if superInfo != nil {
|
||||
limit2 := math.Min(62, c.prosperMap[super])*500 + 50
|
||||
b2 := math.Pow(0.1, 1/limit2)
|
||||
interest2 = math.Pow(b2, float64(superInfo.VisitCount))
|
||||
}
|
||||
}
|
||||
|
||||
quality := 1.0
|
||||
if info.Quality != nil {
|
||||
quality = *info.Quality
|
||||
}
|
||||
|
||||
prosperity := prosper
|
||||
if prosperity > 0 {
|
||||
prosperity += 0.5
|
||||
}
|
||||
prosperity = math.Log2(2+prosperity) + 1
|
||||
|
||||
bad := badURL(lw.URL)
|
||||
return (0.1 + chineseness) * math.Min(0.05+interest, 0.05+interest2) * quality * (1 - bad) * lw.Weight * prosperity
|
||||
}
|
||||
|
||||
// ---- helper functions ----
|
||||
|
||||
// netloc returns the host[:port] part of rawURL, or "" when it has none or
// cannot be parsed.
//
// Plain "http://host/..." and "https://host/..." URLs take a fast path that
// avoids a full parse. The fast path is skipped when the text before the first
// path slash contains '?', '#' or '@', because in that case it is not just the
// host (e.g. "https://example.com?x=1" or "http://user@example.com/") and
// url.Parse is needed to isolate it.
func netloc(rawURL string) string {
	parts := strings.SplitN(rawURL, "/", 4)
	if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" &&
		!strings.ContainsAny(parts[2], "?#@") {
		return parts[2]
	}
	u, err := url.Parse(rawURL)
	if err != nil {
		return ""
	}
	return u.Host
}
|
||||
|
||||
// superNetloc returns "domain.tld" (strips subdomains).
|
||||
func superNetloc(rawURL string) string {
|
||||
host := netloc(rawURL)
|
||||
parts := strings.Split(host, ".")
|
||||
if len(parts) >= 2 {
|
||||
return strings.Join(parts[len(parts)-2:], ".")
|
||||
}
|
||||
return host
|
||||
}
|
||||
|
||||
// badURL returns a penalty in [0, 0.9] describing how unattractive u looks as a
// crawl target; scoreURL multiplies a link's score by 1 minus this value.
//
// The penalty starts at (len(u)-30)/200, floored at 0, and each of the
// following moves it a fraction of the remaining distance towards 1:
//   - u contains ".htm" or ".php": 30%
//   - u, ignoring trailing slashes, contains more than two '/': 10%
//   - u is shorter than 5 bytes or its fifth byte is ':' (as in "http:"): 30%
func badURL(u string) float64 {
	penalty := math.Max(0, float64(len(u)-30)/200.0)
	worsen := func(fraction float64) {
		penalty += (1 - penalty) * fraction
	}

	looksDynamic := strings.Contains(u, ".htm") || strings.Contains(u, ".php")
	if looksDynamic {
		worsen(0.3)
	}

	depth := strings.Count(strings.TrimRight(u, "/"), "/")
	if depth > 2 {
		worsen(0.1)
	}

	if len(u) < 5 || u[4] == ':' {
		worsen(0.3)
	}

	return math.Min(penalty, 0.9)
}
|
||||
|
||||
// truncate returns s shortened to at most n bytes without splitting a UTF-8
// encoded character: when byte n falls inside a multi-byte character the cut
// moves back to the start of that character, so the result can be up to three
// bytes shorter than n. A non-positive n yields "".
func truncate(s string, n int) string {
	if n <= 0 {
		return ""
	}
	if len(s) <= n {
		return s
	}
	// UTF-8 continuation bytes have the form 10xxxxxx. A character has at most
	// three of them, so stop after three steps even on malformed input.
	cut := n
	for cut > 0 && n-cut < 3 && s[cut]&0xC0 == 0x80 {
		cut--
	}
	return s[:cut]
}
|
||||
|
||||
// sampleStrings returns n elements of s chosen uniformly at random without
// replacement, in random order. When s has n elements or fewer, s itself is
// returned (not a copy).
func sampleStrings(s []string, n int) []string {
	if n >= len(s) {
		return s
	}
	order := rand.Perm(len(s))
	picked := make([]string, n)
	for dst, src := range order[:n] {
		picked[dst] = s[src]
	}
	return picked
}
|
||||
|
||||
func sampleURLWeights(s []URLWeight, n int) []URLWeight {
|
||||
if len(s) <= n {
|
||||
return s
|
||||
}
|
||||
perm := rand.Perm(len(s))
|
||||
out := make([]URLWeight, n)
|
||||
for i := range out {
|
||||
out[i] = s[perm[i]]
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// scoredURL is a candidate link together with the scheduling score that
// weightedSample uses as its sampling weight.
type scoredURL struct {
	url   string  // candidate link
	score float64 // sampling weight; larger means more likely to be picked
}
|
||||
|
||||
func weightedSample(items []scoredURL, k int) []string {
|
||||
if k >= len(items) {
|
||||
out := make([]string, len(items))
|
||||
for i, s := range items {
|
||||
out[i] = s.url
|
||||
}
|
||||
return out
|
||||
}
|
||||
// Simple weighted sampling without replacement using alias method approximation
|
||||
totalWeight := 0.0
|
||||
for _, s := range items {
|
||||
totalWeight += s.score
|
||||
}
|
||||
selected := make(map[int]bool)
|
||||
out := make([]string, 0, k)
|
||||
for len(out) < k && len(selected) < len(items) {
|
||||
r := rand.Float64() * totalWeight
|
||||
cum := 0.0
|
||||
for i, s := range items {
|
||||
if selected[i] {
|
||||
continue
|
||||
}
|
||||
cum += s.score
|
||||
if cum >= r {
|
||||
selected[i] = true
|
||||
out = append(out, s.url)
|
||||
totalWeight -= s.score
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func concentrationFilter(urls []string, k float64) []string {
|
||||
domainGroups := make(map[string][]string)
|
||||
shuffled := make([]string, len(urls))
|
||||
copy(shuffled, urls)
|
||||
rand.Shuffle(len(shuffled), func(i, j int) { shuffled[i], shuffled[j] = shuffled[j], shuffled[i] })
|
||||
|
||||
for _, u := range shuffled {
|
||||
d := superNetloc(u)
|
||||
domainGroups[d] = append(domainGroups[d], u)
|
||||
}
|
||||
|
||||
limit := 10
|
||||
if len(domainGroups) > 1 {
|
||||
sizes := make([]int, 0, len(domainGroups))
|
||||
for _, g := range domainGroups {
|
||||
sizes = append(sizes, int(math.Pow(float64(len(g)), k)))
|
||||
}
|
||||
// sort sizes ascending, drop last (largest)
|
||||
for i := 0; i < len(sizes)-1; i++ {
|
||||
for j := i + 1; j < len(sizes)-1; j++ {
|
||||
if sizes[j] < sizes[i] {
|
||||
sizes[i], sizes[j] = sizes[j], sizes[i]
|
||||
}
|
||||
}
|
||||
}
|
||||
total := 0
|
||||
for _, s := range sizes[:len(sizes)-1] {
|
||||
total += s
|
||||
}
|
||||
limit = max(10, int(float64(total)*0.6))
|
||||
}
|
||||
|
||||
var result []string
|
||||
for _, g := range domainGroups {
|
||||
sn := 1 + min(limit, int(math.Pow(float64(len(g)), k)))
|
||||
if sn > len(g) {
|
||||
sn = len(g)
|
||||
}
|
||||
result = append(result, g[:sn]...)
|
||||
}
|
||||
rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] })
|
||||
return result
|
||||
}
|
||||
|
||||
// truncateMap returns a map holding n entries of m. Which entries survive is
// determined by Go's map iteration order, i.e. it is arbitrary. When m has n
// entries or fewer, m itself is returned (not a copy).
func truncateMap(m map[string]string, n int) map[string]string {
	if n >= len(m) {
		return m
	}
	kept := make(map[string]string, n)
	for key, val := range m {
		if len(kept) >= n {
			break
		}
		kept[key] = val
	}
	return kept
}
|
||||
|
||||
// min returns the smaller of a and b.
func min(a, b int) int {
	if b <= a {
		return b
	}
	return a
}
|
||||
|
||||
// max returns the larger of a and b.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}
|
||||
|
||||
// Expose Stats for monitoring.
|
||||
func (c *Crawler) GetStats() Stats {
|
||||
return Stats{
|
||||
VisitedURLs: atomic.LoadInt64(&c.stats.VisitedURLs),
|
||||
SuccessURLs: atomic.LoadInt64(&c.stats.SuccessURLs),
|
||||
KeywordsFetched: atomic.LoadInt64(&c.stats.KeywordsFetched),
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user