// sese-engine-go/crawler/crawler.go

// Package crawler implements the BFS crawl loop, URL scheduling, and site-info updating.
package crawler

import (
	"bytes"
	"encoding/json"
	"log"
	"math"
	"math/rand"
	"net/http"
	"net/url"
	"sort"
	"strings"
	"sync"
	"sync/atomic"
	"time"

	"sese-engine/analyzer"
	"sese-engine/config"
	"sese-engine/parser"
	"sese-engine/storage"
)


// Stats holds real-time crawl counters. While a crawl is running the fields
// are updated with atomic.AddInt64, so they should be read with
// atomic.LoadInt64 (as GetStats does) rather than accessed directly.
type Stats struct {
	VisitedURLs     int64 // fetch attempts, successful or not
	SuccessURLs     int64 // fetches that produced a response
	KeywordsFetched int64 // keywords extracted and handed to the harvester
}

// Crawler orchestrates the BFS crawl. It is built by New, which creates the
// fetcher and stores the caller-supplied storage handle, analyzer and
// prosperity table.
type Crawler struct {
	fetcher    *Fetcher           // page fetcher, created in New from config.SpiderName and config.CrawlerCooldown
	db         *storage.DB        // store used for snippets and per-host site info
	analyzer   *analyzer.Analyzer // keyword extraction and language detection
	prosperMap map[string]float64 // domain → backlink score (loaded from info)
	stats      Stats              // live counters, updated atomically; read through GetStats
}

// New builds a Crawler that stores site data in db, analyses pages with a and
// uses prosperMap (domain → backlink score) when scheduling. The fetcher is
// configured from config.SpiderName and config.CrawlerCooldown (in seconds).
func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler {
	c := new(Crawler)
	c.fetcher = NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second)
	c.db = db
	c.analyzer = a
	c.prosperMap = prosperMap
	return c
}

// URLWeight pairs a URL with its discovery weight.
// Run gives every link returned for a page the weight 1/n, where n is the
// number of links returned for that page; scoreURL multiplies the weight into
// the scheduling score.
type URLWeight struct {
	URL    string  // discovered link
	Weight float64 // discovery weight (1/n of the page it was found on)
}

// Run starts the BFS crawl from entryURL, running for at most maxEpoch rounds.
// It blocks until completion.
//
// Every round fetches the queued URLs with at most config.CrawlerWorkers
// concurrent workers, collects the not-yet-visited links found on those pages
// and lets schedule choose the next round's queue. The crawl ends early when
// a round discovers no new links.
func (c *Crawler) Run(entryURL string, maxEpoch int) {
	visited := make(map[string]bool)
	queue := []string{entryURL}

	for ep := 0; ep < maxEpoch; ep++ {
		// Mark this round's URLs as visited and drop repeats: the same link
		// can be discovered on several pages in one round, and fetching it
		// more than once would only waste requests.
		unique := queue[:0]
		for _, u := range queue {
			if visited[u] {
				continue
			}
			visited[u] = true
			unique = append(unique, u)
		}
		queue = unique
		log.Printf("[crawler] epoch %d/%d  queue=%d", ep+1, maxEpoch, len(queue))

		var (
			newLinks []URLWeight
			mu       sync.Mutex
			wg       sync.WaitGroup
		)

		// sem bounds the number of in-flight fetches.
		sem := make(chan struct{}, config.CrawlerWorkers)
		for _, u := range queue {
			wg.Add(1)
			sem <- struct{}{}
			go func(rawURL string) {
				defer wg.Done()
				defer func() { <-sem }()

				hrefs := c.visitURL(rawURL)
				if len(hrefs) == 0 {
					return
				}
				// A page's weight is split evenly between its links.
				w := 1.0 / float64(len(hrefs))

				// visited is only written between rounds, so reading it
				// here is safe; mu protects newLinks.
				mu.Lock()
				defer mu.Unlock()
				for _, h := range hrefs {
					if !visited[h] {
						newLinks = append(newLinks, URLWeight{URL: h, Weight: w})
					}
				}
			}(u)
		}
		wg.Wait()

		if len(newLinks) == 0 {
			log.Println("[crawler] empty queue — stopping")
			return
		}
		if ep+1 == maxEpoch {
			// No further round will run, so there is nothing to schedule.
			return
		}

		queue = c.schedule(newLinks)
	}
}

// visitURL fetches a URL, stores keywords, updates site info, returns discovered hrefs.
//
// On a failed fetch it records the failure for the URL's host and returns nil.
// On success it caches a snippet, sends the extracted keywords to the
// harvester in the background, updates the site record of the final host and
// of every redirecting host, and returns at most 100 of the page's links.
func (c *Crawler) visitURL(rawURL string) []string {
	atomic.AddInt64(&c.stats.VisitedURLs, 1)

	res, err := c.fetcher.fetchWithHistory(rawURL, true, 10*time.Second, 0)
	if err != nil || res == nil {
		c.updateSiteFailure(rawURL)
		return nil
	}

	atomic.AddInt64(&c.stats.SuccessURLs, 1)

	title, desc, text, hrefs := parser.ParseHTML(res.Body, res.FinalURL)

	// Cache a snippet; URLs of 250 bytes or more are skipped.
	if len(res.FinalURL) < 250 {
		// Best-effort: a failed snippet write must not abort the visit.
		_ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{
			Title:       title,
			Description: truncate(desc, 256),
			Text:        truncate(text, 256),
			Timestamp:   time.Now().Unix(),
		})
	}

	// Keyword extraction → send to harvester
	kws := c.analyzer.Analyze(title, desc, text)
	if len(kws) > 0 {
		if len(kws) > config.MaxKeywordsPerPage {
			kws = kws[:config.MaxKeywordsPerPage]
		}
		atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws)))
		// Fire-and-forget so a slow harvester does not stall the worker.
		go c.sendToHarvester(res.FinalURL, kws)
	}

	// Update site info. Like updateSiteFailure, skip URLs without a host so
	// no record is stored under the empty key.
	if host := netloc(res.FinalURL); host != "" {
		c.updateSiteSuccess(host, res, title, desc, text, hrefs)
	}

	// Record the redirects reported by the fetcher on the redirecting host.
	for from, to := range res.Redirects {
		fromHost := netloc(from)
		if fromHost == "" {
			continue
		}
		info, _ := c.db.GetSiteInfo(fromHost)
		if info == nil {
			// The lookup error is discarded above; without a record there
			// is nothing safe to update, so skip instead of dereferencing nil.
			continue
		}
		if info.Redirects == nil {
			info.Redirects = make(map[string]string)
		}
		info.Redirects[from] = to
		if len(info.Redirects) > 50 {
			// keep most important (just truncate randomly for now)
			info.Redirects = truncateMap(info.Redirects, 40)
		}
		// Best-effort: losing one redirect entry is acceptable.
		_ = c.db.SetSiteInfo(fromHost, info)
	}

	// Trim hrefs
	if len(hrefs) > 100 {
		hrefs = sampleStrings(hrefs, 100)
	}
	return hrefs
}

// updateSiteFailure records a failed fetch of rawURL by decaying the success
// rate of its host by 1%. A host with no recorded rate starts from 0. URLs
// without a host are ignored.
func (c *Crawler) updateSiteFailure(rawURL string) {
	host := netloc(rawURL)
	if host == "" {
		return
	}
	info, _ := c.db.GetSiteInfo(host)
	if info == nil {
		// The lookup error is discarded above; without a record there is
		// nothing to decay, so bail out instead of dereferencing nil.
		return
	}
	if info.SuccessRate == nil {
		zero := 0.0
		info.SuccessRate = &zero
	}
	*info.SuccessRate *= 0.99
	// Best-effort: a lost failure sample is acceptable.
	_ = c.db.SetSiteInfo(host, info)
}

// updateSiteSuccess folds a successful fetch into the site record of host:
// visit count and time, an exponentially smoothed success rate, HTTPS
// availability, the last few distinct server types and, for a sample of
// visits, the detected language mix and a sample of external out-links.
func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc, text string, hrefs []string) {
	info, _ := c.db.GetSiteInfo(host)
	if info == nil {
		// The lookup error is discarded above; without a record there is
		// nothing to update, so bail out instead of dereferencing nil.
		return
	}

	info.VisitCount++
	info.LastVisitTime = time.Now().Unix()

	// Success rate is a moving average with a 1% step; unknown hosts start at 1.
	if info.SuccessRate == nil {
		one := 1.0
		info.SuccessRate = &one
	}
	*info.SuccessRate = *info.SuccessRate*0.99 + 0.01

	if strings.HasPrefix(res.FinalURL, "https://") {
		t := true
		info.HTTPSAvailable = &t
	}

	// Remember up to the five most recently seen distinct server types.
	if res.ServerType != "" {
		found := false
		for _, s := range info.ServerTypes {
			if s == res.ServerType {
				found = true
				break
			}
		}
		if !found {
			info.ServerTypes = append(info.ServerTypes, res.ServerType)
			if len(info.ServerTypes) > 5 {
				info.ServerTypes = info.ServerTypes[len(info.ServerTypes)-5:]
			}
		}
	}

	// Language detection — sample 10% or first 10 visits
	if info.VisitCount < 10 || rand.Float64() < 0.1 {
		lang := c.analyzer.DetectLanguage(title + " " + desc + " " + text)
		if lang != "" {
			if info.Languages == nil {
				info.Languages = make(map[string]float64)
			}
			// Blend the new observation in: existing shares shrink by
			// intensity, the detected language gains it. The step is at
			// most 0.2 and gets smaller as the visit count grows.
			intensity := math.Min(0.2, 1/math.Sqrt(float64(info.VisitCount+1)))
			for k := range info.Languages {
				info.Languages[k] *= (1 - intensity)
			}
			info.Languages[lang] += intensity
		}
		// Collect external links: up to 10 per sampled visit, with the
		// stored list cut back to 200 once it exceeds 250.
		superHost := superNetloc(res.FinalURL)
		var external []string
		for _, h := range hrefs {
			if superNetloc(h) != superHost {
				external = append(external, h)
			}
		}
		sampled := sampleStrings(external, 10)
		info.OutLinks = append(info.OutLinks, sampled...)
		if len(info.OutLinks) > 250 {
			info.OutLinks = sampleStrings(info.OutLinks, 200)
		}
	}

	// Best-effort: a lost update only costs one sample.
	_ = c.db.SetSiteInfo(host, info)
}

// harvesterClient is the HTTP client used for harvester uploads. It carries a
// timeout so that an unresponsive harvester cannot pin the fire-and-forget
// upload goroutines forever (http.Post uses a client without any timeout).
var harvesterClient = &http.Client{Timeout: 10 * time.Second}

// sendToHarvester POSTs keyword data to the harvester service.
//
// The payload is the JSON object {"url": finalURL, "keywords": kws} sent to
// config.HarvesterAddr + "/l". Failures are logged and otherwise ignored;
// the caller runs this in its own goroutine and does not wait for it.
func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
	type payload struct {
		URL      string             `json:"url"`
		Keywords []analyzer.Keyword `json:"keywords"`
	}
	data, err := json.Marshal(payload{URL: finalURL, Keywords: kws})
	if err != nil {
		log.Printf("[crawler] harvester payload encode failed: %v", err)
		return
	}
	resp, err := harvesterClient.Post(config.HarvesterAddr+"/l", "application/json", bytes.NewReader(data))
	if err != nil {
		log.Printf("[crawler] harvester post failed: %v", err)
		return
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		log.Printf("[crawler] harvester post rejected: %s", resp.Status)
	}
}

// schedule selects and prioritises the next BFS queue from raw discovered links.
//
// Steps:
//  1. cap the input at 100000 links (uniform sample);
//  2. load the site info of every host and parent domain involved;
//  3. score each link and draw a score-weighted sample;
//  4. limit how many links a single domain may contribute;
//  5. keep at most one http link per four https links;
//  6. thin out links to non-prosperous hosts towards
//     config.ExpectedProsperRatio, never dropping more than a tenth of the
//     selection;
//  7. shuffle.
func (c *Crawler) schedule(links []URLWeight) []string {
	if len(links) > 100000 {
		links = sampleURLWeights(links, 100000)
	}

	// Pre-fetch site info for all involved domains
	domains := make(map[string]bool)
	for _, lw := range links {
		if h := netloc(lw.URL); h != "" {
			domains[h] = true
		}
		if h := superNetloc(lw.URL); h != "" {
			domains[h] = true
		}
	}
	siteCache := make(map[string]*storage.SiteInfo, len(domains))
	var (
		mu sync.Mutex
		wg sync.WaitGroup
	)
	// Bound the lookups the same way Run bounds fetches; one goroutine per
	// domain could mean hundreds of thousands of goroutines at once.
	sem := make(chan struct{}, config.CrawlerWorkers)
	for d := range domains {
		wg.Add(1)
		sem <- struct{}{}
		go func(host string) {
			defer wg.Done()
			defer func() { <-sem }()
			// A failed lookup leaves whatever GetSiteInfo returned in the
			// cache; scoreURL tolerates nil entries.
			info, _ := c.db.GetSiteInfo(host)
			mu.Lock()
			siteCache[host] = info
			mu.Unlock()
		}(d)
	}
	wg.Wait()

	// Score each URL
	scored := make([]scoredURL, len(links))
	for i, lw := range links {
		scored[i] = scoredURL{url: lw.URL, score: c.scoreURL(lw, siteCache)}
	}

	// Weighted random sample (45000 or 1/3+250 whichever smaller)
	k := min(45000, len(scored)/3+250)
	selected := weightedSample(scored, k)

	// Domain concentration filtering
	selected = concentrationFilter(selected, config.CrawlFocus)

	// Separate https/http, cap http at 1/4 of https count
	var httpsURLs, httpURLs []string
	for _, s := range selected {
		if strings.HasPrefix(s, "https://") {
			httpsURLs = append(httpsURLs, s)
		} else {
			httpURLs = append(httpURLs, s)
		}
	}
	maxHTTP := len(httpsURLs) / 4
	if len(httpURLs) > maxHTTP {
		httpURLs = sampleStrings(httpURLs, maxHTTP)
	}

	// Separate prosperous / non-prosperous. The candidates are copied into a
	// fresh slice so that appending never writes into httpsURLs' spare
	// capacity.
	candidates := make([]string, 0, len(httpsURLs)+len(httpURLs))
	candidates = append(candidates, httpsURLs...)
	candidates = append(candidates, httpURLs...)
	var prosperURLs, otherURLs []string
	for _, u := range candidates {
		if c.prosperMap[netloc(u)] > 0 {
			prosperURLs = append(prosperURLs, u)
		} else {
			otherURLs = append(otherURLs, u)
		}
	}
	// n is the number of non-prosperous links that would give exactly the
	// expected prosperous ratio.
	n := int(float64(len(prosperURLs)) * (1 - config.ExpectedProsperRatio) / config.ExpectedProsperRatio)
	if len(otherURLs) > n {
		// Move towards n, but drop at most len(selected)/10 links per round.
		keep := max(len(otherURLs)-len(selected)/10, n)
		if keep < len(otherURLs) {
			otherURLs = sampleStrings(otherURLs, keep)
		}
	}

	result := make([]string, 0, len(prosperURLs)+len(otherURLs))
	result = append(result, prosperURLs...)
	result = append(result, otherURLs...)
	rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] })
	return result
}

// visitDecay returns the remaining interest in a site after the given number
// of visits. Interest starts at 1 and falls to 0.1 once visits reaches
// prosper*500 + 50, so sites with a higher prosperity decay more slowly.
func visitDecay(prosper, visits float64) float64 {
	limit := prosper*500 + 50
	base := math.Pow(0.1, 1/limit)
	return math.Pow(base, visits)
}

// scoreURL computes the scheduling priority for a URL.
//
// The score is the product of: the share of "zh" in the host's language mix
// (0.5 when unknown, offset by 0.1), the smaller of the host's and the parent
// domain's visit-decayed interest (offset by 0.05), the host's stored quality
// (1 when unset), the complement of the badURL penalty, the link's discovery
// weight, and a logarithmic prosperity bonus.
func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo) float64 {
	host, parent := netloc(lw.URL), superNetloc(lw.URL)

	site := siteCache[host]
	if site == nil {
		// Unknown host: score it with an empty record.
		site = &storage.SiteInfo{}
	}

	// Share of Chinese in the recorded language mix.
	chineseness := 0.5
	if len(site.Languages) > 0 {
		var sum float64
		for _, share := range site.Languages {
			sum += share
		}
		if sum > 0 {
			chineseness = site.Languages["zh"] / sum
		}
	}

	// Interest fades with the number of visits, for the host itself ...
	prosper := math.Min(62, c.prosperMap[host])
	hostInterest := visitDecay(prosper, float64(site.VisitCount))

	// ... and for its parent domain when that is a different, known site.
	parentInterest := 1.0
	if parent != host {
		if parentSite := siteCache[parent]; parentSite != nil {
			parentProsper := math.Min(62, c.prosperMap[parent])
			parentInterest = visitDecay(parentProsper, float64(parentSite.VisitCount))
		}
	}

	quality := 1.0
	if site.Quality != nil {
		quality = *site.Quality
	}

	// Hosts with any prosperity get a half-point head start before the log.
	prosperity := prosper
	if prosperity > 0 {
		prosperity += 0.5
	}
	prosperity = math.Log2(2+prosperity) + 1

	bad := badURL(lw.URL)
	return (0.1 + chineseness) * math.Min(0.05+hostInterest, 0.05+parentInterest) * quality * (1 - bad) * lw.Weight * prosperity
}

// ---- helper functions ----

// netloc returns the network-location part (host, plus port and user info
// when present) of rawURL, or "" when none can be determined.
//
// URLs starting with a lowercase "http://" or "https://" take a fast path
// that cuts the authority out textually; it ends at the first '/', '?' or
// '#', so "https://a.com?x=1" yields "a.com". Everything else is handed to
// url.Parse and its Host field is returned.
func netloc(rawURL string) string {
	for _, prefix := range [...]string{"http://", "https://"} {
		if !strings.HasPrefix(rawURL, prefix) {
			continue
		}
		authority := rawURL[len(prefix):]
		if end := strings.IndexAny(authority, "/?#"); end >= 0 {
			authority = authority[:end]
		}
		return authority
	}
	u, err := url.Parse(rawURL)
	if err != nil {
		return ""
	}
	return u.Host
}

// superNetloc returns "domain.tld" (strips subdomains): the last two
// dot-separated labels of the URL's netloc, or the netloc itself when it has
// fewer than two labels.
// NOTE(review): this is purely textual — multi-label suffixes such as
// "com.cn", IP addresses and ports are not treated specially; confirm that is
// acceptable for the callers that group by this value.
func superNetloc(rawURL string) string {
	host := netloc(rawURL)
	labels := strings.Split(host, ".")
	if n := len(labels); n >= 2 {
		return labels[n-2] + "." + labels[n-1]
	}
	return host
}

// badURL rates how unattractive a URL looks, from 0 (fine) to 0.9 (worst).
// The base penalty grows with every byte beyond 30 (1/200 each); it is then
// pushed towards 1 by 30% for ".htm"/".php" URLs, by 10% for URLs with more
// than two slashes (trailing slashes ignored) and by 30% for URLs that are
// shorter than five bytes or have ':' as their fifth byte (e.g. "http:").
func badURL(u string) float64 {
	penalty := math.Max(0, float64(len(u)-30)/200.0)

	// raise moves penalty the given fraction of the way towards 1.
	raise := func(fraction float64) {
		penalty += (1 - penalty) * fraction
	}

	looksDynamic := strings.Contains(u, ".htm") || strings.Contains(u, ".php")
	if looksDynamic {
		raise(0.3)
	}

	slashes := strings.Count(strings.TrimRight(u, "/"), "/")
	if slashes > 2 {
		raise(0.1)
	}

	if len(u) < 5 || u[4] == ':' {
		raise(0.3)
	}

	return math.Min(penalty, 0.9)
}

// truncate returns s shortened to at most n bytes. The cut is moved back to
// the previous UTF-8 rune boundary when byte n falls inside a multi-byte
// character, so valid input never yields a broken trailing character.
// A non-positive n yields "".
func truncate(s string, n int) string {
	if n <= 0 {
		return ""
	}
	if len(s) <= n {
		return s
	}
	// Bytes of the form 10xxxxxx are UTF-8 continuation bytes; step back
	// until the cut sits on the first byte of a character.
	for n > 0 && s[n]&0xC0 == 0x80 {
		n--
	}
	return s[:n]
}

// sampleStrings returns n elements of s chosen uniformly at random without
// replacement. When s has n or fewer elements, s itself is returned
// unchanged (not a copy).
func sampleStrings(s []string, n int) []string {
	total := len(s)
	if n >= total {
		return s
	}
	order := rand.Perm(total)
	picked := make([]string, 0, n)
	for _, idx := range order[:n] {
		picked = append(picked, s[idx])
	}
	return picked
}

// sampleURLWeights returns n elements of s chosen uniformly at random without
// replacement. When s has n or fewer elements, s itself is returned
// unchanged (not a copy).
func sampleURLWeights(s []URLWeight, n int) []URLWeight {
	if n >= len(s) {
		return s
	}
	order := rand.Perm(len(s))
	chosen := make([]URLWeight, n)
	for slot := 0; slot < n; slot++ {
		chosen[slot] = s[order[slot]]
	}
	return chosen
}

// scoredURL is a candidate URL together with its scheduling score.
type scoredURL struct {
	url   string
	score float64
}

// weightedSample draws k URLs from items without replacement, each draw
// picking an item with probability proportional to its score.
//
// When k >= len(items) every URL is returned in input order; a non-positive
// k yields an empty slice. Items whose score is zero, negative or NaN are
// only picked after all positively scored items, in input order.
//
// Each item gets the key Exp(1)/score and the k smallest keys win, which is
// equivalent to repeated proportional draws (Efraimidis–Spirakis). This runs
// in O(n log n) and, unlike a running-total scan, cannot stall on
// floating-point residue or NaN scores.
func weightedSample(items []scoredURL, k int) []string {
	if k >= len(items) {
		out := make([]string, len(items))
		for i, s := range items {
			out[i] = s.url
		}
		return out
	}
	if k <= 0 {
		return []string{}
	}

	type keyed struct {
		key float64
		idx int
	}
	keys := make([]keyed, len(items))
	for i, s := range items {
		key := math.Inf(1)
		if s.score > 0 { // false for zero, negative and NaN scores
			key = rand.ExpFloat64() / s.score
		}
		keys[i] = keyed{key: key, idx: i}
	}
	// Stable, so that unweighted items keep their input order among themselves.
	sort.SliceStable(keys, func(a, b int) bool { return keys[a].key < keys[b].key })

	out := make([]string, k)
	for i := range out {
		out[i] = items[keys[i].idx].url
	}
	return out
}

// concentrationFilter limits how many URLs a single parent domain (as given
// by superNetloc) may contribute, and returns the survivors in random order.
//
// A domain with g URLs has the quota q = int(g^k). It keeps
// 1 + min(limit, q) URLs (never more than it has), chosen at random. With a
// single domain limit is 10; otherwise it is 60% of the summed quotas of all
// domains except the largest one, and at least 10.
func concentrationFilter(urls []string, k float64) []string {
	// Shuffle a copy so that the URLs kept per domain are a random subset and
	// the caller's slice stays untouched.
	shuffled := make([]string, len(urls))
	copy(shuffled, urls)
	rand.Shuffle(len(shuffled), func(i, j int) { shuffled[i], shuffled[j] = shuffled[j], shuffled[i] })

	domainGroups := make(map[string][]string)
	for _, u := range shuffled {
		d := superNetloc(u)
		domainGroups[d] = append(domainGroups[d], u)
	}

	quota := func(groupSize int) int {
		return int(math.Pow(float64(groupSize), k))
	}

	limit := 10
	if len(domainGroups) > 1 {
		// Sum of all quotas except the largest, computed as sum minus max
		// in one pass instead of sorting.
		total, largest := 0, 0
		for _, g := range domainGroups {
			q := quota(len(g))
			total += q
			if q > largest {
				largest = q
			}
		}
		limit = max(10, int(float64(total-largest)*0.6))
	}

	var result []string
	for _, g := range domainGroups {
		sn := 1 + min(limit, quota(len(g)))
		if sn > len(g) {
			sn = len(g)
		}
		result = append(result, g[:sn]...)
	}
	rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] })
	return result
}

// truncateMap returns a map with at most n entries of m. When m already has
// n or fewer entries, m itself is returned; otherwise a new map is filled
// with whichever n entries map iteration yields first (an arbitrary subset).
func truncateMap(m map[string]string, n int) map[string]string {
	if n >= len(m) {
		return m
	}
	kept := make(map[string]string, n)
	for key, val := range m {
		if len(kept) >= n {
			break
		}
		kept[key] = val
	}
	return kept
}

// min returns the smaller of two ints.
func min(a, b int) int {
	if b < a {
		return b
	}
	return a
}

// max returns the larger of two ints.
func max(a, b int) int {
	if b > a {
		return b
	}
	return a
}

// GetStats returns a copy of the crawl counters for monitoring. Each counter
// is loaded atomically on its own, so the three values are not guaranteed to
// belong to the same instant while a crawl is running.
func (c *Crawler) GetStats() Stats {
	var snap Stats
	snap.VisitedURLs = atomic.LoadInt64(&c.stats.VisitedURLs)
	snap.SuccessURLs = atomic.LoadInt64(&c.stats.SuccessURLs)
	snap.KeywordsFetched = atomic.LoadInt64(&c.stats.KeywordsFetched)
	return snap
}