// crawler.go — BFS crawl loop, URL scheduling, and site-info updating. package crawler import ( "bytes" "encoding/json" "log" "math" "math/rand" "net/http" "net/url" "strings" "sync" "sync/atomic" "time" "sese-engine/analyzer" "sese-engine/config" "sese-engine/parser" "sese-engine/storage" ) // Stats holds real-time crawl counters (read with atomic). type Stats struct { VisitedURLs int64 SuccessURLs int64 KeywordsFetched int64 } // Crawler orchestrates the BFS crawl. type Crawler struct { fetcher *Fetcher db *storage.DB analyzer *analyzer.Analyzer prosperMap map[string]float64 // domain → backlink score (loaded from info) stats Stats } // New creates a Crawler. func New(db *storage.DB, a *analyzer.Analyzer, prosperMap map[string]float64) *Crawler { return &Crawler{ fetcher: NewFetcher(config.SpiderName, config.CrawlerCooldown*time.Second), db: db, analyzer: a, prosperMap: prosperMap, } } // URLWeight pairs a URL with its discovery weight. type URLWeight struct { URL string Weight float64 } // Run starts the BFS crawl from entryURL, running for maxEpoch rounds. // It blocks until completion. func (c *Crawler) Run(entryURL string, maxEpoch int) { visited := make(map[string]bool) queue := []string{entryURL} for ep := 0; ep < maxEpoch; ep++ { log.Printf("[crawler] epoch %d/%d queue=%d", ep+1, maxEpoch, len(queue)) for _, u := range queue { visited[u] = true } var ( newLinks []URLWeight mu sync.Mutex wg sync.WaitGroup ) sem := make(chan struct{}, config.CrawlerWorkers) for _, u := range queue { wg.Add(1) sem <- struct{}{} go func(rawURL string) { defer wg.Done() defer func() { <-sem }() hrefs := c.visitURL(rawURL) n := len(hrefs) if n > 0 { w := 1.0 / float64(n) mu.Lock() for _, h := range hrefs { if !visited[h] { newLinks = append(newLinks, URLWeight{URL: h, Weight: w}) } } mu.Unlock() } }(u) } wg.Wait() if len(newLinks) == 0 { log.Println("[crawler] empty queue — stopping") return } queue = c.schedule(newLinks) } } // visitURL fetches a URL, stores keywords, updates site info, returns discovered hrefs. func (c *Crawler) visitURL(rawURL string) []string { atomic.AddInt64(&c.stats.VisitedURLs, 1) res, err := c.fetcher.fetchWithHistory(rawURL, true, 10*time.Second, 0) if err != nil || res == nil { c.updateSiteFailure(rawURL) return nil } atomic.AddInt64(&c.stats.SuccessURLs, 1) title, desc, text, hrefs := parser.ParseHTML(res.Body, res.FinalURL) // Cache snippet if len(res.FinalURL) < 250 { _ = c.db.SetSnippet(res.FinalURL, &storage.SnippetEntry{ Title: title, Description: truncate(desc, 256), Text: truncate(text, 256), Timestamp: time.Now().Unix(), }) } // Keyword extraction → send to harvester kws := c.analyzer.Analyze(title, desc, text) if len(kws) > 0 { if len(kws) > config.MaxKeywordsPerPage { kws = kws[:config.MaxKeywordsPerPage] } atomic.AddInt64(&c.stats.KeywordsFetched, int64(len(kws))) go c.sendToHarvester(res.FinalURL, kws) } // Update site info host := netloc(res.FinalURL) c.updateSiteSuccess(host, res, title, desc, text, hrefs) // Handle permanent redirects in site info for from, to := range res.Redirects { fromHost := netloc(from) if fromHost == "" { continue } info, _ := c.db.GetSiteInfo(fromHost) if info.Redirects == nil { info.Redirects = make(map[string]string) } info.Redirects[from] = to if len(info.Redirects) > 50 { // keep most important (just truncate randomly for now) info.Redirects = truncateMap(info.Redirects, 40) } _ = c.db.SetSiteInfo(fromHost, info) } // Trim hrefs if len(hrefs) > 100 { hrefs = sampleStrings(hrefs, 100) } return hrefs } func (c *Crawler) updateSiteFailure(rawURL string) { host := netloc(rawURL) if host == "" { return } info, _ := c.db.GetSiteInfo(host) if info.SuccessRate == nil { zero := 0.0 info.SuccessRate = &zero } *info.SuccessRate *= 0.99 _ = c.db.SetSiteInfo(host, info) } func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc, text string, hrefs []string) { info, _ := c.db.GetSiteInfo(host) info.VisitCount++ info.LastVisitTime = time.Now().Unix() one := 1.0 if info.SuccessRate == nil { info.SuccessRate = &one } *info.SuccessRate = *info.SuccessRate*0.99 + 0.01 if strings.HasPrefix(res.FinalURL, "https://") { t := true info.HTTPSAvailable = &t } if res.ServerType != "" { found := false for _, s := range info.ServerTypes { if s == res.ServerType { found = true break } } if !found { info.ServerTypes = append(info.ServerTypes, res.ServerType) if len(info.ServerTypes) > 5 { info.ServerTypes = info.ServerTypes[len(info.ServerTypes)-5:] } } } // Language detection — sample 10% or first 10 visits if info.VisitCount < 10 || rand.Float64() < 0.1 { lang := c.analyzer.DetectLanguage(title + " " + desc + " " + text) if lang != "" { if info.Languages == nil { info.Languages = make(map[string]float64) } intensity := math.Min(0.2, 1/math.Sqrt(float64(info.VisitCount+1))) for k := range info.Languages { info.Languages[k] *= (1 - intensity) } info.Languages[lang] += intensity } // Collect external links superHost := superNetloc(res.FinalURL) var external []string for _, h := range hrefs { if superNetloc(h) != superHost { external = append(external, h) } } sampled := sampleStrings(external, 10) info.OutLinks = append(info.OutLinks, sampled...) if len(info.OutLinks) > 250 { info.OutLinks = sampleStrings(info.OutLinks, 200) } } _ = c.db.SetSiteInfo(host, info) } // sendToHarvester POSTs keyword data to the harvester service. func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) { type payload struct { URL string `json:"url"` Keywords []analyzer.Keyword `json:"keywords"` } p := payload{URL: finalURL, Keywords: kws} data, err := json.Marshal(p) if err != nil { return } resp, err := http.Post(config.HarvesterAddr+"/l", "application/json", bytes.NewReader(data)) if err != nil { log.Printf("[crawler] harvester post failed: %v", err) return } resp.Body.Close() } // schedule selects and prioritises the next BFS queue from raw discovered links. func (c *Crawler) schedule(links []URLWeight) []string { if len(links) > 100000 { links = sampleURLWeights(links, 100000) } // Pre-fetch site info for all involved domains domains := make(map[string]bool) for _, lw := range links { if h := netloc(lw.URL); h != "" { domains[h] = true } if h := superNetloc(lw.URL); h != "" { domains[h] = true } } siteCache := make(map[string]*storage.SiteInfo, len(domains)) var mu sync.Mutex var wg sync.WaitGroup for d := range domains { wg.Add(1) go func(host string) { defer wg.Done() info, _ := c.db.GetSiteInfo(host) mu.Lock() siteCache[host] = info mu.Unlock() }(d) } wg.Wait() // Score each URL scored_list := make([]scoredURL, len(links)) for i, lw := range links { scored_list[i] = scoredURL{url: lw.URL, score: c.scoreURL(lw, siteCache)} } // Weighted random sample (45000 or 1/3+250 whichever smaller) k := min(45000, len(scored_list)/3+250) selected := weightedSample(scored_list, k) // Domain concentration filtering selected = concentrationFilter(selected, config.CrawlFocus) // Separate https/http, cap http at 1/4 of https count var httpsURLs, httpURLs []string for _, s := range selected { if strings.HasPrefix(s, "https://") { httpsURLs = append(httpsURLs, s) } else { httpURLs = append(httpURLs, s) } } maxHTTP := len(httpsURLs) / 4 if len(httpURLs) > maxHTTP { httpURLs = sampleStrings(httpURLs, maxHTTP) } // Separate prosperous / non-prosperous var prosperURLs, otherURLs []string for _, u := range append(httpsURLs, httpURLs...) { if c.prosperMap[netloc(u)] > 0 { prosperURLs = append(prosperURLs, u) } else { otherURLs = append(otherURLs, u) } } n := int(float64(len(prosperURLs)) * (1-config.ExpectedProsperRatio) / config.ExpectedProsperRatio) if len(otherURLs) > n { keep := max(len(otherURLs)-len(selected)/10, n) if keep < len(otherURLs) { otherURLs = sampleStrings(otherURLs, keep) } } result := append(prosperURLs, otherURLs...) rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] }) return result } // scoreURL computes the scheduling priority for a URL. func (c *Crawler) scoreURL(lw URLWeight, siteCache map[string]*storage.SiteInfo) float64 { host := netloc(lw.URL) super := superNetloc(lw.URL) info := siteCache[host] if info == nil { info = &storage.SiteInfo{} } // Chinese-ness var chineseness float64 = 0.5 if len(info.Languages) > 0 { total := 0.0 for _, v := range info.Languages { total += v } if total > 0 { chineseness = info.Languages["zh"] / total } } // Interest decay based on visit count prosper := math.Min(62, c.prosperMap[host]) limit := prosper*500 + 50 b := math.Pow(0.1, 1/limit) interest := math.Pow(b, float64(info.VisitCount)) var interest2 float64 = 1.0 if super != host { superInfo := siteCache[super] if superInfo != nil { limit2 := math.Min(62, c.prosperMap[super])*500 + 50 b2 := math.Pow(0.1, 1/limit2) interest2 = math.Pow(b2, float64(superInfo.VisitCount)) } } quality := 1.0 if info.Quality != nil { quality = *info.Quality } prosperity := prosper if prosperity > 0 { prosperity += 0.5 } prosperity = math.Log2(2+prosperity) + 1 bad := badURL(lw.URL) return (0.1 + chineseness) * math.Min(0.05+interest, 0.05+interest2) * quality * (1 - bad) * lw.Weight * prosperity } // ---- helper functions ---- func netloc(rawURL string) string { parts := strings.SplitN(rawURL, "/", 4) if len(parts) >= 3 && (parts[0] == "http:" || parts[0] == "https:") && parts[1] == "" { return parts[2] } u, err := url.Parse(rawURL) if err != nil { return "" } return u.Host } // superNetloc returns "domain.tld" (strips subdomains). func superNetloc(rawURL string) string { host := netloc(rawURL) parts := strings.Split(host, ".") if len(parts) >= 2 { return strings.Join(parts[len(parts)-2:], ".") } return host } func badURL(u string) float64 { s := math.Max(0, float64(len(u)-30)/200.0) if strings.Contains(u, ".htm") || strings.Contains(u, ".php") { s += (1 - s) * 0.3 } if strings.Count(strings.TrimRight(u, "/"), "/") > 2 { s += (1 - s) * 0.1 } if len(u) < 5 || u[4] == ':' { s += (1 - s) * 0.3 } return math.Min(s, 0.9) } func truncate(s string, n int) string { if len(s) <= n { return s } return s[:n] } func sampleStrings(s []string, n int) []string { if len(s) <= n { return s } perm := rand.Perm(len(s)) out := make([]string, n) for i := range out { out[i] = s[perm[i]] } return out } func sampleURLWeights(s []URLWeight, n int) []URLWeight { if len(s) <= n { return s } perm := rand.Perm(len(s)) out := make([]URLWeight, n) for i := range out { out[i] = s[perm[i]] } return out } type scoredURL struct { url string score float64 } func weightedSample(items []scoredURL, k int) []string { if k >= len(items) { out := make([]string, len(items)) for i, s := range items { out[i] = s.url } return out } // Simple weighted sampling without replacement using alias method approximation totalWeight := 0.0 for _, s := range items { totalWeight += s.score } selected := make(map[int]bool) out := make([]string, 0, k) for len(out) < k && len(selected) < len(items) { r := rand.Float64() * totalWeight cum := 0.0 for i, s := range items { if selected[i] { continue } cum += s.score if cum >= r { selected[i] = true out = append(out, s.url) totalWeight -= s.score break } } } return out } func concentrationFilter(urls []string, k float64) []string { domainGroups := make(map[string][]string) shuffled := make([]string, len(urls)) copy(shuffled, urls) rand.Shuffle(len(shuffled), func(i, j int) { shuffled[i], shuffled[j] = shuffled[j], shuffled[i] }) for _, u := range shuffled { d := superNetloc(u) domainGroups[d] = append(domainGroups[d], u) } limit := 10 if len(domainGroups) > 1 { sizes := make([]int, 0, len(domainGroups)) for _, g := range domainGroups { sizes = append(sizes, int(math.Pow(float64(len(g)), k))) } // sort sizes ascending, drop last (largest) for i := 0; i < len(sizes)-1; i++ { for j := i + 1; j < len(sizes)-1; j++ { if sizes[j] < sizes[i] { sizes[i], sizes[j] = sizes[j], sizes[i] } } } total := 0 for _, s := range sizes[:len(sizes)-1] { total += s } limit = max(10, int(float64(total)*0.6)) } var result []string for _, g := range domainGroups { sn := 1 + min(limit, int(math.Pow(float64(len(g)), k))) if sn > len(g) { sn = len(g) } result = append(result, g[:sn]...) } rand.Shuffle(len(result), func(i, j int) { result[i], result[j] = result[j], result[i] }) return result } func truncateMap(m map[string]string, n int) map[string]string { if len(m) <= n { return m } out := make(map[string]string, n) i := 0 for k, v := range m { if i >= n { break } out[k] = v i++ } return out } func min(a, b int) int { if a < b { return a } return b } func max(a, b int) int { if a > b { return a } return b } // Expose Stats for monitoring. func (c *Crawler) GetStats() Stats { return Stats{ VisitedURLs: atomic.LoadInt64(&c.stats.VisitedURLs), SuccessURLs: atomic.LoadInt64(&c.stats.SuccessURLs), KeywordsFetched: atomic.LoadInt64(&c.stats.KeywordsFetched), } }