This commit is contained in:
2026-04-09 11:58:53 +08:00
parent ce2a91d9f5
commit 18b1c4df5e
2 changed files with 159 additions and 77 deletions
+83 -77
View File
@@ -238,16 +238,15 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
if fromHost == "" {
continue
}
info, _ := c.db.GetSiteInfo(fromHost)
if info.Redirects == nil {
info.Redirects = make(map[string]string)
}
info.Redirects[from] = to
// 重定向映射过多时裁剪到 40 条
if len(info.Redirects) > 50 {
info.Redirects = truncateMap(info.Redirects, 40)
}
_ = c.db.SetSiteInfo(fromHost, info)
_ = c.db.UpdateSiteInfo(fromHost, func(info *storage.SiteInfo) {
if info.Redirects == nil {
info.Redirects = make(map[string]string)
}
info.Redirects[from] = to
if len(info.Redirects) > 50 {
info.Redirects = truncateMap(info.Redirects, 40)
}
})
}
// 限制返回的链接数,防止下一轮队列爆炸
@@ -263,87 +262,94 @@ func (c *Crawler) updateSiteFailure(rawURL string) {
if host == "" {
return
}
info, _ := c.db.GetSiteInfo(host)
if info.SuccessRate == nil {
zero := 0.0
info.SuccessRate = &zero
}
// 成功率每次失败乘以 0.99(无限趋近 0)
*info.SuccessRate *= 0.99
_ = c.db.SetSiteInfo(host, info)
_ = c.db.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
if info.SuccessRate == nil {
zero := 0.0
info.SuccessRate = &zero
}
// 成功率每次失败乘以 0.99(无限趋近 0)
*info.SuccessRate *= 0.99
})
}
// updateSiteSuccess 当某 URL 抓取成功时,更新网站的完整元信息。
// 使用 UpdateSiteInfo 原子读-改-写,避免并发 goroutine 对同一 host 的 SiteInfo 更新丢失。
func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc, text string, hrefs []string) {
info, _ := c.db.GetSiteInfo(host)
now := time.Now().Unix()
httpsAvailable := strings.HasPrefix(res.FinalURL, "https://")
serverType := res.ServerType
// 访问计数 +1,更新最后访问时间
info.VisitCount++
info.LastVisitTime = time.Now().Unix()
// 语言检测(CPU 密集,在锁外执行)
var detectedLang string
// 检测条件在 UpdateSiteInfo 回调内判断,这里预先计算好
detectedLang = c.analyzer.DetectLanguage(title + " " + desc + " " + text)
// 成功率更新:EWM(指数加权移动)平滑,每次 +0.01
one := 1.0
if info.SuccessRate == nil {
info.SuccessRate = &one
}
*info.SuccessRate = *info.SuccessRate*0.99 + 0.01
// 记录是否支持 HTTPS
if strings.HasPrefix(res.FinalURL, "https://") {
t := true
info.HTTPSAvailable = &t
}
// 记录 HTTP Server 类型(去重,保留最近 5 个)
if res.ServerType != "" {
found := false
for _, s := range info.ServerTypes {
if s == res.ServerType {
found = true
break
}
}
if !found {
info.ServerTypes = append(info.ServerTypes, res.ServerType)
if len(info.ServerTypes) > 5 {
info.ServerTypes = info.ServerTypes[len(info.ServerTypes)-5:]
}
// 收集外链(跨顶级域名的链接)
superHost := superNetloc(res.FinalURL)
var external []string
for _, h := range hrefs {
if superNetloc(h) != superHost {
external = append(external, h)
}
}
sampled := sampleStrings(external, 10)
// 语言检测和出站链接收集(仅在前 10 次访问或 10% 概率下触发,减少开销)
if info.VisitCount < 10 || rand.Float64() < 0.1 {
lang := c.analyzer.DetectLanguage(title + " " + desc + " " + text)
if lang != "" {
if info.Languages == nil {
info.Languages = make(map[string]float64)
}
// 首次访问强度高,随访问次数增加强度衰减
intensity := math.Min(0.2, 1/math.Sqrt(float64(info.VisitCount+1)))
for k := range info.Languages {
info.Languages[k] *= (1 - intensity) // 旧语种按 intensity 衰减
}
info.Languages[lang] += intensity // 新语种增加
_ = c.db.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
// 访问计数 +1,更新最后访问时间
info.VisitCount++
info.LastVisitTime = now
// 成功率更新:EWM(指数加权移动)平滑,每次 +0.01
one := 1.0
if info.SuccessRate == nil {
info.SuccessRate = &one
}
*info.SuccessRate = *info.SuccessRate*0.99 + 0.01
// 记录是否支持 HTTPS
if httpsAvailable {
t := true
info.HTTPSAvailable = &t
}
// 收集外链(跨顶级域名的链接)
superHost := superNetloc(res.FinalURL)
var external []string
for _, h := range hrefs {
if superNetloc(h) != superHost {
external = append(external, h)
// 记录 HTTP Server 类型(去重,保留最近 5 个)
if serverType != "" {
found := false
for _, s := range info.ServerTypes {
if s == serverType {
found = true
break
}
}
if !found {
info.ServerTypes = append(info.ServerTypes, serverType)
if len(info.ServerTypes) > 5 {
info.ServerTypes = info.ServerTypes[len(info.ServerTypes)-5:]
}
}
}
// 最多保留 10 条外链
sampled := sampleStrings(external, 10)
info.OutLinks = append(info.OutLinks, sampled...)
// 外链超过 250 条时采样到 200 条
if len(info.OutLinks) > 250 {
info.OutLinks = sampleStrings(info.OutLinks, 200)
}
}
_ = c.db.SetSiteInfo(host, info)
// 语言权重与出站链接的写入(仅在前 10 次访问或 10% 概率下触发;语言检测本身已在回调外预先完成)
if info.VisitCount < 10 || rand.Float64() < 0.1 {
if detectedLang != "" {
if info.Languages == nil {
info.Languages = make(map[string]float64)
}
// 首次访问强度高,随访问次数增加强度衰减
intensity := math.Min(0.2, 1/math.Sqrt(float64(info.VisitCount+1)))
for k := range info.Languages {
info.Languages[k] *= (1 - intensity) // 旧语种按 intensity 衰减
}
info.Languages[detectedLang] += intensity // 新语种增加
}
// 外链
info.OutLinks = append(info.OutLinks, sampled...)
if len(info.OutLinks) > 250 {
info.OutLinks = sampleStrings(info.OutLinks, 200)
}
}
})
}
// sendToHarvester 将关键词索引数据通过 HTTP POST 发送到搜索服务器(/l 端点)。