up
This commit is contained in:
+83
-77
@@ -238,16 +238,15 @@ func (c *Crawler) visitURL(rawURL string) (hrefs []string) {
|
||||
if fromHost == "" {
|
||||
continue
|
||||
}
|
||||
info, _ := c.db.GetSiteInfo(fromHost)
|
||||
if info.Redirects == nil {
|
||||
info.Redirects = make(map[string]string)
|
||||
}
|
||||
info.Redirects[from] = to
|
||||
// Once the redirect map grows past 50 entries, trim it back to 40.
|
||||
if len(info.Redirects) > 50 {
|
||||
info.Redirects = truncateMap(info.Redirects, 40)
|
||||
}
|
||||
_ = c.db.SetSiteInfo(fromHost, info)
|
||||
_ = c.db.UpdateSiteInfo(fromHost, func(info *storage.SiteInfo) {
|
||||
if info.Redirects == nil {
|
||||
info.Redirects = make(map[string]string)
|
||||
}
|
||||
info.Redirects[from] = to
|
||||
if len(info.Redirects) > 50 {
|
||||
info.Redirects = truncateMap(info.Redirects, 40)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// 限制返回的链接数,防止下一轮队列爆炸
|
||||
@@ -263,87 +262,94 @@ func (c *Crawler) updateSiteFailure(rawURL string) {
|
||||
if host == "" {
|
||||
return
|
||||
}
|
||||
info, _ := c.db.GetSiteInfo(host)
|
||||
if info.SuccessRate == nil {
|
||||
zero := 0.0
|
||||
info.SuccessRate = &zero
|
||||
}
|
||||
// 成功率每次失败乘以 0.99(无限趋近 0)
|
||||
*info.SuccessRate *= 0.99
|
||||
_ = c.db.SetSiteInfo(host, info)
|
||||
_ = c.db.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
|
||||
if info.SuccessRate == nil {
|
||||
zero := 0.0
|
||||
info.SuccessRate = &zero
|
||||
}
|
||||
// 成功率每次失败乘以 0.99(无限趋近 0)
|
||||
*info.SuccessRate *= 0.99
|
||||
})
|
||||
}
|
||||
|
||||
// updateSiteSuccess 当某 URL 抓取成功时,更新网站的完整元信息。
|
||||
// 使用 UpdateSiteInfo 原子读-改-写,避免并发 goroutine 对同一 host 的 SiteInfo 更新丢失。
|
||||
func (c *Crawler) updateSiteSuccess(host string, res *FetchResult, title, desc, text string, hrefs []string) {
|
||||
info, _ := c.db.GetSiteInfo(host)
|
||||
now := time.Now().Unix()
|
||||
httpsAvailable := strings.HasPrefix(res.FinalURL, "https://")
|
||||
serverType := res.ServerType
|
||||
|
||||
// 访问计数 +1,更新最后访问时间
|
||||
info.VisitCount++
|
||||
info.LastVisitTime = time.Now().Unix()
|
||||
// 语言检测(CPU 密集,在锁外执行)
|
||||
var detectedLang string
|
||||
// 检测条件在 UpdateSiteInfo 回调内判断,这里预先计算好
|
||||
detectedLang = c.analyzer.DetectLanguage(title + " " + desc + " " + text)
|
||||
|
||||
// Success-rate update: exponentially weighted moving average, rate = rate*0.99 + 0.01 (moves toward 1 on every success).
|
||||
one := 1.0
|
||||
if info.SuccessRate == nil {
|
||||
info.SuccessRate = &one
|
||||
}
|
||||
*info.SuccessRate = *info.SuccessRate*0.99 + 0.01
|
||||
|
||||
// 记录是否支持 HTTPS
|
||||
if strings.HasPrefix(res.FinalURL, "https://") {
|
||||
t := true
|
||||
info.HTTPSAvailable = &t
|
||||
}
|
||||
|
||||
// 记录 HTTP Server 类型(去重,保留最近 5 个)
|
||||
if res.ServerType != "" {
|
||||
found := false
|
||||
for _, s := range info.ServerTypes {
|
||||
if s == res.ServerType {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
info.ServerTypes = append(info.ServerTypes, res.ServerType)
|
||||
if len(info.ServerTypes) > 5 {
|
||||
info.ServerTypes = info.ServerTypes[len(info.ServerTypes)-5:]
|
||||
}
|
||||
// 收集外链(跨顶级域名的链接)
|
||||
superHost := superNetloc(res.FinalURL)
|
||||
var external []string
|
||||
for _, h := range hrefs {
|
||||
if superNetloc(h) != superHost {
|
||||
external = append(external, h)
|
||||
}
|
||||
}
|
||||
sampled := sampleStrings(external, 10)
|
||||
|
||||
// 语言检测和出站链接收集(仅在前 10 次访问或 10% 概率下触发,减少开销)
|
||||
if info.VisitCount < 10 || rand.Float64() < 0.1 {
|
||||
lang := c.analyzer.DetectLanguage(title + " " + desc + " " + text)
|
||||
if lang != "" {
|
||||
if info.Languages == nil {
|
||||
info.Languages = make(map[string]float64)
|
||||
}
|
||||
// 首次访问强度高,随访问次数增加强度衰减
|
||||
intensity := math.Min(0.2, 1/math.Sqrt(float64(info.VisitCount+1)))
|
||||
for k := range info.Languages {
|
||||
info.Languages[k] *= (1 - intensity) // 旧语种按 intensity 衰减
|
||||
}
|
||||
info.Languages[lang] += intensity // 新语种增加
|
||||
_ = c.db.UpdateSiteInfo(host, func(info *storage.SiteInfo) {
|
||||
// 访问计数 +1,更新最后访问时间
|
||||
info.VisitCount++
|
||||
info.LastVisitTime = now
|
||||
|
||||
// Success-rate update: exponentially weighted moving average, rate = rate*0.99 + 0.01 (moves toward 1 on every success).
|
||||
one := 1.0
|
||||
if info.SuccessRate == nil {
|
||||
info.SuccessRate = &one
|
||||
}
|
||||
*info.SuccessRate = *info.SuccessRate*0.99 + 0.01
|
||||
|
||||
// 记录是否支持 HTTPS
|
||||
if httpsAvailable {
|
||||
t := true
|
||||
info.HTTPSAvailable = &t
|
||||
}
|
||||
|
||||
// 收集外链(跨顶级域名的链接)
|
||||
superHost := superNetloc(res.FinalURL)
|
||||
var external []string
|
||||
for _, h := range hrefs {
|
||||
if superNetloc(h) != superHost {
|
||||
external = append(external, h)
|
||||
// 记录 HTTP Server 类型(去重,保留最近 5 个)
|
||||
if serverType != "" {
|
||||
found := false
|
||||
for _, s := range info.ServerTypes {
|
||||
if s == serverType {
|
||||
found = true
|
||||
break
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
info.ServerTypes = append(info.ServerTypes, serverType)
|
||||
if len(info.ServerTypes) > 5 {
|
||||
info.ServerTypes = info.ServerTypes[len(info.ServerTypes)-5:]
|
||||
}
|
||||
}
|
||||
}
|
||||
// 最多保留 10 条外链
|
||||
sampled := sampleStrings(external, 10)
|
||||
info.OutLinks = append(info.OutLinks, sampled...)
|
||||
// 外链超过 250 条时采样到 200 条
|
||||
if len(info.OutLinks) > 250 {
|
||||
info.OutLinks = sampleStrings(info.OutLinks, 200)
|
||||
}
|
||||
}
|
||||
|
||||
_ = c.db.SetSiteInfo(host, info)
|
||||
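// Record the pre-computed language and the sampled out-links, only when VisitCount < 10 or with 10% probability. Detection itself already ran before this callback.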
|
||||
if info.VisitCount < 10 || rand.Float64() < 0.1 {
|
||||
if detectedLang != "" {
|
||||
if info.Languages == nil {
|
||||
info.Languages = make(map[string]float64)
|
||||
}
|
||||
// 首次访问强度高,随访问次数增加强度衰减
|
||||
intensity := math.Min(0.2, 1/math.Sqrt(float64(info.VisitCount+1)))
|
||||
for k := range info.Languages {
|
||||
info.Languages[k] *= (1 - intensity) // 旧语种按 intensity 衰减
|
||||
}
|
||||
info.Languages[detectedLang] += intensity // 新语种增加
|
||||
}
|
||||
|
||||
// 外链
|
||||
info.OutLinks = append(info.OutLinks, sampled...)
|
||||
if len(info.OutLinks) > 250 {
|
||||
info.OutLinks = sampleStrings(info.OutLinks, 200)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
// sendToHarvester 将关键词索引数据通过 HTTP POST 发送到搜索服务器(/l 端点)。
|
||||
|
||||
Reference in New Issue
Block a user