修复一个卡死问题

This commit is contained in:
2026-04-08 18:44:51 +08:00
parent c154abf410
commit 1d3570a505
3 changed files with 231 additions and 7 deletions
+146
View File
@@ -11,6 +11,7 @@ import (
"net/url" // URL 解析
"regexp" // 正则表达式(site: 过滤语法)
"sort" // 排序
"strconv" // 字符串转整数
"strings" // 字符串操作
"sync" // 互斥锁(保护并发切片写入)
"time" // 时间戳
@@ -46,6 +47,8 @@ func New(db *storage.DB, infoSvc *info.Service, a *analyzer.Analyzer) *Server {
func (s *Server) Handler() http.Handler {
	// Route table: URL path -> handler method, registered in listed order.
	routes := []struct {
		pattern string
		serve   http.HandlerFunc
	}{
		{"/search", s.handleSearch},
		{"/admin/recent", s.handleAdminRecent},
		{"/admin/stats", s.handleAdminStats},
	}

	// A fresh mux is built on every call; nothing is shared between calls.
	mux := http.NewServeMux()
	for _, rt := range routes {
		mux.HandleFunc(rt.pattern, rt.serve)
	}
	return mux
}
@@ -55,6 +58,149 @@ func (s *Server) ListenAndServe(addr string) error {
return http.ListenAndServe(addr, s.Handler())
}
// ---- Admin 接口 ----
// recentItem is a single record returned by the /admin/recent endpoint.
// Field order and JSON tags define the wire format of each array element.
type recentItem struct {
	// URL is the address of the crawled page.
	URL string `json:"url"`
	// Title is the page title taken from the stored snippet.
	Title string `json:"title"`
	// Description is the snippet description, possibly shortened.
	Description string `json:"description"`
	// Domain is the network location derived from URL.
	Domain string `json:"domain"`
	// Language maps a language identifier to its ratio for the page's site.
	Language map[string]float64 `json:"language"`
	// WordCount is the length of the snippet's text.
	WordCount int `json:"word_count"`
	// CrawledAt is the snippet's crawl timestamp.
	CrawledAt int64 `json:"crawled_at"`
}
// handleAdminRecent returns the most recently crawled entries as JSON,
// ordered by crawl timestamp, newest first.
//
// Query parameters:
//   - limit: maximum number of items to return (default 50, capped at 200).
//     A missing, non-numeric or non-positive value falls back to the default.
//
// The response object has two keys: "items", the list of records, and
// "total", the number of records in that list (counted after the limit has
// been applied).
func (s *Server) handleAdminRecent(w http.ResponseWriter, r *http.Request) {
	const (
		defaultLimit = 50
		maxLimit     = 200
		maxDescBytes = 200
	)

	w.Header().Set("Access-Control-Allow-Origin", "*")
	w.Header().Set("Content-Type", "application/json; charset=utf-8")

	limit := defaultLimit
	if l := r.URL.Query().Get("limit"); l != "" {
		if v, err := strconv.Atoi(l); err == nil && v > 0 {
			limit = v
		}
	}
	if limit > maxLimit {
		limit = maxLimit
	}

	type entry struct {
		url     string
		snippet *storage.SnippetEntry
	}
	var items []entry
	s.db.ForEachSnippet(func(url string, snippet *storage.SnippetEntry) error {
		items = append(items, entry{url: url, snippet: snippet})
		return nil
	})

	// Newest first.
	sort.Slice(items, func(i, j int) bool {
		return items[i].snippet.Timestamp > items[j].snippet.Timestamp
	})
	if len(items) > limit {
		items = items[:limit]
	}

	result := make([]recentItem, 0, len(items))
	for _, e := range items {
		domain := netloc(e.url)

		// Site info is looked up only for the entries that survived the
		// limit, not for every snippet in the store. The lookup is best
		// effort: when it yields nothing, the language map is left empty
		// rather than dereferencing a nil *storage.SiteInfo.
		lang := map[string]float64{}
		if siteInfo, _ := s.db.GetSiteInfo(domain); siteInfo != nil && siteInfo.Languages != nil {
			lang = siteInfo.Languages
		}

		desc := e.snippet.Description
		if len(desc) > maxDescBytes {
			// Cut on a rune boundary: step back over UTF-8 continuation
			// bytes (10xxxxxx) so a multi-byte character is never split.
			// A rune has at most three continuation bytes, which bounds
			// the back-off even for malformed input.
			cut := maxDescBytes
			for cut > maxDescBytes-3 && desc[cut]&0xC0 == 0x80 {
				cut--
			}
			desc = desc[:cut]
		}

		result = append(result, recentItem{
			URL:         e.url,
			Title:       e.snippet.Title,
			Description: desc,
			Domain:      domain,
			Language:    lang,
			WordCount:   len(e.snippet.Text),
			CrawledAt:   e.snippet.Timestamp,
		})
	}

	resp := map[string]any{
		"items": result,
		"total": len(items),
	}
	json.NewEncoder(w).Encode(resp)
}
// handleAdminStats returns global crawl statistics as JSON.
//
// Response keys:
//   - "total_urls":  number of snippets visited.
//   - "total_words": sum of len(snippet.Text) over all snippets.
//   - "domains":     the 20 domains with the most snippets, each mapped to
//     its snippet count.
//   - "languages":   the 10 most frequent languages. A snippet counts toward
//     a language when the site info of its domain lists that language with
//     a ratio above 0.1.
//
// When counts tie, keys are ranked alphabetically so that repeated requests
// over the same data select the same top entries.
func (s *Server) handleAdminStats(w http.ResponseWriter, r *http.Request) {
	const (
		maxDomains   = 20
		maxLanguages = 10
		minLangRatio = 0.1
	)

	w.Header().Set("Access-Control-Allow-Origin", "*")
	w.Header().Set("Content-Type", "application/json; charset=utf-8")

	domainCount := make(map[string]int)
	totalWords := 0
	total := 0
	s.db.ForEachSnippet(func(url string, snippet *storage.SnippetEntry) error {
		total++
		domainCount[netloc(url)]++
		totalWords += len(snippet.Text)
		return nil
	})

	// Site info is keyed by domain, so it is fetched once per domain and
	// weighted by that domain's snippet count. This yields the same totals
	// as one lookup per snippet. The lookup is best effort: domains without
	// site info contribute no languages.
	langCount := make(map[string]int)
	for domain, n := range domainCount {
		siteInfo, _ := s.db.GetSiteInfo(domain)
		if siteInfo == nil {
			continue
		}
		for lang, ratio := range siteInfo.Languages {
			if ratio > minLangRatio {
				langCount[lang] += n
			}
		}
	}

	// topN returns the n highest-count entries of counts as a new map.
	topN := func(counts map[string]int, n int) map[string]int {
		type pair struct {
			key   string
			count int
		}
		ranked := make([]pair, 0, len(counts))
		for k, c := range counts {
			ranked = append(ranked, pair{key: k, count: c})
		}
		sort.Slice(ranked, func(i, j int) bool {
			if ranked[i].count != ranked[j].count {
				return ranked[i].count > ranked[j].count
			}
			// Map iteration order is random; break ties by key so the
			// cut-off below is deterministic.
			return ranked[i].key < ranked[j].key
		})
		if len(ranked) > n {
			ranked = ranked[:n]
		}
		top := make(map[string]int, len(ranked))
		for _, p := range ranked {
			top[p.key] = p.count
		}
		return top
	}

	resp := map[string]any{
		"total_urls":  total,
		"total_words": totalWords,
		"domains":     topN(domainCount, maxDomains),
		"languages":   topN(langCount, maxLanguages),
	}
	json.NewEncoder(w).Encode(resp)
}
// ---- 搜索处理器 ----
// searchResponse 是搜索 API 的 JSON 响应结构。