This commit is contained in:
2026-04-09 00:14:55 +08:00
parent 223177b1dd
commit 439d0c1cb6
9 changed files with 88 additions and 46 deletions
+2 -3
View File
@@ -399,8 +399,7 @@ func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
if failures >= circuitFailureThreshold { if failures >= circuitFailureThreshold {
atomic.StoreInt32(&c.circuitState, circuitOpen) atomic.StoreInt32(&c.circuitState, circuitOpen)
atomic.StoreInt64(&c.circuitExpiry, now+int64(circuitCooldownSeconds)) atomic.StoreInt64(&c.circuitExpiry, now+int64(circuitCooldownSeconds))
log.Printf("[crawler] circuit OPEN: harvest endpoint unreachable (%d failures), cooling for %ds", //log.Printf("[crawler] circuit OPEN: harvest endpoint unreachable (%d failures), cooling for %ds",failures, circuitCooldownSeconds)
failures, circuitCooldownSeconds)
} }
return return
} }
@@ -482,7 +481,7 @@ func (c *Crawler) schedule(links []URLWeight) []string {
} }
// 根据目标繁荣占比计算普通 URL 应保留数量 // 根据目标繁荣占比计算普通 URL 应保留数量
expectedProsperRatio := config.ExpectedProsperRatio() expectedProsperRatio := config.ExpectedProsperRatio()
n := int(float64(len(prosperURLs)) * (1-expectedProsperRatio) / expectedProsperRatio) n := int(float64(len(prosperURLs)) * (1 - expectedProsperRatio) / expectedProsperRatio)
if len(otherURLs) > n { if len(otherURLs) > n {
keep := max(len(otherURLs)-len(selected)/10, n) keep := max(len(otherURLs)-len(selected)/10, n)
if keep < len(otherURLs) { if keep < len(otherURLs) {
File diff suppressed because one or more lines are too long
+6
View File
File diff suppressed because one or more lines are too long
+6
View File
File diff suppressed because one or more lines are too long
+6
View File
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
+6
View File
File diff suppressed because one or more lines are too long
+2 -2
View File
@@ -5,8 +5,8 @@
<link rel="icon" type="image/svg+xml" href="/vite.svg" /> <link rel="icon" type="image/svg+xml" href="/vite.svg" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /> <meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>SESE 爬取管理</title> <title>SESE 爬取管理</title>
<script type="module" crossorigin src="/assets/index-KhES34ts.js"></script> <script type="module" crossorigin src="/assets/index-BomiJv32.js"></script>
<link rel="stylesheet" crossorigin href="/assets/index-CO25_jVn.css"> <link rel="stylesheet" crossorigin href="/assets/index-Bj4UMEhQ.css">
</head> </head>
<body> <body>
<div id="app"></div> <div id="app"></div>
+17 -2
View File
@@ -292,6 +292,7 @@ func (s *Server) handleAdminStats(w http.ResponseWriter, r *http.Request) {
resp := map[string]any{ resp := map[string]any{
"total_urls": total, "total_urls": total,
"total_words": totalWords, "total_words": totalWords,
"total_domains": len(domainCount), // 真实的域名总数(非Top 20)
"domains": domainsMap, "domains": domainsMap,
"languages": langsMap, "languages": langsMap,
"pending": atomic.LoadInt64(&s.rowCount), // 内存中未刷盘的索引条目数 "pending": atomic.LoadInt64(&s.rowCount), // 内存中未刷盘的索引条目数
@@ -1232,11 +1233,12 @@ func (s *Server) flush() {
s.mem = make(map[string][]storage.IndexEntry) s.mem = make(map[string][]storage.IndexEntry)
atomic.StoreInt64(&s.rowCount, 0) atomic.StoreInt64(&s.rowCount, 0)
s.memMu.Unlock() s.memMu.Unlock()
log.Printf("[harvester] flushing %d keys", len(snapshot)) totalKeys := len(snapshot)
log.Printf("[harvester] flushing %d keys", totalKeys)
items := make([]struct { items := make([]struct {
key string key string
entries []storage.IndexEntry entries []storage.IndexEntry
}, 0, len(snapshot)) }, 0, totalKeys)
for k, v := range snapshot { for k, v := range snapshot {
items = append(items, struct { items = append(items, struct {
key string key string
@@ -1250,6 +1252,14 @@ func (s *Server) flush() {
} }
results := make(chan result, len(items)) results := make(chan result, len(items))
sem := make(chan struct{}, 8) sem := make(chan struct{}, 8)
processed := int64(0)
progressInterval := 1000
if totalKeys < 10000 {
progressInterval = totalKeys / 10
}
if progressInterval < 1 {
progressInterval = 1
}
for _, item := range items { for _, item := range items {
sem <- struct{}{} sem <- struct{}{}
go func(k string, newEntries []storage.IndexEntry) { go func(k string, newEntries []storage.IndexEntry) {
@@ -1262,6 +1272,11 @@ func (s *Server) flush() {
for range items { for range items {
r := <-results r := <-results
batch[r.key] = r.entries batch[r.key] = r.entries
current := atomic.AddInt64(&processed, 1)
if int(current)%progressInterval == 0 || int(current) == totalKeys {
percent := float64(current) * 100 / float64(totalKeys)
log.Printf("[harvester] flush progress: %d/%d (%.1f%%)", current, totalKeys, percent)
}
} }
if err := s.db.BatchSetIndex(batch); err != nil { if err := s.db.BatchSetIndex(batch); err != nil {
log.Printf("[harvester] flush write error: %v", err) log.Printf("[harvester] flush write error: %v", err)