up
This commit is contained in:
+1
-2
@@ -399,8 +399,7 @@ func (c *Crawler) sendToHarvester(finalURL string, kws []analyzer.Keyword) {
|
|||||||
if failures >= circuitFailureThreshold {
|
if failures >= circuitFailureThreshold {
|
||||||
atomic.StoreInt32(&c.circuitState, circuitOpen)
|
atomic.StoreInt32(&c.circuitState, circuitOpen)
|
||||||
atomic.StoreInt64(&c.circuitExpiry, now+int64(circuitCooldownSeconds))
|
atomic.StoreInt64(&c.circuitExpiry, now+int64(circuitCooldownSeconds))
|
||||||
log.Printf("[crawler] circuit OPEN: harvest endpoint unreachable (%d failures), cooling for %ds",
|
//log.Printf("[crawler] circuit OPEN: harvest endpoint unreachable (%d failures), cooling for %ds",failures, circuitCooldownSeconds)
|
||||||
failures, circuitCooldownSeconds)
|
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|||||||
Vendored
+2
File diff suppressed because one or more lines are too long
Vendored
+6
File diff suppressed because one or more lines are too long
Vendored
+6
File diff suppressed because one or more lines are too long
Vendored
+6
File diff suppressed because one or more lines are too long
Vendored
+2
File diff suppressed because one or more lines are too long
Vendored
+6
File diff suppressed because one or more lines are too long
Vendored
+2
-2
@@ -5,8 +5,8 @@
|
|||||||
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
<link rel="icon" type="image/svg+xml" href="/vite.svg" />
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||||||
<title>SESE 爬取管理</title>
|
<title>SESE 爬取管理</title>
|
||||||
<script type="module" crossorigin src="/assets/index-KhES34ts.js"></script>
|
<script type="module" crossorigin src="/assets/index-BomiJv32.js"></script>
|
||||||
<link rel="stylesheet" crossorigin href="/assets/index-CO25_jVn.css">
|
<link rel="stylesheet" crossorigin href="/assets/index-Bj4UMEhQ.css">
|
||||||
</head>
|
</head>
|
||||||
<body>
|
<body>
|
||||||
<div id="app"></div>
|
<div id="app"></div>
|
||||||
|
|||||||
+17
-2
@@ -292,6 +292,7 @@ func (s *Server) handleAdminStats(w http.ResponseWriter, r *http.Request) {
|
|||||||
resp := map[string]any{
|
resp := map[string]any{
|
||||||
"total_urls": total,
|
"total_urls": total,
|
||||||
"total_words": totalWords,
|
"total_words": totalWords,
|
||||||
|
"total_domains": len(domainCount), // 真实的域名总数(非Top 20)
|
||||||
"domains": domainsMap,
|
"domains": domainsMap,
|
||||||
"languages": langsMap,
|
"languages": langsMap,
|
||||||
"pending": atomic.LoadInt64(&s.rowCount), // 内存中未刷盘的索引条目数
|
"pending": atomic.LoadInt64(&s.rowCount), // 内存中未刷盘的索引条目数
|
||||||
@@ -1232,11 +1233,12 @@ func (s *Server) flush() {
|
|||||||
s.mem = make(map[string][]storage.IndexEntry)
|
s.mem = make(map[string][]storage.IndexEntry)
|
||||||
atomic.StoreInt64(&s.rowCount, 0)
|
atomic.StoreInt64(&s.rowCount, 0)
|
||||||
s.memMu.Unlock()
|
s.memMu.Unlock()
|
||||||
log.Printf("[harvester] flushing %d keys", len(snapshot))
|
totalKeys := len(snapshot)
|
||||||
|
log.Printf("[harvester] flushing %d keys", totalKeys)
|
||||||
items := make([]struct {
|
items := make([]struct {
|
||||||
key string
|
key string
|
||||||
entries []storage.IndexEntry
|
entries []storage.IndexEntry
|
||||||
}, 0, len(snapshot))
|
}, 0, totalKeys)
|
||||||
for k, v := range snapshot {
|
for k, v := range snapshot {
|
||||||
items = append(items, struct {
|
items = append(items, struct {
|
||||||
key string
|
key string
|
||||||
@@ -1250,6 +1252,14 @@ func (s *Server) flush() {
|
|||||||
}
|
}
|
||||||
results := make(chan result, len(items))
|
results := make(chan result, len(items))
|
||||||
sem := make(chan struct{}, 8)
|
sem := make(chan struct{}, 8)
|
||||||
|
processed := int64(0)
|
||||||
|
progressInterval := 1000
|
||||||
|
if totalKeys < 10000 {
|
||||||
|
progressInterval = totalKeys / 10
|
||||||
|
}
|
||||||
|
if progressInterval < 1 {
|
||||||
|
progressInterval = 1
|
||||||
|
}
|
||||||
for _, item := range items {
|
for _, item := range items {
|
||||||
sem <- struct{}{}
|
sem <- struct{}{}
|
||||||
go func(k string, newEntries []storage.IndexEntry) {
|
go func(k string, newEntries []storage.IndexEntry) {
|
||||||
@@ -1262,6 +1272,11 @@ func (s *Server) flush() {
|
|||||||
for range items {
|
for range items {
|
||||||
r := <-results
|
r := <-results
|
||||||
batch[r.key] = r.entries
|
batch[r.key] = r.entries
|
||||||
|
current := atomic.AddInt64(&processed, 1)
|
||||||
|
if int(current)%progressInterval == 0 || int(current) == totalKeys {
|
||||||
|
percent := float64(current) * 100 / float64(totalKeys)
|
||||||
|
log.Printf("[harvester] flush progress: %d/%d (%.1f%%)", current, totalKeys, percent)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if err := s.db.BatchSetIndex(batch); err != nil {
|
if err := s.db.BatchSetIndex(batch); err != nil {
|
||||||
log.Printf("[harvester] flush write error: %v", err)
|
log.Printf("[harvester] flush write error: %v", err)
|
||||||
|
|||||||
Reference in New Issue
Block a user