fix 分词bug，添加重爬机制

2026-04-10 00:18:07 +08:00
parent 7ab7db9b76
commit 530e2ebd9d
9 changed files with 208 additions and 34 deletions
@@ -30,12 +30,13 @@ type IndexEntry struct {
 }

 // SnippetEntry 是 URL 对应的摘要信息缓存。
-// 包含页面标题、描述、正文片段和抓取时间戳。
+// 包含页面标题、描述、正文片段、抓取时间戳和内容哈希（用于增量重爬检测）。
 type SnippetEntry struct {
 	Title       string `json:"title"`        // Page title.
 	Description string `json:"desc"`         // Meta description, or an auto-generated description.
 	Text        string `json:"text"`         // Text snippet: the first N characters of the page body.
 	Timestamp   int64  `json:"ts"`           // Unix timestamp of when the page was crawled.
+	ContentHash string `json:"hash"`         // FNV-1a hash of the body content (lets incremental re-crawl detect whether the content changed).
 }

 // 四个 bbolt bucket 的名称（以字节数组存储，bbolt 要求 key/value 均为字节）