Signed-off-by: 吴文峰 <kevin@lmve.net>
This commit is contained in:
@@ -0,0 +1,300 @@
|
||||
// Package storage provides the persistent index and site-info storage backed by bbolt.
|
||||
//
|
||||
// Index space → a single bbolt bucket "index" where key = keyword (string),
|
||||
// value = brotli-compressed JSON array of {"w": weight, "u": url} objects.
|
||||
//
|
||||
// Gate (门) → a bbolt bucket "gate" where key = URL (string),
|
||||
// value = brotli-compressed JSON object with keys "title", "desc", "text", "ts".
|
||||
//
|
||||
// SiteGate (网站之门) → a bbolt bucket "site_gate" where key = hostname (string),
|
||||
// value = brotli-compressed JSON of SiteInfo struct.
|
||||
//
|
||||
// The Python version used a custom hash-bucket scheme; here bbolt handles it natively.
|
||||
package storage
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/andybalholm/brotli"
|
||||
bolt "go.etcd.io/bbolt"
|
||||
)
|
||||
|
||||
// IndexEntry is one element of the list stored under a keyword in the
// inverted index: a URL paired with a weight.
type IndexEntry struct {
	// Weight is serialized under the JSON key "w".
	Weight float32 `json:"w"`
	// URL is serialized under the JSON key "u".
	URL string `json:"u"`
}
|
||||
|
||||
// SnippetEntry is the snippet data cached for a single URL.
type SnippetEntry struct {
	// Title is serialized under the JSON key "title".
	Title string `json:"title"`
	// Description is serialized under the JSON key "desc".
	Description string `json:"desc"`
	// Text is serialized under the JSON key "text".
	Text string `json:"text"`
	// Timestamp is serialized under the JSON key "ts".
	// NOTE(review): the unit (seconds vs. milliseconds) is not visible here — confirm with the writer.
	Timestamp int64 `json:"ts"`
}
|
||||
|
||||
// Names of the top-level bbolt buckets; Open creates each one if it is missing.
var (
	// bucketIndex is the inverted-index bucket (keyword → entries).
	bucketIndex = []byte("index")
	// bucketGate is the snippet-cache bucket (URL → snippet).
	bucketGate = []byte("gate")
	// bucketSiteGate is the site-metadata bucket (hostname → SiteInfo).
	bucketSiteGate = []byte("site_gate")
)
|
||||
|
||||
// DB wraps a bbolt database and exposes typed access methods.
|
||||
// bbolt handles its own locking internally.
|
||||
type DB struct {
|
||||
db *bolt.DB
|
||||
}
|
||||
|
||||
// Open creates or opens the bbolt database at the given directory path.
|
||||
func Open(dir string) (*DB, error) {
|
||||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||||
return nil, fmt.Errorf("storage.Open mkdir: %w", err)
|
||||
}
|
||||
path := filepath.Join(dir, "sese.db")
|
||||
db, err := bolt.Open(path, 0o600, nil)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("storage.Open bolt: %w", err)
|
||||
}
|
||||
// Ensure buckets exist
|
||||
err = db.Update(func(tx *bolt.Tx) error {
|
||||
for _, b := range [][]byte{bucketIndex, bucketGate, bucketSiteGate} {
|
||||
if _, err := tx.CreateBucketIfNotExists(b); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("storage.Open create buckets: %w", err)
|
||||
}
|
||||
return &DB{db: db}, nil
|
||||
}
|
||||
|
||||
// Close closes the underlying bbolt database.
|
||||
func (d *DB) Close() error {
|
||||
return d.db.Close()
|
||||
}
|
||||
|
||||
// ---- helpers ----
|
||||
|
||||
func compress(data []byte) ([]byte, error) {
|
||||
buf := make([]byte, 0, len(data))
|
||||
w := brotli.NewWriterLevel((*appendWriter)(&buf), 6)
|
||||
if _, err := w.Write(data); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if err := w.Close(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return buf, nil
|
||||
}
|
||||
|
||||
func decompress(data []byte) ([]byte, error) {
|
||||
r := brotli.NewReader(
|
||||
(*byteReader)(&data),
|
||||
)
|
||||
out := make([]byte, 0, len(data)*3)
|
||||
tmp := make([]byte, 4096)
|
||||
for {
|
||||
n, err := r.Read(tmp)
|
||||
out = append(out, tmp[:n]...)
|
||||
if err != nil {
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
return out, err
|
||||
}
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
// appendWriter adapts a byte slice to io.Writer: each Write appends to the
// slice behind the pointer receiver.
type appendWriter []byte

// Write appends p to the slice and reports all of p as written. It never
// returns an error.
func (w *appendWriter) Write(p []byte) (int, error) {
	grown := append([]byte(*w), p...)
	*w = grown
	return len(p), nil
}
|
||||
|
||||
// byteReader adapts a byte slice to io.Reader. Reading consumes the slice:
// after each Read the slice behind the pointer receiver holds only the bytes
// not yet returned.
type byteReader []byte

// Read copies as many pending bytes as fit into p and drops them from the
// slice. It returns io.EOF once no bytes remain.
func (b *byteReader) Read(p []byte) (int, error) {
	pending := *b
	if len(pending) == 0 {
		return 0, io.EOF
	}
	n := copy(p, pending)
	*b = pending[n:]
	return n, nil
}
|
||||
|
||||
func marshalCompress(v any) ([]byte, error) {
|
||||
raw, err := json.Marshal(v)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return compress(raw)
|
||||
}
|
||||
|
||||
func decompressUnmarshal(data []byte, v any) error {
|
||||
raw, err := decompress(data)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return json.Unmarshal(raw, v)
|
||||
}
|
||||
|
||||
// ---- Index (inverted index) ----
|
||||
|
||||
// GetIndex retrieves all IndexEntry values for a keyword.
|
||||
func (d *DB) GetIndex(keyword string) ([]IndexEntry, error) {
|
||||
var entries []IndexEntry
|
||||
err := d.db.View(func(tx *bolt.Tx) error {
|
||||
b := tx.Bucket(bucketIndex)
|
||||
v := b.Get([]byte(keyword))
|
||||
if v == nil {
|
||||
return nil
|
||||
}
|
||||
return decompressUnmarshal(v, &entries)
|
||||
})
|
||||
return entries, err
|
||||
}
|
||||
|
||||
// SetIndex overwrites the IndexEntry list for a keyword.
|
||||
func (d *DB) SetIndex(keyword string, entries []IndexEntry) error {
|
||||
data, err := marshalCompress(entries)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return d.db.Update(func(tx *bolt.Tx) error {
|
||||
return tx.Bucket(bucketIndex).Put([]byte(keyword), data)
|
||||
})
|
||||
}
|
||||
|
||||
// BatchSetIndex writes multiple keyword→entries pairs in one transaction.
|
||||
func (d *DB) BatchSetIndex(batch map[string][]IndexEntry) error {
|
||||
return d.db.Update(func(tx *bolt.Tx) error {
|
||||
b := tx.Bucket(bucketIndex)
|
||||
for keyword, entries := range batch {
|
||||
data, err := marshalCompress(entries)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := b.Put([]byte(keyword), data); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
})
|
||||
}
|
||||
|
||||
// ForEachIndex iterates over all index entries. fn receives keyword and entries.
|
||||
func (d *DB) ForEachIndex(fn func(keyword string, entries []IndexEntry) error) error {
|
||||
return d.db.View(func(tx *bolt.Tx) error {
|
||||
return tx.Bucket(bucketIndex).ForEach(func(k, v []byte) error {
|
||||
var entries []IndexEntry
|
||||
if err := decompressUnmarshal(v, &entries); err != nil {
|
||||
return nil // skip corrupted entries
|
||||
}
|
||||
return fn(string(k), entries)
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
// ---- Gate (URL snippet cache) ----
|
||||
|
||||
// GetSnippet retrieves the cached snippet for a URL.
|
||||
func (d *DB) GetSnippet(url string) (*SnippetEntry, error) {
|
||||
var entry SnippetEntry
|
||||
err := d.db.View(func(tx *bolt.Tx) error {
|
||||
v := tx.Bucket(bucketGate).Get([]byte(url))
|
||||
if v == nil {
|
||||
return fmt.Errorf("not found")
|
||||
}
|
||||
return decompressUnmarshal(v, &entry)
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return &entry, nil
|
||||
}
|
||||
|
||||
// SetSnippet stores a cached snippet for a URL.
|
||||
func (d *DB) SetSnippet(url string, entry *SnippetEntry) error {
|
||||
data, err := marshalCompress(entry)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return d.db.Update(func(tx *bolt.Tx) error {
|
||||
return tx.Bucket(bucketGate).Put([]byte(url), data)
|
||||
})
|
||||
}
|
||||
|
||||
// ---- SiteGate (site metadata) ----
|
||||
|
||||
// SiteInfo is the per-host metadata record kept in the site_gate bucket. It
// mirrors the website dataclass of the Python version (named 网站 there).
type SiteInfo struct {
	// Always serialized, even when zero.
	VisitCount    int   `json:"visit_count"`
	LastVisitTime int64 `json:"last_visit_time"`

	// Optional: left out of the JSON when nil or empty (omitempty).
	Fingerprint    any                `json:"fingerprint,omitempty"`
	SuccessRate    *float64           `json:"success_rate,omitempty"`
	HTMLStructure  string             `json:"html_structure,omitempty"`
	IPs            []string           `json:"ips,omitempty"`
	Quality        *float64           `json:"quality,omitempty"`
	HTTPSAvailable *bool              `json:"https_available,omitempty"`
	Keywords       []string           `json:"keywords,omitempty"`
	OutLinks       []string           `json:"out_links,omitempty"`
	Languages      map[string]float64 `json:"languages,omitempty"`
	Redirects      map[string]string  `json:"redirects,omitempty"`
	ServerTypes    []string           `json:"server_types,omitempty"`
}
|
||||
|
||||
// GetSiteInfo retrieves metadata for a hostname.
|
||||
func (d *DB) GetSiteInfo(host string) (*SiteInfo, error) {
|
||||
var info SiteInfo
|
||||
err := d.db.View(func(tx *bolt.Tx) error {
|
||||
v := tx.Bucket(bucketSiteGate).Get([]byte(host))
|
||||
if v == nil {
|
||||
return fmt.Errorf("not found")
|
||||
}
|
||||
return decompressUnmarshal(v, &info)
|
||||
})
|
||||
if err != nil {
|
||||
return &SiteInfo{Languages: make(map[string]float64), Redirects: make(map[string]string)}, nil
|
||||
}
|
||||
if info.Languages == nil {
|
||||
info.Languages = make(map[string]float64)
|
||||
}
|
||||
if info.Redirects == nil {
|
||||
info.Redirects = make(map[string]string)
|
||||
}
|
||||
return &info, nil
|
||||
}
|
||||
|
||||
// SetSiteInfo stores metadata for a hostname.
|
||||
func (d *DB) SetSiteInfo(host string, info *SiteInfo) error {
|
||||
data, err := marshalCompress(info)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return d.db.Update(func(tx *bolt.Tx) error {
|
||||
return tx.Bucket(bucketSiteGate).Put([]byte(host), data)
|
||||
})
|
||||
}
|
||||
|
||||
// ForEachSite iterates over all site metadata entries.
|
||||
func (d *DB) ForEachSite(fn func(host string, info *SiteInfo) error) error {
|
||||
return d.db.View(func(tx *bolt.Tx) error {
|
||||
return tx.Bucket(bucketSiteGate).ForEach(func(k, v []byte) error {
|
||||
var info SiteInfo
|
||||
if err := decompressUnmarshal(v, &info); err != nil {
|
||||
return nil
|
||||
}
|
||||
return fn(string(k), &info)
|
||||
})
|
||||
})
|
||||
}
|
||||
Reference in New Issue
Block a user