Files
sese-engine-go/storage/storage.go
T

301 lines
7.9 KiB
Go

// Package storage provides the persistent index and site-info storage backed by bbolt.
//
// Index space → a single bbolt bucket "index" where key = keyword (string),
// value = brotli-compressed JSON array of {"w": weight, "u": url} objects.
//
// Gate (门) → a bbolt bucket "gate" where key = URL (string),
// value = brotli-compressed JSON object with keys "title", "desc", "text", "ts".
//
// SiteGate (网站之门) → a bbolt bucket "site_gate" where key = hostname (string),
// value = brotli-compressed JSON of SiteInfo struct.
//
// The Python version used a custom hash-bucket scheme; here bbolt handles it natively.
package storage
import (
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"os"
	"path/filepath"

	"github.com/andybalholm/brotli"
	bolt "go.etcd.io/bbolt"
)
// IndexEntry is a single entry in the inverted index: one URL and the weight
// recorded for it under the keyword the entry list is stored at.
type IndexEntry struct {
// Weight is serialized under the JSON key "w".
Weight float32 `json:"w"`
// URL is serialized under the JSON key "u".
URL string `json:"u"`
}
// SnippetEntry is cached snippet data for a URL. It is stored in the "gate"
// bucket as a JSON object with the keys given in the field tags.
type SnippetEntry struct {
// Title is serialized under the JSON key "title".
Title string `json:"title"`
// Description is serialized under the JSON key "desc".
Description string `json:"desc"`
// Text is serialized under the JSON key "text".
Text string `json:"text"`
// Timestamp is serialized under the JSON key "ts".
// NOTE(review): the unit is not visible in this file (presumably Unix time) — confirm against the writer.
Timestamp int64 `json:"ts"`
}
// Names of the three top-level bbolt buckets used by this package.
var (
// bucketIndex holds the inverted index (keyword → entry list).
bucketIndex = []byte("index")
// bucketGate holds cached snippets (URL → SnippetEntry).
bucketGate = []byte("gate")
// bucketSiteGate holds per-host metadata (hostname → SiteInfo).
bucketSiteGate = []byte("site_gate")
)
// DB wraps a bbolt database and exposes typed access methods.
// bbolt handles its own locking internally.
// Obtain a DB from Open and release it with Close.
type DB struct {
// db is the underlying bbolt handle that every method runs its transactions on.
db *bolt.DB
}
// Open creates dir if it does not exist, opens (or creates) the bbolt database
// file "sese.db" inside it, and makes sure the index, gate and site_gate
// buckets exist.
//
// On any failure it returns a nil *DB and a wrapped error. If the database was
// opened but the buckets could not be created, the handle is closed before
// returning so it is not leaked to a caller that never receives it.
func Open(dir string) (*DB, error) {
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return nil, fmt.Errorf("storage.Open mkdir: %w", err)
	}

	path := filepath.Join(dir, "sese.db")
	db, err := bolt.Open(path, 0o600, nil)
	if err != nil {
		return nil, fmt.Errorf("storage.Open bolt: %w", err)
	}

	// Ensure buckets exist so the accessors can use tx.Bucket without nil checks.
	err = db.Update(func(tx *bolt.Tx) error {
		for _, b := range [][]byte{bucketIndex, bucketGate, bucketSiteGate} {
			if _, err := tx.CreateBucketIfNotExists(b); err != nil {
				return err
			}
		}
		return nil
	})
	if err != nil {
		// Best-effort close: the bucket-creation error is the one worth
		// reporting, so a secondary Close error is intentionally dropped.
		_ = db.Close()
		return nil, fmt.Errorf("storage.Open create buckets: %w", err)
	}

	return &DB{db: db}, nil
}
// Close shuts down the wrapped bbolt database and reports whatever error the
// underlying Close call produced.
func (d *DB) Close() error {
	handle := d.db
	return handle.Close()
}
// ---- helpers ----
func compress(data []byte) ([]byte, error) {
buf := make([]byte, 0, len(data))
w := brotli.NewWriterLevel((*appendWriter)(&buf), 6)
if _, err := w.Write(data); err != nil {
return nil, err
}
if err := w.Close(); err != nil {
return nil, err
}
return buf, nil
}
// decompress decodes a brotli stream held in data and returns the decoded
// bytes.
//
// If the stream cannot be decoded it returns nil and the reader's error; no
// partially decoded output is handed back alongside an error.
func decompress(data []byte) ([]byte, error) {
	// The byteReader advances this function's own copy of the slice header;
	// the caller's bytes are only read, never modified.
	out, err := io.ReadAll(brotli.NewReader((*byteReader)(&data)))
	if err != nil {
		return nil, err
	}
	return out, nil
}
// appendWriter is a byte slice whose pointer can be used as an io.Writer:
// every Write grows the slice in place.
type appendWriter []byte

// Write appends p to the slice and reports all of p as written. It never
// returns an error.
func (a *appendWriter) Write(p []byte) (int, error) {
	grown := append(*a, p...)
	*a = grown
	return len(p), nil
}
// byteReader is a byte slice whose pointer can be used as an io.Reader: each
// Read consumes bytes from the front of the slice.
type byteReader []byte

// Read copies as many pending bytes as fit into p, drops them from the slice,
// and returns the count. Once nothing is left it returns 0 and io.EOF.
func (b *byteReader) Read(p []byte) (int, error) {
	pending := *b
	if len(pending) == 0 {
		return 0, io.EOF
	}
	copied := copy(p, pending)
	*b = pending[copied:]
	return copied, nil
}
// marshalCompress encodes v as JSON and returns the brotli-compressed result.
// A JSON encoding error is returned as-is with a nil slice.
func marshalCompress(v any) ([]byte, error) {
	encoded, err := json.Marshal(v)
	if err == nil {
		return compress(encoded)
	}
	return nil, err
}
// decompressUnmarshal is the inverse of marshalCompress: it brotli-decodes
// data and JSON-decodes the result into v. It returns the first error from
// either step.
func decompressUnmarshal(data []byte, v any) error {
	plain, err := decompress(data)
	if err == nil {
		err = json.Unmarshal(plain, v)
	}
	return err
}
// ---- Index (inverted index) ----
// GetIndex looks up keyword in the index bucket and returns its decoded entry
// list. A keyword with no stored value yields a nil slice and a nil error.
func (d *DB) GetIndex(keyword string) ([]IndexEntry, error) {
	var result []IndexEntry
	key := []byte(keyword)

	lookup := func(tx *bolt.Tx) error {
		if raw := tx.Bucket(bucketIndex).Get(key); raw != nil {
			return decompressUnmarshal(raw, &result)
		}
		return nil
	}

	err := d.db.View(lookup)
	return result, err
}
// SetIndex replaces whatever is stored for keyword with entries. The list is
// encoded before the write transaction starts, so an encoding error leaves the
// database untouched.
func (d *DB) SetIndex(keyword string, entries []IndexEntry) error {
	payload, err := marshalCompress(entries)
	if err != nil {
		return err
	}

	key := []byte(keyword)
	put := func(tx *bolt.Tx) error {
		return tx.Bucket(bucketIndex).Put(key, payload)
	}
	return d.db.Update(put)
}
// BatchSetIndex writes multiple keyword→entries pairs in one transaction, so
// either every pair is stored or none is.
//
// All entry lists are JSON-encoded and compressed before the write transaction
// is opened: the transaction then only performs the Put calls, and an encoding
// failure is reported without touching the database. An empty (or nil) batch
// returns nil without opening a transaction.
func (d *DB) BatchSetIndex(batch map[string][]IndexEntry) error {
	if len(batch) == 0 {
		return nil
	}

	// Encode outside the transaction; this is the CPU-heavy part.
	encoded := make(map[string][]byte, len(batch))
	for keyword, entries := range batch {
		data, err := marshalCompress(entries)
		if err != nil {
			return err
		}
		encoded[keyword] = data
	}

	return d.db.Update(func(tx *bolt.Tx) error {
		b := tx.Bucket(bucketIndex)
		for keyword, data := range encoded {
			if err := b.Put([]byte(keyword), data); err != nil {
				return err
			}
		}
		return nil
	})
}
// ForEachIndex walks every key in the index bucket inside a read transaction
// and calls fn with the keyword and its decoded entries. Values that fail to
// decode are skipped silently. A non-nil error from fn stops the walk and is
// returned.
func (d *DB) ForEachIndex(fn func(keyword string, entries []IndexEntry) error) error {
	visit := func(key, val []byte) error {
		var decoded []IndexEntry
		if decompressUnmarshal(val, &decoded) != nil {
			// skip corrupted entries
			return nil
		}
		return fn(string(key), decoded)
	}

	return d.db.View(func(tx *bolt.Tx) error {
		return tx.Bucket(bucketIndex).ForEach(visit)
	})
}
// ---- Gate (URL snippet cache) ----
// ErrNotFound is returned by GetSnippet when no snippet is stored for the
// requested URL. Callers can test for it with errors.Is to tell a missing
// entry apart from a decoding failure. Its message is "not found".
var ErrNotFound = errors.New("not found")

// GetSnippet retrieves the cached snippet for a URL.
//
// It returns ErrNotFound when the gate bucket has no value for url, and the
// decoding error when the stored value cannot be decompressed or parsed. In
// both cases the returned *SnippetEntry is nil.
func (d *DB) GetSnippet(url string) (*SnippetEntry, error) {
	var entry SnippetEntry
	err := d.db.View(func(tx *bolt.Tx) error {
		v := tx.Bucket(bucketGate).Get([]byte(url))
		if v == nil {
			return ErrNotFound
		}
		return decompressUnmarshal(v, &entry)
	})
	if err != nil {
		return nil, err
	}
	return &entry, nil
}
// SetSnippet saves entry in the gate bucket under url, replacing any previous
// value. The entry is encoded before the write transaction starts.
func (d *DB) SetSnippet(url string, entry *SnippetEntry) error {
	payload, err := marshalCompress(entry)
	if err != nil {
		return err
	}

	key := []byte(url)
	put := func(tx *bolt.Tx) error {
		return tx.Bucket(bucketGate).Put(key, payload)
	}
	return d.db.Update(put)
}
// ---- SiteGate (site metadata) ----
// SiteInfo mirrors the Python 网站 ("website") dataclass.
// It is the per-hostname record stored in the "site_gate" bucket. Every field
// except VisitCount and LastVisitTime is omitted from the JSON when empty/nil.
// NOTE(review): field meanings below are inferred from names only — confirm
// against the Python original.
type SiteInfo struct {
// VisitCount is serialized as "visit_count".
VisitCount int `json:"visit_count"`
// LastVisitTime is serialized as "last_visit_time"; unit not shown here (presumably Unix time — confirm).
LastVisitTime int64 `json:"last_visit_time"`
// Fingerprint is serialized as "fingerprint"; its concrete shape is not fixed by this file.
Fingerprint any `json:"fingerprint,omitempty"`
// SuccessRate is serialized as "success_rate"; nil means the key is absent.
SuccessRate *float64 `json:"success_rate,omitempty"`
// HTMLStructure is serialized as "html_structure".
HTMLStructure string `json:"html_structure,omitempty"`
// IPs is serialized as "ips".
IPs []string `json:"ips,omitempty"`
// Quality is serialized as "quality"; nil means the key is absent.
Quality *float64 `json:"quality,omitempty"`
// HTTPSAvailable is serialized as "https_available"; nil means the key is absent.
HTTPSAvailable *bool `json:"https_available,omitempty"`
// Keywords is serialized as "keywords".
Keywords []string `json:"keywords,omitempty"`
// OutLinks is serialized as "out_links".
OutLinks []string `json:"out_links,omitempty"`
// Languages is serialized as "languages"; GetSiteInfo always returns it non-nil.
Languages map[string]float64 `json:"languages,omitempty"`
// Redirects is serialized as "redirects"; GetSiteInfo always returns it non-nil.
Redirects map[string]string `json:"redirects,omitempty"`
// ServerTypes is serialized as "server_types".
ServerTypes []string `json:"server_types,omitempty"`
}
// GetSiteInfo returns the stored metadata for host.
//
// The result is never nil and the error is always nil: when the host has no
// record, or the read or decode fails for any reason, a blank SiteInfo is
// returned instead. Languages and Redirects are always non-nil maps, so
// callers can write to them directly.
func (d *DB) GetSiteInfo(host string) (*SiteInfo, error) {
	info := new(SiteInfo)

	err := d.db.View(func(tx *bolt.Tx) error {
		raw := tx.Bucket(bucketSiteGate).Get([]byte(host))
		if raw == nil {
			return fmt.Errorf("not found")
		}
		return decompressUnmarshal(raw, info)
	})
	if err != nil {
		// Discard anything a failed decode may have filled in and start blank.
		info = new(SiteInfo)
	}

	if info.Languages == nil {
		info.Languages = make(map[string]float64)
	}
	if info.Redirects == nil {
		info.Redirects = make(map[string]string)
	}
	return info, nil
}
// SetSiteInfo saves info in the site_gate bucket under host, replacing any
// previous value. The record is encoded before the write transaction starts.
func (d *DB) SetSiteInfo(host string, info *SiteInfo) error {
	payload, err := marshalCompress(info)
	if err != nil {
		return err
	}

	key := []byte(host)
	put := func(tx *bolt.Tx) error {
		return tx.Bucket(bucketSiteGate).Put(key, payload)
	}
	return d.db.Update(put)
}
// ForEachSite walks every key in the site_gate bucket inside a read
// transaction and calls fn with the hostname and its decoded record. Values
// that fail to decode are skipped silently. A non-nil error from fn stops the
// walk and is returned.
func (d *DB) ForEachSite(fn func(host string, info *SiteInfo) error) error {
	visit := func(key, val []byte) error {
		record := new(SiteInfo)
		if decompressUnmarshal(val, record) != nil {
			// Undecodable record: skip it rather than abort the whole walk.
			return nil
		}
		return fn(string(key), record)
	}

	return d.db.View(func(tx *bolt.Tx) error {
		return tx.Bucket(bucketSiteGate).ForEach(visit)
	})
}