// Package storage provides the persistent index and site-info storage backed by bbolt.
//
// Index space → a single bbolt bucket "index" where key = keyword (string),
// value = brotli-compressed JSON array of IndexEntry objects ({"w": weight, "u": url}).
//
// Gate → a bbolt bucket "gate" where key = URL (string),
// value = brotli-compressed JSON object of SnippetEntry (keys title, desc, text, ts).
//
// SiteGate → a bbolt bucket "site_gate" where key = hostname (string),
// value = brotli-compressed JSON of SiteInfo struct.
//
// The Python version used a custom hash-bucket scheme; here bbolt handles it natively.
package storage

import (
	"encoding/json"
	"fmt"
	"io"
	"os"
	"path/filepath"

	"github.com/andybalholm/brotli"
	bolt "go.etcd.io/bbolt"
)

// IndexEntry is a single entry in the inverted index.
// It is serialized with the short JSON keys "w" and "u".
type IndexEntry struct {
	Weight float32 `json:"w"`
	URL    string  `json:"u"`
}

// SnippetEntry is cached snippet data for a URL.
type SnippetEntry struct {
	Title       string `json:"title"`
	Description string `json:"desc"`
	Text        string `json:"text"`
	Timestamp   int64  `json:"ts"`
}

// Names of the three top-level bbolt buckets used by this package.
var (
	bucketIndex    = []byte("index")
	bucketGate     = []byte("gate")
	bucketSiteGate = []byte("site_gate")
)

// DB wraps a bbolt database and exposes typed access methods.
// bbolt handles its own locking internally.
type DB struct {
	db *bolt.DB
}

// Open creates or opens the bbolt database at the given directory path.
// The directory is created if missing, the database file is "sese.db" inside
// it, and the "index", "gate" and "site_gate" buckets are ensured to exist.
func Open(dir string) (*DB, error) { if err := os.MkdirAll(dir, 0o755); err != nil { return nil, fmt.Errorf("storage.Open mkdir: %w", err) } path := filepath.Join(dir, "sese.db") db, err := bolt.Open(path, 0o600, nil) if err != nil { return nil, fmt.Errorf("storage.Open bolt: %w", err) } // Ensure buckets exist err = db.Update(func(tx *bolt.Tx) error { for _, b := range [][]byte{bucketIndex, bucketGate, bucketSiteGate} { if _, err := tx.CreateBucketIfNotExists(b); err != nil { return err } } return nil }) if err != nil { return nil, fmt.Errorf("storage.Open create buckets: %w", err) } return &DB{db: db}, nil } // Close closes the underlying bbolt database. func (d *DB) Close() error { return d.db.Close() } // ---- helpers ---- func compress(data []byte) ([]byte, error) { buf := make([]byte, 0, len(data)) w := brotli.NewWriterLevel((*appendWriter)(&buf), 6) if _, err := w.Write(data); err != nil { return nil, err } if err := w.Close(); err != nil { return nil, err } return buf, nil } func decompress(data []byte) ([]byte, error) { r := brotli.NewReader( (*byteReader)(&data), ) out := make([]byte, 0, len(data)*3) tmp := make([]byte, 4096) for { n, err := r.Read(tmp) out = append(out, tmp[:n]...) if err != nil { if err == io.EOF { break } return out, err } } return out, nil } // appendWriter implements io.Writer on top of a *[]byte. type appendWriter []byte func (a *appendWriter) Write(p []byte) (int, error) { *a = append(*a, p...) return len(p), nil } // byteReader wraps []byte as io.Reader. 
type byteReader []byte func (b *byteReader) Read(p []byte) (int, error) { if len(*b) == 0 { return 0, io.EOF } n := copy(p, *b) *b = (*b)[n:] return n, nil } func marshalCompress(v any) ([]byte, error) { raw, err := json.Marshal(v) if err != nil { return nil, err } return compress(raw) } func decompressUnmarshal(data []byte, v any) error { raw, err := decompress(data) if err != nil { return err } return json.Unmarshal(raw, v) } // ---- Index (inverted index) ---- // GetIndex retrieves all IndexEntry values for a keyword. func (d *DB) GetIndex(keyword string) ([]IndexEntry, error) { var entries []IndexEntry err := d.db.View(func(tx *bolt.Tx) error { b := tx.Bucket(bucketIndex) v := b.Get([]byte(keyword)) if v == nil { return nil } return decompressUnmarshal(v, &entries) }) return entries, err } // SetIndex overwrites the IndexEntry list for a keyword. func (d *DB) SetIndex(keyword string, entries []IndexEntry) error { data, err := marshalCompress(entries) if err != nil { return err } return d.db.Update(func(tx *bolt.Tx) error { return tx.Bucket(bucketIndex).Put([]byte(keyword), data) }) } // BatchSetIndex writes multiple keyword→entries pairs in one transaction. func (d *DB) BatchSetIndex(batch map[string][]IndexEntry) error { return d.db.Update(func(tx *bolt.Tx) error { b := tx.Bucket(bucketIndex) for keyword, entries := range batch { data, err := marshalCompress(entries) if err != nil { return err } if err := b.Put([]byte(keyword), data); err != nil { return err } } return nil }) } // ForEachIndex iterates over all index entries. fn receives keyword and entries. 
func (d *DB) ForEachIndex(fn func(keyword string, entries []IndexEntry) error) error { return d.db.View(func(tx *bolt.Tx) error { return tx.Bucket(bucketIndex).ForEach(func(k, v []byte) error { var entries []IndexEntry if err := decompressUnmarshal(v, &entries); err != nil { return nil // skip corrupted entries } return fn(string(k), entries) }) }) } // ---- Gate (URL snippet cache) ---- // GetSnippet retrieves the cached snippet for a URL. func (d *DB) GetSnippet(url string) (*SnippetEntry, error) { var entry SnippetEntry err := d.db.View(func(tx *bolt.Tx) error { v := tx.Bucket(bucketGate).Get([]byte(url)) if v == nil { return fmt.Errorf("not found") } return decompressUnmarshal(v, &entry) }) if err != nil { return nil, err } return &entry, nil } // SetSnippet stores a cached snippet for a URL. func (d *DB) SetSnippet(url string, entry *SnippetEntry) error { data, err := marshalCompress(entry) if err != nil { return err } return d.db.Update(func(tx *bolt.Tx) error { return tx.Bucket(bucketGate).Put([]byte(url), data) }) } // ---- SiteGate (site metadata) ---- // SiteInfo mirrors the Python 网站 dataclass. type SiteInfo struct { VisitCount int `json:"visit_count"` LastVisitTime int64 `json:"last_visit_time"` Fingerprint any `json:"fingerprint,omitempty"` SuccessRate *float64 `json:"success_rate,omitempty"` HTMLStructure string `json:"html_structure,omitempty"` IPs []string `json:"ips,omitempty"` Quality *float64 `json:"quality,omitempty"` HTTPSAvailable *bool `json:"https_available,omitempty"` Keywords []string `json:"keywords,omitempty"` OutLinks []string `json:"out_links,omitempty"` Languages map[string]float64 `json:"languages,omitempty"` Redirects map[string]string `json:"redirects,omitempty"` ServerTypes []string `json:"server_types,omitempty"` } // GetSiteInfo retrieves metadata for a hostname. 
func (d *DB) GetSiteInfo(host string) (*SiteInfo, error) { var info SiteInfo err := d.db.View(func(tx *bolt.Tx) error { v := tx.Bucket(bucketSiteGate).Get([]byte(host)) if v == nil { return fmt.Errorf("not found") } return decompressUnmarshal(v, &info) }) if err != nil { return &SiteInfo{Languages: make(map[string]float64), Redirects: make(map[string]string)}, nil } if info.Languages == nil { info.Languages = make(map[string]float64) } if info.Redirects == nil { info.Redirects = make(map[string]string) } return &info, nil } // SetSiteInfo stores metadata for a hostname. func (d *DB) SetSiteInfo(host string, info *SiteInfo) error { data, err := marshalCompress(info) if err != nil { return err } return d.db.Update(func(tx *bolt.Tx) error { return tx.Bucket(bucketSiteGate).Put([]byte(host), data) }) } // ForEachSite iterates over all site metadata entries. func (d *DB) ForEachSite(fn func(host string, info *SiteInfo) error) error { return d.db.View(func(tx *bolt.Tx) error { return tx.Bucket(bucketSiteGate).ForEach(func(k, v []byte) error { var info SiteInfo if err := decompressUnmarshal(v, &info); err != nil { return nil } return fn(string(k), &info) }) }) }