// Package storage provides the persistent index and site-info storage backed by bbolt.
//
// Index space → a single bbolt bucket "index" where key = keyword (string),
// value = brotli-compressed JSON array of {"w": weight, "u": url} objects.
//
// Gate (门 in the Python version) → a bbolt bucket "gate" where key = URL (string),
// value = brotli-compressed JSON object with the fields title, desc, text and ts.
//
// SiteGate (网站之门 in the Python version) → a bbolt bucket "site_gate" where
// key = hostname (string), value = brotli-compressed JSON of the SiteInfo struct.
//
// The Python version used a custom hash-bucket scheme; here bbolt handles it natively.
package storage
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
|
|
"github.com/andybalholm/brotli"
|
|
bolt "go.etcd.io/bbolt"
|
|
)
|
|
|
|
// IndexEntry is a single entry in the inverted index: a URL together with the
// weight stored for it under a keyword.
//
// It serializes to JSON as an object of the form {"w": <weight>, "u": <url>}.
type IndexEntry struct {
	Weight float32 `json:"w"`
	URL    string  `json:"u"`
}
// SnippetEntry is cached snippet data for a URL.
//
// It serializes to JSON as an object with the keys "title", "desc", "text"
// and "ts". The unit of Timestamp is whatever the caller stores; this package
// does not interpret it.
type SnippetEntry struct {
	Title       string `json:"title"`
	Description string `json:"desc"`
	Text        string `json:"text"`
	Timestamp   int64  `json:"ts"`
}
// Names of the top-level bbolt buckets used by this package.
var (
	bucketIndex    = []byte("index")     // keyword → compressed []IndexEntry
	bucketGate     = []byte("gate")      // URL → compressed SnippetEntry
	bucketSiteGate = []byte("site_gate") // hostname → compressed SiteInfo
)
// DB wraps a bbolt database and exposes typed access methods.
|
|
// bbolt handles its own locking internally.
|
|
type DB struct {
|
|
db *bolt.DB
|
|
}
|
|
|
|
// Open creates or opens the bbolt database at the given directory path.
|
|
func Open(dir string) (*DB, error) {
|
|
if err := os.MkdirAll(dir, 0o755); err != nil {
|
|
return nil, fmt.Errorf("storage.Open mkdir: %w", err)
|
|
}
|
|
path := filepath.Join(dir, "sese.db")
|
|
db, err := bolt.Open(path, 0o600, nil)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("storage.Open bolt: %w", err)
|
|
}
|
|
// Ensure buckets exist
|
|
err = db.Update(func(tx *bolt.Tx) error {
|
|
for _, b := range [][]byte{bucketIndex, bucketGate, bucketSiteGate} {
|
|
if _, err := tx.CreateBucketIfNotExists(b); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("storage.Open create buckets: %w", err)
|
|
}
|
|
return &DB{db: db}, nil
|
|
}
|
|
|
|
// Close closes the underlying bbolt database.
|
|
func (d *DB) Close() error {
|
|
return d.db.Close()
|
|
}
|
|
|
|
// ---- helpers ----
func compress(data []byte) ([]byte, error) {
|
|
buf := make([]byte, 0, len(data))
|
|
w := brotli.NewWriterLevel((*appendWriter)(&buf), 6)
|
|
if _, err := w.Write(data); err != nil {
|
|
return nil, err
|
|
}
|
|
if err := w.Close(); err != nil {
|
|
return nil, err
|
|
}
|
|
return buf, nil
|
|
}
|
|
|
|
func decompress(data []byte) ([]byte, error) {
|
|
r := brotli.NewReader(
|
|
(*byteReader)(&data),
|
|
)
|
|
out := make([]byte, 0, len(data)*3)
|
|
tmp := make([]byte, 4096)
|
|
for {
|
|
n, err := r.Read(tmp)
|
|
out = append(out, tmp[:n]...)
|
|
if err != nil {
|
|
if err == io.EOF {
|
|
break
|
|
}
|
|
return out, err
|
|
}
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
// appendWriter implements io.Writer on top of a *[]byte: a pointer to any
// byte slice can be converted to *appendWriter, and every Write appends to
// that slice in place.
type appendWriter []byte

// Write appends p to the underlying slice. It always consumes all of p and
// never returns an error.
func (a *appendWriter) Write(p []byte) (int, error) {
	*a = append(*a, p...)
	return len(p), nil
}
// byteReader wraps []byte as io.Reader. A pointer to any byte slice can be
// converted to *byteReader; reading re-slices that slice past the consumed
// bytes, so the caller's slice variable shrinks as data is read.
type byteReader []byte

// Read copies as many remaining bytes as fit into p and advances past them.
// It returns io.EOF once no bytes remain.
func (b *byteReader) Read(p []byte) (int, error) {
	if len(*b) == 0 {
		return 0, io.EOF
	}
	n := copy(p, *b)
	*b = (*b)[n:]
	return n, nil
}
func marshalCompress(v any) ([]byte, error) {
|
|
raw, err := json.Marshal(v)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return compress(raw)
|
|
}
|
|
|
|
func decompressUnmarshal(data []byte, v any) error {
|
|
raw, err := decompress(data)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return json.Unmarshal(raw, v)
|
|
}
|
|
|
|
// ---- Index (inverted index) ----
// GetIndex retrieves all IndexEntry values for a keyword.
|
|
func (d *DB) GetIndex(keyword string) ([]IndexEntry, error) {
|
|
var entries []IndexEntry
|
|
err := d.db.View(func(tx *bolt.Tx) error {
|
|
b := tx.Bucket(bucketIndex)
|
|
v := b.Get([]byte(keyword))
|
|
if v == nil {
|
|
return nil
|
|
}
|
|
return decompressUnmarshal(v, &entries)
|
|
})
|
|
return entries, err
|
|
}
|
|
|
|
// SetIndex overwrites the IndexEntry list for a keyword.
|
|
func (d *DB) SetIndex(keyword string, entries []IndexEntry) error {
|
|
data, err := marshalCompress(entries)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return d.db.Update(func(tx *bolt.Tx) error {
|
|
return tx.Bucket(bucketIndex).Put([]byte(keyword), data)
|
|
})
|
|
}
|
|
|
|
// BatchSetIndex writes multiple keyword→entries pairs in one transaction.
|
|
func (d *DB) BatchSetIndex(batch map[string][]IndexEntry) error {
|
|
return d.db.Update(func(tx *bolt.Tx) error {
|
|
b := tx.Bucket(bucketIndex)
|
|
for keyword, entries := range batch {
|
|
data, err := marshalCompress(entries)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err := b.Put([]byte(keyword), data); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
})
|
|
}
|
|
|
|
// ForEachIndex iterates over all index entries. fn receives keyword and entries.
|
|
func (d *DB) ForEachIndex(fn func(keyword string, entries []IndexEntry) error) error {
|
|
return d.db.View(func(tx *bolt.Tx) error {
|
|
return tx.Bucket(bucketIndex).ForEach(func(k, v []byte) error {
|
|
var entries []IndexEntry
|
|
if err := decompressUnmarshal(v, &entries); err != nil {
|
|
return nil // skip corrupted entries
|
|
}
|
|
return fn(string(k), entries)
|
|
})
|
|
})
|
|
}
|
|
|
|
// ---- Gate (URL snippet cache) ----
// GetSnippet retrieves the cached snippet for a URL.
|
|
func (d *DB) GetSnippet(url string) (*SnippetEntry, error) {
|
|
var entry SnippetEntry
|
|
err := d.db.View(func(tx *bolt.Tx) error {
|
|
v := tx.Bucket(bucketGate).Get([]byte(url))
|
|
if v == nil {
|
|
return fmt.Errorf("not found")
|
|
}
|
|
return decompressUnmarshal(v, &entry)
|
|
})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return &entry, nil
|
|
}
|
|
|
|
// SetSnippet stores a cached snippet for a URL.
|
|
func (d *DB) SetSnippet(url string, entry *SnippetEntry) error {
|
|
data, err := marshalCompress(entry)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return d.db.Update(func(tx *bolt.Tx) error {
|
|
return tx.Bucket(bucketGate).Put([]byte(url), data)
|
|
})
|
|
}
|
|
|
|
// ---- SiteGate (site metadata) ----
// SiteInfo mirrors the Python 网站 ("website") dataclass.
//
// Only visit_count and last_visit_time are always written to JSON; every
// other field carries omitempty and is left out when it holds its zero value.
// The pointer-typed fields (SuccessRate, Quality, HTTPSAvailable) stay nil
// when the stored JSON has no value for them.
type SiteInfo struct {
	VisitCount     int                `json:"visit_count"`
	LastVisitTime  int64              `json:"last_visit_time"`
	Fingerprint    any                `json:"fingerprint,omitempty"`
	SuccessRate    *float64           `json:"success_rate,omitempty"`
	HTMLStructure  string             `json:"html_structure,omitempty"`
	IPs            []string           `json:"ips,omitempty"`
	Quality        *float64           `json:"quality,omitempty"`
	HTTPSAvailable *bool              `json:"https_available,omitempty"`
	Keywords       []string           `json:"keywords,omitempty"`
	OutLinks       []string           `json:"out_links,omitempty"`
	Languages      map[string]float64 `json:"languages,omitempty"`
	Redirects      map[string]string  `json:"redirects,omitempty"`
	ServerTypes    []string           `json:"server_types,omitempty"`
}
// GetSiteInfo retrieves metadata for a hostname.
|
|
func (d *DB) GetSiteInfo(host string) (*SiteInfo, error) {
|
|
var info SiteInfo
|
|
err := d.db.View(func(tx *bolt.Tx) error {
|
|
v := tx.Bucket(bucketSiteGate).Get([]byte(host))
|
|
if v == nil {
|
|
return fmt.Errorf("not found")
|
|
}
|
|
return decompressUnmarshal(v, &info)
|
|
})
|
|
if err != nil {
|
|
return &SiteInfo{Languages: make(map[string]float64), Redirects: make(map[string]string)}, nil
|
|
}
|
|
if info.Languages == nil {
|
|
info.Languages = make(map[string]float64)
|
|
}
|
|
if info.Redirects == nil {
|
|
info.Redirects = make(map[string]string)
|
|
}
|
|
return &info, nil
|
|
}
|
|
|
|
// SetSiteInfo stores metadata for a hostname.
|
|
func (d *DB) SetSiteInfo(host string, info *SiteInfo) error {
|
|
data, err := marshalCompress(info)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return d.db.Update(func(tx *bolt.Tx) error {
|
|
return tx.Bucket(bucketSiteGate).Put([]byte(host), data)
|
|
})
|
|
}
|
|
|
|
// ForEachSite iterates over all site metadata entries.
|
|
func (d *DB) ForEachSite(fn func(host string, info *SiteInfo) error) error {
|
|
return d.db.View(func(tx *bolt.Tx) error {
|
|
return tx.Bucket(bucketSiteGate).ForEach(func(k, v []byte) error {
|
|
var info SiteInfo
|
|
if err := decompressUnmarshal(v, &info); err != nil {
|
|
return nil
|
|
}
|
|
return fn(string(k), &info)
|
|
})
|
|
})
|
|
}
|