207 lines
4.5 KiB
Go
207 lines
4.5 KiB
Go
// Package info loads and serves auxiliary data: backlink scores, the adjustment
// table, and blocked query words.
|
|
package info
|
|
|
|
import (
|
|
"encoding/json"
|
|
"math"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
)
|
|
|
|
// Service loads the prosperity map, adjustment table, and blocked words.
|
|
type Service struct {
|
|
mu sync.RWMutex
|
|
prosperMap map[string]float64 // normalised backlink scores
|
|
adjustTable map[string]float64 // per-domain manual weight adjustments
|
|
blockedWords map[string]bool
|
|
storagePath string
|
|
}
|
|
|
|
// New creates and loads the info service from storagePath.
|
|
func New(storagePath string) *Service {
|
|
s := &Service{storagePath: storagePath}
|
|
s.Reload()
|
|
return s
|
|
}
|
|
|
|
// Reload re-reads all data files from disk.
|
|
func (s *Service) Reload() {
|
|
s.mu.Lock()
|
|
defer s.mu.Unlock()
|
|
s.prosperMap = loadProsperMap(s.storagePath)
|
|
s.adjustTable = loadAdjustTable()
|
|
s.blockedWords = loadBlockedWords()
|
|
}
|
|
|
|
// Prosper returns the backlink score for a URL (sum of its path components).
|
|
func (s *Service) Prosper(rawURL string) float64 {
|
|
s.mu.RLock()
|
|
defer s.mu.RUnlock()
|
|
return prosperFor(rawURL, s.prosperMap)
|
|
}
|
|
|
|
// ProsperMap returns the full prosperity map (read-only snapshot).
|
|
func (s *Service) ProsperMap() map[string]float64 {
|
|
s.mu.RLock()
|
|
defer s.mu.RUnlock()
|
|
out := make(map[string]float64, len(s.prosperMap))
|
|
for k, v := range s.prosperMap {
|
|
out[k] = v
|
|
}
|
|
return out
|
|
}
|
|
|
|
// Adjust returns the manual weight multiplier for a hostname (default 1.0).
|
|
func (s *Service) Adjust(host string) float64 {
|
|
s.mu.RLock()
|
|
defer s.mu.RUnlock()
|
|
if v, ok := s.adjustTable[host]; ok {
|
|
return v
|
|
}
|
|
return 1.0
|
|
}
|
|
|
|
// IsBlocked returns true if the word is in the blocked list.
|
|
func (s *Service) IsBlocked(word string) bool {
|
|
s.mu.RLock()
|
|
defer s.mu.RUnlock()
|
|
return s.blockedWords[word]
|
|
}
|
|
|
|
// ---- loaders ----
|
|
|
|
const backlinkBaseline = 200000.0
|
|
|
|
func loadProsperMap(storagePath string) map[string]float64 {
|
|
path := filepath.Join(storagePath, "prosper.json")
|
|
f, err := os.Open(path)
|
|
if err != nil {
|
|
return map[string]float64{}
|
|
}
|
|
defer f.Close()
|
|
var raw map[string]float64
|
|
if err := json.NewDecoder(f).Decode(&raw); err != nil {
|
|
return map[string]float64{}
|
|
}
|
|
return normalise(raw)
|
|
}
|
|
|
|
func normalise(d map[string]float64) map[string]float64 {
|
|
total := 0.0
|
|
for k, v := range d {
|
|
if !strings.Contains(k, "/") {
|
|
total += v
|
|
}
|
|
}
|
|
if total == 0 {
|
|
return d
|
|
}
|
|
factor := backlinkBaseline / total
|
|
out := make(map[string]float64, len(d))
|
|
for k, v := range d {
|
|
out[k] = v * factor
|
|
}
|
|
// Propagate max score up the domain tree
|
|
for k, v := range out {
|
|
now := k
|
|
for {
|
|
idx := strings.Index(now, ".")
|
|
if idx < 0 {
|
|
break
|
|
}
|
|
now = now[idx+1:]
|
|
if cur, ok := out[now]; ok && cur < v {
|
|
out[now] = v
|
|
} else if !ok {
|
|
break
|
|
}
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func loadAdjustTable() map[string]float64 {
|
|
// Try loading from data/adjust.json — fallback if absent
|
|
f, err := os.Open(filepath.Join("data", "adjust.json"))
|
|
if err != nil {
|
|
return map[string]float64{}
|
|
}
|
|
defer f.Close()
|
|
var m map[string]float64
|
|
json.NewDecoder(f).Decode(&m)
|
|
return m
|
|
}
|
|
|
|
func loadBlockedWords() map[string]bool {
|
|
f, err := os.Open(filepath.Join("data", "blocked_words.json"))
|
|
if err != nil {
|
|
return map[string]bool{}
|
|
}
|
|
defer f.Close()
|
|
var words []string
|
|
json.NewDecoder(f).Decode(&words)
|
|
m := make(map[string]bool, len(words))
|
|
for _, w := range words {
|
|
m[w] = true
|
|
}
|
|
return m
|
|
}
|
|
|
|
// prosperFor computes the prosperity score for a URL by decomposing it.
|
|
func prosperFor(rawURL string, pm map[string]float64) float64 {
|
|
segments := decomposeURL(rawURL)
|
|
s := 0.0
|
|
for _, seg := range segments {
|
|
t, ok := pm[seg]
|
|
if !ok {
|
|
t = 0
|
|
}
|
|
l := 0.0
|
|
if t > 0 {
|
|
l = math.Log2(2+t*2) - 1
|
|
}
|
|
if s == 0 {
|
|
if l == 0 {
|
|
return 0
|
|
}
|
|
s = l
|
|
} else {
|
|
s = l + math.Log((s-l)/2+1)
|
|
}
|
|
}
|
|
if s > 0 {
|
|
return 0.1 + s
|
|
}
|
|
return 0
|
|
}
|
|
|
|
// decomposeURL yields "domain.tld", "domain.tld/path", "domain.tld/path/sub", ...
|
|
func decomposeURL(rawURL string) []string {
|
|
u := strings.ToLower(rawURL)
|
|
if strings.HasPrefix(u, "https://") {
|
|
u = u[8:]
|
|
} else if strings.HasPrefix(u, "http://") {
|
|
u = u[7:]
|
|
} else {
|
|
return nil
|
|
}
|
|
u = strings.ReplaceAll(u, "?", "/")
|
|
u = strings.ReplaceAll(u, "#", "/")
|
|
u = strings.TrimRight(u, "/")
|
|
if u == "" || u[0] == '/' || u[0] == '%' || u[0] == ' ' {
|
|
return nil
|
|
}
|
|
parts := strings.Split(u, "/")
|
|
var out []string
|
|
current := parts[0]
|
|
out = append(out, current)
|
|
for _, p := range parts[1:] {
|
|
current = current + "/" + p
|
|
out = append(out, current)
|
|
}
|
|
return out
|
|
}
|