Signed-off-by: 吴文峰 <kevin@lmve.net>

2026-04-08 17:29:39 +08:00
commit 6c2f5ad978
15 changed files with 3651 additions and 0 deletions
@@ -0,0 +1,250 @@
+// Package analyzer provides keyword extraction and language detection.
+//
+// Keyword extraction uses gojieba for Chinese segmentation and simple token
+// splitting for ASCII words. Language detection uses lingua-go (pure Go, no CGo).
+package analyzer
+
+import (
+	"encoding/json"
+	"math"
+	"os"
+	"sort"
+	"strings"
+	"sync"
+	"unicode"
+
+	"github.com/pemistahl/lingua-go"
+	"github.com/yanyiwu/gojieba"
+)
+
+// Keyword holds a (word, weight) pair; Analyze returns a slice of these ordered by Weight, highest first.
+type Keyword struct {
+	Word   string  `json:"word"`   // keyword text; when produced by Analyze it has already been passed through Normalize
+	Weight float32 `json:"weight"` // relevance score; when produced by Analyze it is the sum over title, description and body
+}
+
+// Analyzer wraps a gojieba segmenter and a lingua language detector into one analysis pipeline; build it with New.
+type Analyzer struct {
+	jieba     *gojieba.Jieba          // segmenter used by Tokenize; released by Close
+	detector  lingua.LanguageDetector // built in New from all lingua languages; used by DetectLanguage
+	stopWords map[string]bool         // lowercased stop words loaded in New; consulted after Normalize
+	mu        sync.Mutex              // held around every jieba cut call (author's note: gojieba is not goroutine-safe)
+}
+
+// New builds an Analyzer ready for tokenizing and language detection.
+//
+// modelPath is accepted only so existing callers keep compiling; it is never
+// read. stopWordsPath names a JSON file containing an array of stop words and
+// may be the empty string, in which case no stop words are used.
+//
+// The returned error is always nil in the current implementation.
+func New(modelPath, stopWordsPath string) (*Analyzer, error) {
+	a := &Analyzer{
+		jieba:     gojieba.NewJieba(),
+		stopWords: loadStopWords(stopWordsPath),
+	}
+
+	// The detector considers every language lingua ships and refuses to
+	// answer when the two best candidates are closer than 0.15.
+	builder := lingua.NewLanguageDetectorBuilder().FromAllLanguages()
+	a.detector = builder.WithMinimumRelativeDistance(0.15).Build()
+
+	return a, nil
+}
+
+// Close releases the native gojieba segmenter held by the analyzer.
+//
+// It acquires a.mu first so the segmenter cannot be freed while Tokenize is
+// in the middle of a jieba call, and it clears the pointer afterwards so that
+// calling Close again is a no-op instead of a second Free on the same handle.
+// The Analyzer must not be used for tokenizing after Close.
+func (a *Analyzer) Close() {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+
+	if a.jieba == nil {
+		return
+	}
+	a.jieba.Free()
+	a.jieba = nil
+}
+
+// loadStopWords reads a JSON array of strings from path and returns it as a
+// set keyed by the lowercased word.
+//
+// Loading is best-effort: an empty path, an unreadable file, or content that
+// does not decode as an array of strings all yield an empty, non-nil map.
+func loadStopWords(path string) map[string]bool {
+	empty := map[string]bool{}
+	if path == "" {
+		return empty
+	}
+
+	file, openErr := os.Open(path)
+	if openErr != nil {
+		return empty
+	}
+	defer file.Close()
+
+	var entries []string
+	if json.NewDecoder(file).Decode(&entries) != nil {
+		return empty
+	}
+
+	set := make(map[string]bool, len(entries))
+	for i := range entries {
+		set[strings.ToLower(entries[i])] = true
+	}
+	return set
+}
+
+// Tokenize splits s into tokens.
+//
+// The input is capped at 10000 bytes and stripped of invalid UTF-8 (which also
+// removes a rune cut in half by the cap) before it is split on whitespace.
+// Fields made only of ASCII letters and digits are kept verbatim; every other
+// field is handed to jieba, using CutForSearch when searchMode is true and Cut
+// otherwise, both with their second argument set to true.
+func (a *Analyzer) Tokenize(s string, searchMode bool) []string {
+	const maxBytes = 10000
+	if len(s) > maxBytes {
+		s = s[:maxBytes]
+	}
+	// gojieba is C++ underneath; never hand it malformed UTF-8.
+	s = strings.ToValidUTF8(s, "")
+
+	// segment runs one field through jieba while holding the analyzer lock.
+	segment := func(field string) []string {
+		a.mu.Lock()
+		var pieces []string
+		switch searchMode {
+		case true:
+			pieces = a.jieba.CutForSearch(field, true)
+		default:
+			pieces = a.jieba.Cut(field, true)
+		}
+		a.mu.Unlock()
+		return pieces
+	}
+
+	var out []string
+	for _, field := range strings.Fields(s) {
+		if isASCIIAlnum(field) {
+			out = append(out, field)
+			continue
+		}
+		out = append(out, segment(field)...)
+	}
+	return out
+}
+
+// Normalize keeps only ASCII letters, ASCII digits and runes in the range
+// U+4E00..U+9FA5, lowercasing the ASCII letters; everything else is dropped.
+//
+// NOTE(review): the Han range stops at U+9FA5, so kana, hangul and the Han
+// extension blocks are removed as well — confirm that is intended.
+func Normalize(s string) string {
+	var out strings.Builder
+	for _, r := range s {
+		switch {
+		case 'A' <= r && r <= 'Z':
+			out.WriteRune(unicode.ToLower(r))
+		case 'a' <= r && r <= 'z',
+			'0' <= r && r <= '9',
+			0x4e00 <= r && r <= 0x9fa5:
+			out.WriteRune(r)
+		}
+	}
+	return out.String()
+}
+
+// weightedTokens tokenizes text (non-search mode) and scores each surviving
+// token.
+//
+// A token survives if its normalized form is non-empty, is not a stop word
+// and is at most 32 bytes long. Its score is
+//
+//	min(0.2, occurrences / max(8, total tokens before filtering)) * w
+//
+// so a single term can contribute at most 0.2*w.
+func (a *Analyzer) weightedTokens(text string, w float32) map[string]float32 {
+	tokens := a.Tokenize(text, false)
+
+	freq := make(map[string]int)
+	for _, raw := range tokens {
+		norm := Normalize(raw)
+		if norm != "" && !a.stopWords[norm] && len(norm) <= 32 {
+			freq[norm]++
+		}
+	}
+
+	// The denominator has a floor of 8 so very short texts do not give every
+	// token the maximum share.
+	denom := math.Max(8, float64(len(tokens)))
+
+	weights := make(map[string]float32)
+	for word, count := range freq {
+		share := math.Min(0.2, float64(count)/denom)
+		weights[word] = float32(share) * w
+	}
+	return weights
+}
+
+// Analyze scores the keywords found in title, description and text and
+// returns them ordered by weight, highest first.
+//
+// Each field is scored on its own by weightedTokens (title and text with
+// multiplier 1.0, description with 0.5) and the per-field scores of a word are
+// added together in that order.
+func (a *Analyzer) Analyze(title, description, text string) []Keyword {
+	fields := [...]struct {
+		body  string
+		scale float32
+	}{
+		{title, 1.0},
+		{description, 0.5},
+		{text, 1.0},
+	}
+
+	totals := make(map[string]float32)
+	for _, f := range fields {
+		for word, weight := range a.weightedTokens(f.body, f.scale) {
+			totals[word] += weight
+		}
+	}
+
+	keywords := make([]Keyword, 0, len(totals))
+	for word, weight := range totals {
+		keywords = append(keywords, Keyword{Word: word, Weight: weight})
+	}
+
+	sortKeywords(keywords)
+	return keywords
+}
+
+// Segment tokenizes query and returns the normalized tokens worth searching
+// for, in their original order.
+//
+// searchMode is forwarded to Tokenize. Tokens whose normalized form is empty,
+// is a stop word, or is longer than 32 bytes are discarded. The result is nil
+// when nothing survives.
+func (a *Analyzer) Segment(query string, searchMode bool) []string {
+	var kept []string
+	for _, raw := range a.Tokenize(query, searchMode) {
+		norm := Normalize(raw)
+		if norm != "" && !a.stopWords[norm] && len(norm) <= 32 {
+			kept = append(kept, norm)
+		}
+	}
+	return kept
+}
+
+// linguaToISO639 maps lingua.Language values to the ISO 639-1 codes that DetectLanguage returns.
+// Languages absent from this map have no code here; DetectLanguage reports "" for them.
+var linguaToISO639 = map[lingua.Language]string{
+	lingua.Chinese:    "zh",
+	lingua.English:    "en",
+	lingua.Japanese:   "ja",
+	lingua.Korean:     "ko",
+	lingua.French:     "fr",
+	lingua.German:     "de",
+	lingua.Spanish:    "es",
+	lingua.Portuguese: "pt",
+	lingua.Italian:    "it",
+	lingua.Russian:    "ru",
+	lingua.Arabic:     "ar",
+	lingua.Hindi:      "hi",
+	lingua.Dutch:      "nl",
+	lingua.Polish:     "pl",
+	lingua.Swedish:    "sv",
+	lingua.Turkish:    "tr",
+	lingua.Vietnamese: "vi",
+	lingua.Thai:       "th",
+	lingua.Indonesian: "id",
+	lingua.Malay:      "ms",
+}
+
+// DetectLanguage returns the ISO 639-1 code of the language text is written
+// in, or "" when the text is empty, the detector cannot decide, or the
+// detected language has no entry in linguaToISO639.
+//
+// Newlines are replaced by spaces and only the first 2000 bytes are examined.
+// Because that cap is applied to bytes it can cut a multi-byte rune in half
+// (it almost always does for 3-byte CJK text), so the text is passed through
+// strings.ToValidUTF8 afterwards, the same sanitising step Tokenize uses, and
+// the detector never sees malformed UTF-8.
+func (a *Analyzer) DetectLanguage(text string) string {
+	text = strings.ReplaceAll(text, "\n", " ")
+	if len(text) > 2000 {
+		text = text[:2000]
+	}
+	// Drops the partial rune left by the byte cap, plus any invalid bytes
+	// that were already in the input.
+	text = strings.ToValidUTF8(text, "")
+	if text == "" {
+		return ""
+	}
+
+	lang, ok := a.detector.DetectLanguageOf(text)
+	if !ok {
+		return ""
+	}
+	// A missing key yields the zero value "", which is the documented result
+	// for unsupported languages.
+	return linguaToISO639[lang]
+}
+
+// ---- sorting ----
+
+// sortKeywords orders kws in place by Weight, highest first.
+//
+// Keywords with equal weight are ordered by Word ascending. The slice comes
+// from ranging over a map, so without this tie-break equally weighted
+// keywords would appear in a different order on every run. The stable
+// library sort replaces a hand-written insertion sort that was quadratic in
+// the number of keywords.
+func sortKeywords(kws []Keyword) {
+	sort.SliceStable(kws, func(i, j int) bool {
+		if kws[i].Weight != kws[j].Weight {
+			return kws[i].Weight > kws[j].Weight
+		}
+		return kws[i].Word < kws[j].Word
+	})
+}
+
+// isASCIIAlnum reports whether s is non-empty and consists solely of ASCII
+// letters and digits.
+func isASCIIAlnum(s string) bool {
+	if s == "" {
+		return false
+	}
+	for _, r := range s {
+		switch {
+		case 'a' <= r && r <= 'z':
+		case 'A' <= r && r <= 'Z':
+		case '0' <= r && r <= '9':
+		default:
+			return false
+		}
+	}
+	return true
+}