Files
sese-engine-go/analyzer/analyzer.go
T

251 lines
6.1 KiB
Go

// Package analyzer provides keyword extraction and language detection.
//
// Keyword extraction uses gojieba for Chinese segmentation and simple token
// splitting for ASCII words. Language detection uses lingua-go (pure Go, no CGo).
package analyzer
import (
	"encoding/json"
	"math"
	"os"
	"sort"
	"strings"
	"sync"
	"unicode"

	"github.com/pemistahl/lingua-go"
	"github.com/yanyiwu/gojieba"
)
// Keyword pairs an extracted term with the weight assigned to it.
//
// Both fields are exported and carry JSON tags, so a Keyword serializes as
// {"word": ..., "weight": ...}.
type Keyword struct {
	// Word is the normalized token text.
	Word string `json:"word"`
	// Weight is the accumulated score of Word; larger means more relevant.
	Weight float32 `json:"weight"`
}
// Analyzer bundles a jieba segmenter, a lingua language detector and a
// stop-word set behind one type. Calls into jieba are serialized with mu so
// the Analyzer can be shared between goroutines.
type Analyzer struct {
	// jieba performs word segmentation; every call into it is made under mu.
	jieba *gojieba.Jieba
	// detector identifies the language of a piece of text.
	detector lingua.LanguageDetector
	// stopWords is the set of tokens dropped from keyword and query output.
	stopWords map[string]bool
	// mu guards jieba (the original author notes gojieba is not goroutine-safe).
	mu sync.Mutex
}
// New constructs an Analyzer.
//
// modelPath is accepted only for API compatibility and is not read.
// stopWordsPath names a JSON file holding the stop-word list; an empty or
// unreadable path yields an empty stop-word set (see loadStopWords).
//
// The returned error is always nil in the current implementation.
func New(modelPath, stopWordsPath string) (*Analyzer, error) {
	// The detector is built over every language lingua-go ships, with a
	// minimum relative distance of 0.15 between the top candidates.
	builder := lingua.NewLanguageDetectorBuilder().FromAllLanguages()

	// Field expressions are evaluated top to bottom: segmenter first, then
	// the detector, then the stop-word file.
	a := &Analyzer{
		jieba:     gojieba.NewJieba(),
		detector:  builder.WithMinimumRelativeDistance(0.15).Build(),
		stopWords: loadStopWords(stopWordsPath),
	}
	return a, nil
}
// Close releases the native resources held by the jieba segmenter.
//
// The free is performed while holding a.mu, the same lock Tokenize takes
// around every segmentation call, so Close cannot tear down the segmenter
// while another goroutine is in the middle of a cut. The Analyzer must not be
// used after Close returns.
func (a *Analyzer) Close() {
	a.mu.Lock()
	defer a.mu.Unlock()
	a.jieba.Free()
}
// loadStopWords builds the stop-word set from the JSON array of strings stored
// at path. Every entry is lowercased before being added.
//
// Loading is best-effort: an empty path, a file that cannot be opened, or
// content that does not decode as an array of strings all produce an empty,
// non-nil map instead of an error.
func loadStopWords(path string) map[string]bool {
	stop := map[string]bool{}
	if path == "" {
		return stop
	}

	file, err := os.Open(path)
	if err != nil {
		return stop
	}
	defer file.Close()

	var entries []string
	if json.NewDecoder(file).Decode(&entries) != nil {
		return stop
	}
	for _, entry := range entries {
		stop[strings.ToLower(entry)] = true
	}
	return stop
}
// Tokenize splits s into tokens.
//
// The input is first cut to at most 10000 bytes and stripped of invalid UTF-8
// (which also removes a rune split by the byte cut). It is then split on
// whitespace. A field made up purely of ASCII letters and digits is emitted
// unchanged; any other field is handed to jieba, using CutForSearch when
// searchMode is true and Cut otherwise.
//
// The result is nil when s contains no fields.
func (a *Analyzer) Tokenize(s string, searchMode bool) []string {
	const maxBytes = 10000
	if len(s) > maxBytes {
		s = s[:maxBytes]
	}
	// jieba is backed by C++ code; make sure it never sees malformed UTF-8.
	s = strings.ToValidUTF8(s, "")

	var out []string
	for _, field := range strings.Fields(s) {
		if isASCIIAlnum(field) {
			out = append(out, field)
			continue
		}

		// The lock is held only for the duration of one segmentation call.
		var pieces []string
		a.mu.Lock()
		switch {
		case searchMode:
			pieces = a.jieba.CutForSearch(field, true)
		default:
			pieces = a.jieba.Cut(field, true)
		}
		a.mu.Unlock()

		out = append(out, pieces...)
	}
	return out
}
// Normalize keeps only ASCII letters, ASCII digits and runes in the range
// U+4E00..U+9FA5, lowercasing ASCII capitals along the way. Every other rune
// is dropped, so the result may be empty.
func Normalize(s string) string {
	var out strings.Builder
	for _, r := range s {
		switch {
		case 'A' <= r && r <= 'Z':
			out.WriteRune(unicode.ToLower(r))
		case 'a' <= r && r <= 'z',
			'0' <= r && r <= '9',
			0x4e00 <= r && r <= 0x9fa5:
			out.WriteRune(r)
		}
	}
	return out.String()
}
// weightedTokens tokenizes text (non-search mode) and returns a weight for
// each distinct normalized token.
//
// Tokens that normalize to the empty string, are stop words, or are longer
// than 32 bytes are ignored. A token's weight is its occurrence count divided
// by max(8, total token count), capped at 0.2 and then multiplied by w. The
// denominator counts all tokens, including the ignored ones.
func (a *Analyzer) weightedTokens(text string, w float32) map[string]float32 {
	tokens := a.Tokenize(text, false)

	// Tally occurrences of every token that survives filtering.
	freq := make(map[string]int)
	for _, raw := range tokens {
		word := Normalize(raw)
		if word == "" || len(word) > 32 || a.stopWords[word] {
			continue
		}
		freq[word]++
	}

	// The floor of 8 keeps very short texts from producing inflated ratios.
	denom := math.Max(8, float64(len(tokens)))

	weights := make(map[string]float32)
	for word, count := range freq {
		share := math.Min(0.2, float64(count)/denom)
		weights[word] = float32(share) * w
	}
	return weights
}
// Analyze extracts weighted keywords from a document's title, description and
// body text.
//
// Each part is scored independently by weightedTokens (title and body with a
// multiplier of 1.0, description with 0.5) and the per-part weights of a word
// are summed. The result is ordered by weight, highest first; the relative
// order of words with equal weight is unspecified because it follows map
// iteration order.
func (a *Analyzer) Analyze(title, description, text string) []Keyword {
	parts := [...]map[string]float32{
		a.weightedTokens(title, 1.0),
		a.weightedTokens(description, 0.5),
		a.weightedTokens(text, 1.0),
	}

	// Sum per word, visiting the parts in a fixed order so the floating-point
	// additions for any given word always happen in the same sequence.
	total := make(map[string]float32)
	for _, part := range parts {
		for word, weight := range part {
			total[word] += weight
		}
	}

	keywords := make([]Keyword, 0, len(total))
	for word, weight := range total {
		keywords = append(keywords, Keyword{Word: word, Weight: weight})
	}
	sortKeywords(keywords)
	return keywords
}
// Segment tokenizes a query string and returns its normalized tokens in order.
//
// searchMode is forwarded to Tokenize and selects which jieba cut is used.
// Tokens that normalize to the empty string, are stop words, or are longer
// than 32 bytes are dropped. The result is nil when nothing survives.
func (a *Analyzer) Segment(query string, searchMode bool) []string {
	var kept []string
	for _, raw := range a.Tokenize(query, searchMode) {
		word := Normalize(raw)
		if word == "" || len(word) > 32 || a.stopWords[word] {
			continue
		}
		kept = append(kept, word)
	}
	return kept
}
// linguaToISO639 translates a lingua.Language into the two-letter ISO 639-1
// code used by the rest of the engine. Languages absent from the table have no
// code; a lookup for them yields the empty string.
//
// Entries are listed alphabetically by code.
var linguaToISO639 = map[lingua.Language]string{
	lingua.Arabic:     "ar",
	lingua.German:     "de",
	lingua.English:    "en",
	lingua.Spanish:    "es",
	lingua.French:     "fr",
	lingua.Hindi:      "hi",
	lingua.Indonesian: "id",
	lingua.Italian:    "it",
	lingua.Japanese:   "ja",
	lingua.Korean:     "ko",
	lingua.Malay:      "ms",
	lingua.Dutch:      "nl",
	lingua.Polish:     "pl",
	lingua.Portuguese: "pt",
	lingua.Russian:    "ru",
	lingua.Swedish:    "sv",
	lingua.Thai:       "th",
	lingua.Turkish:    "tr",
	lingua.Vietnamese: "vi",
	lingua.Chinese:    "zh",
}
// DetectLanguage returns the ISO 639-1 code of the language text appears to be
// written in. It returns "" when the text is empty, when the detector cannot
// decide, or when the detected language has no entry in linguaToISO639.
//
// Only the first 2000 bytes of text are examined, and newlines are replaced
// with spaces before detection.
func (a *Analyzer) DetectLanguage(text string) string {
	// Truncate before any other work so the cost does not scale with the size
	// of the whole document. Replacing "\n" with " " is length-preserving, so
	// truncating first yields the same bytes as truncating afterwards.
	const maxBytes = 2000
	if len(text) > maxBytes {
		text = text[:maxBytes]
	}
	// The byte cut can land in the middle of a multi-byte rune (common for
	// CJK text); drop the dangling bytes, and any other malformed sequences,
	// so the detector is only ever given valid UTF-8, as Tokenize does.
	text = strings.ToValidUTF8(text, "")
	text = strings.ReplaceAll(text, "\n", " ")
	if text == "" {
		return ""
	}

	lang, ok := a.detector.DetectLanguageOf(text)
	if !ok {
		return ""
	}
	// A language missing from the table yields the map's zero value, "".
	return linguaToISO639[lang]
}
// ---- sorting and token-class helpers ----
// sortKeywords orders kws in place by Weight, highest first.
//
// The sort is stable: keywords with equal weight keep the relative order they
// had on entry, exactly as the previous hand-written insertion sort did, but
// in O(n log n) time instead of O(n²).
func sortKeywords(kws []Keyword) {
	sort.SliceStable(kws, func(i, j int) bool {
		return kws[i].Weight > kws[j].Weight
	})
}
// isASCIIAlnum reports whether s is non-empty and consists solely of ASCII
// letters and digits.
func isASCIIAlnum(s string) bool {
	if s == "" {
		return false
	}
	// Every ASCII letter or digit is a single byte, and no byte of a
	// multi-byte rune falls in those ranges, so a byte scan is sufficient.
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
			// allowed
		default:
			return false
		}
	}
	return true
}