// sese-engine-go/analyzer/analyzer.go

// Package analyzer provides keyword extraction and language detection.
//
// Keyword extraction uses gojieba for Chinese segmentation and simple token
// splitting for ASCII words. Language detection uses lingua-go (pure Go, no CGo).
package analyzer

import (
	"encoding/json"
	"math"
	"os"
	"sort"
	"strings"
	"sync"
	"unicode"

	"github.com/pemistahl/lingua-go"
	"github.com/yanyiwu/gojieba"
)

// Keyword holds a (word, weight) pair.
//
// Word is the keyword text and Weight is its score. Both fields carry JSON
// tags, so a Keyword is encoded with the keys "word" and "weight".
type Keyword struct {
	Word   string  `json:"word"`
	Weight float32 `json:"weight"`
}

// Analyzer wraps jieba and lingua into a thread-safe analysis pipeline.
//
// Fields:
//   - jieba is the gojieba segmenter handle used for non-ASCII text.
//   - detector is the lingua language detector used by DetectLanguage.
//   - stopWords is the set of tokens dropped during keyword extraction and
//     query segmentation; it is only read after construction.
//   - mu serializes the segmentation calls made on jieba.
//
// Because the struct embeds a sync.Mutex by value it must not be copied;
// use it through the *Analyzer returned by New.
type Analyzer struct {
	jieba     *gojieba.Jieba
	detector  lingua.LanguageDetector
	stopWords map[string]bool
	mu        sync.Mutex // gojieba is not goroutine-safe
}

// New builds a ready-to-use Analyzer.
//
// modelPath is accepted only for API compatibility and is never read;
// lingua-go uses built-in data. stopWordsPath names a JSON file containing
// punctuation/stop words and may be the empty string, in which case no stop
// words are configured.
//
// The returned error is currently always nil.
func New(modelPath, stopWordsPath string) (*Analyzer, error) {
	// Detector built from every language lingua ships (75 languages, including
	// Chinese, Japanese and Korean), with a minimum relative distance of 0.15.
	builder := lingua.NewLanguageDetectorBuilder().FromAllLanguages()

	// Field initializers run top to bottom: segmenter, detector, stop words.
	analyzer := &Analyzer{
		jieba:     gojieba.NewJieba(),
		detector:  builder.WithMinimumRelativeDistance(0.15).Build(),
		stopWords: loadStopWords(stopWordsPath),
	}
	return analyzer, nil
}

// Close releases resources held by the analyzer by freeing the underlying
// jieba handle.
//
// The analyzer's mutex is held while freeing so that the handle is never
// released while another goroutine is in the middle of a segmentation call
// guarded by the same mutex. The Analyzer must not be used after Close.
func (a *Analyzer) Close() {
	a.mu.Lock()
	defer a.mu.Unlock()
	a.jieba.Free()
}

// loadStopWords reads a JSON array of strings from path and returns it as a
// set keyed by the lower-cased word.
//
// Loading is best-effort: an empty path, a file that cannot be opened, or
// content that does not decode as a JSON array of strings all yield an
// empty, non-nil map rather than an error.
func loadStopWords(path string) map[string]bool {
	set := map[string]bool{}
	if path == "" {
		return set
	}

	file, openErr := os.Open(path)
	if openErr != nil {
		return set
	}
	defer file.Close()

	var entries []string
	if json.NewDecoder(file).Decode(&entries) != nil {
		return set
	}

	for i := range entries {
		set[strings.ToLower(entries[i])] = true
	}
	return set
}

// Tokenize splits s into tokens.
//
// The input is first capped at 10000 bytes and then stripped of any bytes
// that are not valid UTF-8 (which also discards a rune cut in half by the
// cap), so gojieba (C++) never sees decode errors. The cleaned text is split
// on whitespace; a field made up solely of ASCII letters and digits is kept
// as a single token, and every other field is segmented by jieba, using
// CutForSearch when searchMode is true and Cut otherwise.
//
// The result is nil when no tokens are produced.
func (a *Analyzer) Tokenize(s string, searchMode bool) []string {
	const maxBytes = 10000
	if len(s) > maxBytes {
		s = s[:maxBytes]
	}
	s = strings.ToValidUTF8(s, "")

	// segment runs one jieba call under the mutex; the lock is taken per
	// field so concurrent callers can interleave.
	segment := func(field string) []string {
		a.mu.Lock()
		defer a.mu.Unlock()
		if searchMode {
			return a.jieba.CutForSearch(field, true)
		}
		return a.jieba.Cut(field, true)
	}

	var out []string
	for _, field := range strings.Fields(s) {
		if isASCIIAlnum(field) {
			out = append(out, field)
			continue
		}
		out = append(out, segment(field)...)
	}
	return out
}

// Normalize returns s reduced to ASCII letters, ASCII digits and characters
// in the range U+4E00..U+9FA5, with ASCII upper-case letters folded to lower
// case. Every other character, including punctuation, whitespace and
// letters outside those ranges, is dropped.
func Normalize(s string) string {
	var out strings.Builder
	for _, r := range s {
		switch {
		case 'A' <= r && r <= 'Z':
			out.WriteRune(unicode.ToLower(r))
		case 'a' <= r && r <= 'z',
			'0' <= r && r <= '9',
			0x4e00 <= r && r <= 0x9fa5:
			out.WriteRune(r)
		}
	}
	return out.String()
}

// weightedTokens tokenizes text (non-search mode) and returns a map from
// normalized token to weight.
//
// Tokens that normalize to the empty string, are stop words, or are longer
// than 32 bytes after normalization are ignored. A token's weight is its
// occurrence count divided by the total number of raw tokens (with a floor
// of 8 on that denominator), capped at 0.2, and then multiplied by w.
func (a *Analyzer) weightedTokens(text string, w float32) map[string]float32 {
	tokens := a.Tokenize(text, false)

	freq := make(map[string]int)
	for _, raw := range tokens {
		term := Normalize(raw)
		if term == "" || len(term) > 32 || a.stopWords[term] {
			continue
		}
		freq[term]++
	}

	// The denominator counts every raw token, including the ones filtered
	// out above; the floor of 8 keeps very short texts from scoring high.
	denom := math.Max(8, float64(len(tokens)))

	weights := make(map[string]float32, len(freq))
	for term, count := range freq {
		share := math.Min(0.2, float64(count)/denom)
		weights[term] = float32(share) * w
	}
	return weights
}

// Analyze extracts weighted keywords from a document's title, description
// and body text.
//
// Each part is scored independently with weightedTokens (title and body with
// a multiplier of 1.0, description with 0.5) and the per-part weights of a
// keyword are summed. The returned slice is sorted by weight, descending.
func (a *Analyzer) Analyze(title, description, text string) []Keyword {
	parts := [...]struct {
		content string
		factor  float32
	}{
		{title, 1.0},
		{description, 0.5},
		{text, 1.0},
	}

	// Accumulate part by part, so each keyword's sum is built in
	// title → description → body order.
	totals := make(map[string]float32)
	for _, p := range parts {
		for term, weight := range a.weightedTokens(p.content, p.factor) {
			totals[term] += weight
		}
	}

	ranked := make([]Keyword, 0, len(totals))
	for term, weight := range totals {
		ranked = append(ranked, Keyword{Word: term, Weight: weight})
	}
	sortKeywords(ranked)
	return ranked
}

// Segment tokenizes a query string and returns its normalized tokens in
// order. searchMode is passed through to Tokenize.
//
// Tokens that normalize to the empty string, are stop words, or exceed 32
// bytes after normalization are dropped. The result is nil when nothing
// survives the filter.
func (a *Analyzer) Segment(query string, searchMode bool) []string {
	var kept []string
	for _, raw := range a.Tokenize(query, searchMode) {
		term := Normalize(raw)
		usable := term != "" && len(term) <= 32 && !a.stopWords[term]
		if usable {
			kept = append(kept, term)
		}
	}
	return kept
}

// linguaToISO639 maps lingua.Language to the ISO 639-1 code used by the rest of the engine.
// Only the languages listed here are reported; looking up any other
// lingua.Language finds no entry, so a plain index expression yields "".
var linguaToISO639 = map[lingua.Language]string{
	lingua.Chinese:    "zh",
	lingua.English:    "en",
	lingua.Japanese:   "ja",
	lingua.Korean:     "ko",
	lingua.French:     "fr",
	lingua.German:     "de",
	lingua.Spanish:    "es",
	lingua.Portuguese: "pt",
	lingua.Italian:    "it",
	lingua.Russian:    "ru",
	lingua.Arabic:     "ar",
	lingua.Hindi:      "hi",
	lingua.Dutch:      "nl",
	lingua.Polish:     "pl",
	lingua.Swedish:    "sv",
	lingua.Turkish:    "tr",
	lingua.Vietnamese: "vi",
	lingua.Thai:       "th",
	lingua.Indonesian: "id",
	lingua.Malay:      "ms",
}

// DetectLanguage returns the ISO 639-1 language code for the text, or "".
//
// Only the first 2000 bytes of text are examined. Newlines in that prefix are
// replaced by spaces and any bytes that are not valid UTF-8 are removed
// before detection. The result is "" when the prepared text is empty, when
// the detector cannot decide on a language, or when the detected language
// has no entry in linguaToISO639.
func (a *Analyzer) DetectLanguage(text string) string {
	const maxBytes = 2000

	// Truncate first so the newline replacement only touches the prefix that
	// is actually inspected. "\n" and " " are both one byte, so the result is
	// the same as replacing before truncating.
	if len(text) > maxBytes {
		text = text[:maxBytes]
	}
	text = strings.ReplaceAll(text, "\n", " ")

	// The byte-based cut can split a multi-byte rune in half; drop the
	// resulting invalid bytes (and any others) so the detector only ever
	// receives well-formed UTF-8, matching what Tokenize does.
	text = strings.ToValidUTF8(text, "")
	if text == "" {
		return ""
	}

	lang, ok := a.detector.DetectLanguageOf(text)
	if !ok {
		return ""
	}
	// A language without a table entry yields the zero value "".
	return linguaToISO639[lang]
}

// ---- sorting ----

// sortKeywords sorts kws in place by Weight, highest first.
//
// Keywords with equal weight are ordered by Word ascending, so the result
// does not depend on the order in which the input was assembled (for
// example from a map iteration). A nil or empty slice is left untouched.
func sortKeywords(kws []Keyword) {
	sort.SliceStable(kws, func(i, j int) bool {
		if kws[i].Weight != kws[j].Weight {
			return kws[i].Weight > kws[j].Weight
		}
		return kws[i].Word < kws[j].Word
	})
}

// isASCIIAlnum reports whether s is non-empty and consists only of ASCII
// letters (a-z, A-Z) and ASCII digits (0-9).
func isASCIIAlnum(s string) bool {
	if s == "" {
		return false
	}
	// Every byte of a multi-byte UTF-8 sequence is >= 0x80, so checking bytes
	// rejects exactly the same strings as checking runes would.
	for i := 0; i < len(s); i++ {
		switch c := s[i]; {
		case 'a' <= c && c <= 'z':
		case 'A' <= c && c <= 'Z':
		case '0' <= c && c <= '9':
		default:
			return false
		}
	}
	return true
}