// Package analyzer provides keyword extraction and language detection.
//
// Keyword extraction uses gojieba for Chinese segmentation and simple token
// splitting for ASCII words. Language detection uses lingua-go (pure Go, no CGo).
package analyzer

import (
	"encoding/json"
	"math"
	"os"
	"strings"
	"sync"
	"unicode"

	"github.com/pemistahl/lingua-go"
	"github.com/yanyiwu/gojieba"
)

// Keyword holds a (word, weight) pair.
//
// Both fields are exported with lowercase JSON names ("word", "weight") so a
// Keyword can be serialized directly.
type Keyword struct {
	Word   string  `json:"word"`
	Weight float32 `json:"weight"`
}

// Analyzer wraps jieba and lingua into a thread-safe analysis pipeline.
//
// Fields:
//   - jieba:     segmenter handle created by gojieba.NewJieba; released by Close.
//   - detector:  lingua language detector built once in New.
//   - stopWords: set of lowercased words that token filtering skips.
//   - mu:        serializes the calls into jieba made by Tokenize.
//
// Because the struct embeds a sync.Mutex it must be used through a pointer
// and never copied.
type Analyzer struct {
	jieba     *gojieba.Jieba
	detector  lingua.LanguageDetector
	stopWords map[string]bool
	mu        sync.Mutex // gojieba is not goroutine-safe
}

// New creates an Analyzer.
// stopWordsPath is the JSON file with punctuation/stop words (may be empty string).
// modelPath is ignored (kept for API compatibility; lingua-go uses built-in data).
//
// The returned error is always nil in the current implementation: a missing,
// unreadable or malformed stop-word file is not reported, it simply results in
// an empty stop-word set (see loadStopWords).
// The caller should call Close when the Analyzer is no longer needed.
func New(modelPath, stopWordsPath string) (*Analyzer, error) {
	j := gojieba.NewJieba()

	// Build a lingua detector that covers the languages we care about.
	// FromAllLanguages() covers 75 languages including Chinese, Japanese, Korean, etc.
	// NOTE(review): WithMinimumRelativeDistance(0.15) presumably makes the
	// detector report "no language" when the best candidates are too close
	// together; confirm against the lingua-go documentation.
	detector := lingua.NewLanguageDetectorBuilder().
		FromAllLanguages().
		WithMinimumRelativeDistance(0.15).
		Build()

	stopWords := loadStopWords(stopWordsPath)

	return &Analyzer{
		jieba:     j,
		detector:  detector,
		stopWords: stopWords,
	}, nil
}

// Close releases resources held by the analyzer.
//
// It calls Free on the jieba handle and does not acquire a.mu.
// NOTE(review): this presumably means Close must not run concurrently with
// Tokenize and must not be called twice; confirm against gojieba's Free.
func (a *Analyzer) Close() {
	a.jieba.Free()
}

// loadStopWords reads a JSON array of stop-word strings.
func loadStopWords(path string) map[string]bool { if path == "" { return map[string]bool{} } f, err := os.Open(path) if err != nil { return map[string]bool{} } defer f.Close() var words []string if err := json.NewDecoder(f).Decode(&words); err != nil { return map[string]bool{} } m := make(map[string]bool, len(words)) for _, w := range words { m[strings.ToLower(w)] = true } return m } // Tokenize segments a string into tokens using jieba for CJK and space-split for ASCII. func (a *Analyzer) Tokenize(s string, searchMode bool) []string { if len(s) > 10000 { s = s[:10000] } // Sanitize: replace invalid UTF-8 sequences so gojieba (C++) never sees decode errors. s = strings.ToValidUTF8(s, "") var result []string for _, part := range strings.Fields(s) { if isASCIIAlnum(part) { result = append(result, part) } else { a.mu.Lock() var tokens []string if searchMode { tokens = a.jieba.CutForSearch(part, true) } else { tokens = a.jieba.Cut(part, true) } a.mu.Unlock() result = append(result, tokens...) } } return result } // Normalize strips non-alphanumeric, non-CJK characters and lowercases. func Normalize(s string) string { var b strings.Builder for _, r := range s { if (r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9') || (r >= 0x4e00 && r <= 0x9fa5) { if r >= 'A' && r <= 'Z' { b.WriteRune(unicode.ToLower(r)) } else { b.WriteRune(r) } } } return b.String() } // weightedTokens builds a map of token→weight from a text with an optional weight multiplier. func (a *Analyzer) weightedTokens(text string, w float32) map[string]float32 { tokens := a.Tokenize(text, false) d := make(map[string]float32) n := math.Max(8, float64(len(tokens))) counts := make(map[string]int) for _, t := range tokens { t = Normalize(t) if t == "" || a.stopWords[t] || len(t) > 32 { continue } counts[t]++ } for k, v := range counts { d[k] = float32(math.Min(0.2, float64(v)/n)) * w } return d } // Analyze extracts weighted keywords from title, description, and body text. 
// Returns a slice sorted by weight descending. func (a *Analyzer) Analyze(title, description, text string) []Keyword { maps := []map[string]float32{ a.weightedTokens(title, 1.0), a.weightedTokens(description, 0.5), a.weightedTokens(text, 1.0), } combined := make(map[string]float32) for _, m := range maps { for k := range m { combined[k] = 0 } } for k := range combined { for _, m := range maps { combined[k] += m[k] } } result := make([]Keyword, 0, len(combined)) for k, v := range combined { result = append(result, Keyword{Word: k, Weight: v}) } sortKeywords(result) return result } // Segment returns search-mode tokens for a query string. func (a *Analyzer) Segment(query string, searchMode bool) []string { tokens := a.Tokenize(query, searchMode) var result []string for _, t := range tokens { t = Normalize(t) if t == "" || a.stopWords[t] || len(t) > 32 { continue } result = append(result, t) } return result } // linguaToISO639 maps lingua.Language to the ISO 639-1 code used by the rest of the engine. // Returns "" for unknown or unsupported languages. var linguaToISO639 = map[lingua.Language]string{ lingua.Chinese: "zh", lingua.English: "en", lingua.Japanese: "ja", lingua.Korean: "ko", lingua.French: "fr", lingua.German: "de", lingua.Spanish: "es", lingua.Portuguese: "pt", lingua.Italian: "it", lingua.Russian: "ru", lingua.Arabic: "ar", lingua.Hindi: "hi", lingua.Dutch: "nl", lingua.Polish: "pl", lingua.Swedish: "sv", lingua.Turkish: "tr", lingua.Vietnamese: "vi", lingua.Thai: "th", lingua.Indonesian: "id", lingua.Malay: "ms", } // DetectLanguage returns the ISO 639-1 language code for the text, or "". 
func (a *Analyzer) DetectLanguage(text string) string { text = strings.ReplaceAll(text, "\n", " ") if len(text) > 2000 { text = text[:2000] } if text == "" { return "" } lang, exists := a.detector.DetectLanguageOf(text) if !exists { return "" } if code, ok := linguaToISO639[lang]; ok { return code } return "" } // ---- sorting ---- func sortKeywords(kws []Keyword) { for i := 1; i < len(kws); i++ { key := kws[i] j := i - 1 for j >= 0 && kws[j].Weight < key.Weight { kws[j+1] = kws[j] j-- } kws[j+1] = key } } func isASCIIAlnum(s string) bool { for _, r := range s { if !((r >= 'a' && r <= 'z') || (r >= 'A' && r <= 'Z') || (r >= '0' && r <= '9')) { return false } } return len(s) > 0 }