251 lines
6.1 KiB
Go
251 lines
6.1 KiB
Go
// Package analyzer provides keyword extraction and language detection.
|
|
//
|
|
// Keyword extraction uses gojieba for Chinese segmentation and simple token
|
|
// splitting for ASCII words. Language detection uses lingua-go (pure Go, no CGo).
|
|
package analyzer
|
|
|
|
import (
	"encoding/json"
	"math"
	"os"
	"sort"
	"strings"
	"sync"
	"unicode"

	"github.com/pemistahl/lingua-go"
	"github.com/yanyiwu/gojieba"
)
|
|
|
|
// Keyword is a single extracted term together with its relevance weight.
// It serializes to JSON as {"word": ..., "weight": ...}.
type Keyword struct {
	// Word is the normalized token text.
	Word string `json:"word"`
	// Weight is the accumulated score of the token; larger means more relevant.
	Weight float32 `json:"weight"`
}
|
|
|
|
// Analyzer wraps jieba and lingua into a thread-safe analysis pipeline.
|
|
type Analyzer struct {
|
|
jieba *gojieba.Jieba
|
|
detector lingua.LanguageDetector
|
|
stopWords map[string]bool
|
|
mu sync.Mutex // gojieba is not goroutine-safe
|
|
}
|
|
|
|
// New creates an Analyzer.
|
|
// stopWordsPath is the JSON file with punctuation/stop words (may be empty string).
|
|
// modelPath is ignored (kept for API compatibility; lingua-go uses built-in data).
|
|
func New(modelPath, stopWordsPath string) (*Analyzer, error) {
|
|
j := gojieba.NewJieba()
|
|
|
|
// Build a lingua detector that covers the languages we care about.
|
|
// AllLanguages() covers 75 languages including Chinese, Japanese, Korean, etc.
|
|
detector := lingua.NewLanguageDetectorBuilder().
|
|
FromAllLanguages().
|
|
WithMinimumRelativeDistance(0.15).
|
|
Build()
|
|
|
|
stopWords := loadStopWords(stopWordsPath)
|
|
|
|
return &Analyzer{
|
|
jieba: j,
|
|
detector: detector,
|
|
stopWords: stopWords,
|
|
}, nil
|
|
}
|
|
|
|
// Close releases resources held by the analyzer.
|
|
func (a *Analyzer) Close() {
|
|
a.jieba.Free()
|
|
}
|
|
|
|
// loadStopWords reads a JSON array of strings from path and returns it as a
// set of lower-cased words.
//
// Loading is best-effort: an empty path, an unreadable file or undecodable
// JSON all yield an empty (non-nil) set rather than an error.
func loadStopWords(path string) map[string]bool {
	set := map[string]bool{}
	if path == "" {
		return set
	}

	file, err := os.Open(path)
	if err != nil {
		return set
	}
	defer file.Close()

	var list []string
	if json.NewDecoder(file).Decode(&list) != nil {
		return set
	}
	for _, word := range list {
		set[strings.ToLower(word)] = true
	}
	return set
}
|
|
|
|
// Tokenize segments a string into tokens using jieba for CJK and space-split for ASCII.
|
|
func (a *Analyzer) Tokenize(s string, searchMode bool) []string {
|
|
if len(s) > 10000 {
|
|
s = s[:10000]
|
|
}
|
|
// Sanitize: replace invalid UTF-8 sequences so gojieba (C++) never sees decode errors.
|
|
s = strings.ToValidUTF8(s, "")
|
|
var result []string
|
|
for _, part := range strings.Fields(s) {
|
|
if isASCIIAlnum(part) {
|
|
result = append(result, part)
|
|
} else {
|
|
a.mu.Lock()
|
|
var tokens []string
|
|
if searchMode {
|
|
tokens = a.jieba.CutForSearch(part, true)
|
|
} else {
|
|
tokens = a.jieba.Cut(part, true)
|
|
}
|
|
a.mu.Unlock()
|
|
result = append(result, tokens...)
|
|
}
|
|
}
|
|
return result
|
|
}
|
|
|
|
// Normalize keeps only ASCII letters, ASCII digits and runes in the range
// U+4E00..U+9FA5 from s, lower-casing the ASCII upper-case letters. Every
// other rune is discarded.
func Normalize(s string) string {
	var out strings.Builder
	for _, r := range s {
		switch {
		case 'A' <= r && r <= 'Z':
			out.WriteRune(unicode.ToLower(r))
		case 'a' <= r && r <= 'z',
			'0' <= r && r <= '9',
			0x4e00 <= r && r <= 0x9fa5:
			out.WriteRune(r)
		}
	}
	return out.String()
}
|
|
|
|
// weightedTokens builds a map of token→weight from a text with an optional weight multiplier.
|
|
func (a *Analyzer) weightedTokens(text string, w float32) map[string]float32 {
|
|
tokens := a.Tokenize(text, false)
|
|
d := make(map[string]float32)
|
|
n := math.Max(8, float64(len(tokens)))
|
|
counts := make(map[string]int)
|
|
for _, t := range tokens {
|
|
t = Normalize(t)
|
|
if t == "" || a.stopWords[t] || len(t) > 32 {
|
|
continue
|
|
}
|
|
counts[t]++
|
|
}
|
|
for k, v := range counts {
|
|
d[k] = float32(math.Min(0.2, float64(v)/n)) * w
|
|
}
|
|
return d
|
|
}
|
|
|
|
// Analyze extracts weighted keywords from title, description, and body text.
|
|
// Returns a slice sorted by weight descending.
|
|
func (a *Analyzer) Analyze(title, description, text string) []Keyword {
|
|
maps := []map[string]float32{
|
|
a.weightedTokens(title, 1.0),
|
|
a.weightedTokens(description, 0.5),
|
|
a.weightedTokens(text, 1.0),
|
|
}
|
|
|
|
combined := make(map[string]float32)
|
|
for _, m := range maps {
|
|
for k := range m {
|
|
combined[k] = 0
|
|
}
|
|
}
|
|
for k := range combined {
|
|
for _, m := range maps {
|
|
combined[k] += m[k]
|
|
}
|
|
}
|
|
|
|
result := make([]Keyword, 0, len(combined))
|
|
for k, v := range combined {
|
|
result = append(result, Keyword{Word: k, Weight: v})
|
|
}
|
|
|
|
sortKeywords(result)
|
|
return result
|
|
}
|
|
|
|
// Segment returns search-mode tokens for a query string.
|
|
func (a *Analyzer) Segment(query string, searchMode bool) []string {
|
|
tokens := a.Tokenize(query, searchMode)
|
|
var result []string
|
|
for _, t := range tokens {
|
|
t = Normalize(t)
|
|
if t == "" || a.stopWords[t] || len(t) > 32 {
|
|
continue
|
|
}
|
|
result = append(result, t)
|
|
}
|
|
return result
|
|
}
|
|
|
|
// linguaToISO639 maps lingua.Language to the ISO 639-1 code used by the rest of the engine.
|
|
// Returns "" for unknown or unsupported languages.
|
|
var linguaToISO639 = map[lingua.Language]string{
|
|
lingua.Chinese: "zh",
|
|
lingua.English: "en",
|
|
lingua.Japanese: "ja",
|
|
lingua.Korean: "ko",
|
|
lingua.French: "fr",
|
|
lingua.German: "de",
|
|
lingua.Spanish: "es",
|
|
lingua.Portuguese: "pt",
|
|
lingua.Italian: "it",
|
|
lingua.Russian: "ru",
|
|
lingua.Arabic: "ar",
|
|
lingua.Hindi: "hi",
|
|
lingua.Dutch: "nl",
|
|
lingua.Polish: "pl",
|
|
lingua.Swedish: "sv",
|
|
lingua.Turkish: "tr",
|
|
lingua.Vietnamese: "vi",
|
|
lingua.Thai: "th",
|
|
lingua.Indonesian: "id",
|
|
lingua.Malay: "ms",
|
|
}
|
|
|
|
// DetectLanguage returns the ISO 639-1 language code for the text, or "".
|
|
func (a *Analyzer) DetectLanguage(text string) string {
|
|
text = strings.ReplaceAll(text, "\n", " ")
|
|
if len(text) > 2000 {
|
|
text = text[:2000]
|
|
}
|
|
if text == "" {
|
|
return ""
|
|
}
|
|
lang, exists := a.detector.DetectLanguageOf(text)
|
|
if !exists {
|
|
return ""
|
|
}
|
|
if code, ok := linguaToISO639[lang]; ok {
|
|
return code
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// ---- sorting ----
|
|
|
|
func sortKeywords(kws []Keyword) {
|
|
for i := 1; i < len(kws); i++ {
|
|
key := kws[i]
|
|
j := i - 1
|
|
for j >= 0 && kws[j].Weight < key.Weight {
|
|
kws[j+1] = kws[j]
|
|
j--
|
|
}
|
|
kws[j+1] = key
|
|
}
|
|
}
|
|
|
|
// isASCIIAlnum reports whether s is non-empty and consists solely of ASCII
// letters and digits.
func isASCIIAlnum(s string) bool {
	if s == "" {
		return false
	}
	// Every byte of a non-ASCII rune is >= 0x80, so a byte-wise scan rejects
	// exactly the same strings as a rune-wise one.
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
			// allowed character
		default:
			return false
		}
	}
	return true
}
|