Signed-off-by: 吴文峰 <kevin@lmve.net>
This commit is contained in:
@@ -0,0 +1,250 @@
|
||||
// Package analyzer provides keyword extraction and language detection.
//
// Keyword extraction uses gojieba for Chinese word segmentation and plain
// whitespace splitting for ASCII alphanumeric words. Language detection uses
// lingua-go (pure Go, no CGo).
package analyzer
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"math"
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
"unicode"
|
||||
|
||||
"github.com/pemistahl/lingua-go"
|
||||
"github.com/yanyiwu/gojieba"
|
||||
)
|
||||
|
||||
// Keyword pairs an extracted term with its relevance weight.
type Keyword struct {
	// Word is the keyword text, serialized under the JSON key "word".
	Word string `json:"word"`
	// Weight is the keyword's score, serialized under the JSON key "weight".
	Weight float32 `json:"weight"`
}
|
||||
|
||||
// Analyzer wraps jieba and lingua into a thread-safe analysis pipeline.
|
||||
type Analyzer struct {
|
||||
jieba *gojieba.Jieba
|
||||
detector lingua.LanguageDetector
|
||||
stopWords map[string]bool
|
||||
mu sync.Mutex // gojieba is not goroutine-safe
|
||||
}
|
||||
|
||||
// New creates an Analyzer.
|
||||
// stopWordsPath is the JSON file with punctuation/stop words (may be empty string).
|
||||
// modelPath is ignored (kept for API compatibility; lingua-go uses built-in data).
|
||||
func New(modelPath, stopWordsPath string) (*Analyzer, error) {
|
||||
j := gojieba.NewJieba()
|
||||
|
||||
// Build a lingua detector that covers the languages we care about.
|
||||
// AllLanguages() covers 75 languages including Chinese, Japanese, Korean, etc.
|
||||
detector := lingua.NewLanguageDetectorBuilder().
|
||||
FromAllLanguages().
|
||||
WithMinimumRelativeDistance(0.15).
|
||||
Build()
|
||||
|
||||
stopWords := loadStopWords(stopWordsPath)
|
||||
|
||||
return &Analyzer{
|
||||
jieba: j,
|
||||
detector: detector,
|
||||
stopWords: stopWords,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Close releases resources held by the analyzer.
|
||||
func (a *Analyzer) Close() {
|
||||
a.jieba.Free()
|
||||
}
|
||||
|
||||
// loadStopWords reads a JSON array of strings from path and returns it as a
// set keyed by the lowercased words.
//
// Loading is best-effort: an empty path, an unreadable file or malformed JSON
// all yield an empty (non-nil) set rather than an error.
func loadStopWords(path string) map[string]bool {
	stop := map[string]bool{}
	if path == "" {
		return stop
	}

	f, err := os.Open(path)
	if err != nil {
		return stop
	}
	defer f.Close()

	var words []string
	if err := json.NewDecoder(f).Decode(&words); err != nil {
		// Discard anything partially decoded; callers get the empty set.
		return stop
	}
	for _, w := range words {
		stop[strings.ToLower(w)] = true
	}
	return stop
}
|
||||
|
||||
// Tokenize segments a string into tokens using jieba for CJK and space-split for ASCII.
|
||||
func (a *Analyzer) Tokenize(s string, searchMode bool) []string {
|
||||
if len(s) > 10000 {
|
||||
s = s[:10000]
|
||||
}
|
||||
// Sanitize: replace invalid UTF-8 sequences so gojieba (C++) never sees decode errors.
|
||||
s = strings.ToValidUTF8(s, "")
|
||||
var result []string
|
||||
for _, part := range strings.Fields(s) {
|
||||
if isASCIIAlnum(part) {
|
||||
result = append(result, part)
|
||||
} else {
|
||||
a.mu.Lock()
|
||||
var tokens []string
|
||||
if searchMode {
|
||||
tokens = a.jieba.CutForSearch(part, true)
|
||||
} else {
|
||||
tokens = a.jieba.Cut(part, true)
|
||||
}
|
||||
a.mu.Unlock()
|
||||
result = append(result, tokens...)
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// Normalize lowercases ASCII letters and drops every rune that is not an
// ASCII letter, an ASCII digit, or a CJK ideograph in the range U+4E00 to
// U+9FA5. Anything else (punctuation, whitespace, accented letters, kana,
// hangul, ...) is removed, so the result may be empty.
func Normalize(s string) string {
	var b strings.Builder
	b.Grow(len(s)) // the output is never longer than the input
	for _, r := range s {
		switch {
		case r >= 'A' && r <= 'Z':
			b.WriteRune(unicode.ToLower(r))
		case r >= 'a' && r <= 'z',
			r >= '0' && r <= '9',
			r >= 0x4e00 && r <= 0x9fa5:
			b.WriteRune(r)
		}
	}
	return b.String()
}
|
||||
|
||||
// weightedTokens builds a map of token→weight from a text with an optional weight multiplier.
|
||||
func (a *Analyzer) weightedTokens(text string, w float32) map[string]float32 {
|
||||
tokens := a.Tokenize(text, false)
|
||||
d := make(map[string]float32)
|
||||
n := math.Max(8, float64(len(tokens)))
|
||||
counts := make(map[string]int)
|
||||
for _, t := range tokens {
|
||||
t = Normalize(t)
|
||||
if t == "" || a.stopWords[t] || len(t) > 32 {
|
||||
continue
|
||||
}
|
||||
counts[t]++
|
||||
}
|
||||
for k, v := range counts {
|
||||
d[k] = float32(math.Min(0.2, float64(v)/n)) * w
|
||||
}
|
||||
return d
|
||||
}
|
||||
|
||||
// Analyze extracts weighted keywords from title, description, and body text.
|
||||
// Returns a slice sorted by weight descending.
|
||||
func (a *Analyzer) Analyze(title, description, text string) []Keyword {
|
||||
maps := []map[string]float32{
|
||||
a.weightedTokens(title, 1.0),
|
||||
a.weightedTokens(description, 0.5),
|
||||
a.weightedTokens(text, 1.0),
|
||||
}
|
||||
|
||||
combined := make(map[string]float32)
|
||||
for _, m := range maps {
|
||||
for k := range m {
|
||||
combined[k] = 0
|
||||
}
|
||||
}
|
||||
for k := range combined {
|
||||
for _, m := range maps {
|
||||
combined[k] += m[k]
|
||||
}
|
||||
}
|
||||
|
||||
result := make([]Keyword, 0, len(combined))
|
||||
for k, v := range combined {
|
||||
result = append(result, Keyword{Word: k, Weight: v})
|
||||
}
|
||||
|
||||
sortKeywords(result)
|
||||
return result
|
||||
}
|
||||
|
||||
// Segment returns search-mode tokens for a query string.
|
||||
func (a *Analyzer) Segment(query string, searchMode bool) []string {
|
||||
tokens := a.Tokenize(query, searchMode)
|
||||
var result []string
|
||||
for _, t := range tokens {
|
||||
t = Normalize(t)
|
||||
if t == "" || a.stopWords[t] || len(t) > 32 {
|
||||
continue
|
||||
}
|
||||
result = append(result, t)
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
// linguaToISO639 maps lingua.Language to the ISO 639-1 code used by the rest of the engine.
|
||||
// Returns "" for unknown or unsupported languages.
|
||||
var linguaToISO639 = map[lingua.Language]string{
|
||||
lingua.Chinese: "zh",
|
||||
lingua.English: "en",
|
||||
lingua.Japanese: "ja",
|
||||
lingua.Korean: "ko",
|
||||
lingua.French: "fr",
|
||||
lingua.German: "de",
|
||||
lingua.Spanish: "es",
|
||||
lingua.Portuguese: "pt",
|
||||
lingua.Italian: "it",
|
||||
lingua.Russian: "ru",
|
||||
lingua.Arabic: "ar",
|
||||
lingua.Hindi: "hi",
|
||||
lingua.Dutch: "nl",
|
||||
lingua.Polish: "pl",
|
||||
lingua.Swedish: "sv",
|
||||
lingua.Turkish: "tr",
|
||||
lingua.Vietnamese: "vi",
|
||||
lingua.Thai: "th",
|
||||
lingua.Indonesian: "id",
|
||||
lingua.Malay: "ms",
|
||||
}
|
||||
|
||||
// DetectLanguage returns the ISO 639-1 language code for the text, or "".
|
||||
func (a *Analyzer) DetectLanguage(text string) string {
|
||||
text = strings.ReplaceAll(text, "\n", " ")
|
||||
if len(text) > 2000 {
|
||||
text = text[:2000]
|
||||
}
|
||||
if text == "" {
|
||||
return ""
|
||||
}
|
||||
lang, exists := a.detector.DetectLanguageOf(text)
|
||||
if !exists {
|
||||
return ""
|
||||
}
|
||||
if code, ok := linguaToISO639[lang]; ok {
|
||||
return code
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// ---- sorting ----
|
||||
|
||||
func sortKeywords(kws []Keyword) {
|
||||
for i := 1; i < len(kws); i++ {
|
||||
key := kws[i]
|
||||
j := i - 1
|
||||
for j >= 0 && kws[j].Weight < key.Weight {
|
||||
kws[j+1] = kws[j]
|
||||
j--
|
||||
}
|
||||
kws[j+1] = key
|
||||
}
|
||||
}
|
||||
|
||||
// isASCIIAlnum reports whether s is non-empty and consists solely of ASCII
// letters and digits.
func isASCIIAlnum(s string) bool {
	if s == "" {
		return false
	}
	// Scanning bytes is enough: every byte of a non-ASCII rune is >= 0x80 and
	// therefore falls outside all three ranges.
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', '0' <= c && c <= '9':
			// allowed
		default:
			return false
		}
	}
	return true
}
|
||||
Reference in New Issue
Block a user