改为本地字典
This commit is contained in:
+16
-1
@@ -9,6 +9,7 @@ import (
|
||||
"encoding/json" // JSON 反序列化(加载屏蔽词列表)
|
||||
"math" // 数学运算(最小值、开方)
|
||||
"os" // 文件系统操作(读取屏蔽词文件)
|
||||
"path/filepath" // 路径处理
|
||||
"strings" // 字符串操作
|
||||
"sync" // 互斥锁(保护 jieba 的非线程安全调用)
|
||||
"unicode" // Unicode 字符判断
|
||||
@@ -76,7 +77,21 @@ type Analyzer struct {
|
||||
// stopWordsPath:屏蔽词 JSON 文件路径(不含文件时传入空字符串)。
|
||||
func New(modelPath, stopWordsPath string) (*Analyzer, error) {
|
||||
// 初始化结巴分词(加载词典,需调用 Free 释放)
|
||||
j := gojieba.NewJieba()
|
||||
// 词典路径:优先使用 ./dict/,否则回退到 go module 缓存
|
||||
dictPath := filepath.Join(".", "dict")
|
||||
jiebaDict := filepath.Join(dictPath, "jieba.dict.utf8")
|
||||
hmmModel := filepath.Join(dictPath, "hmm_model.utf8")
|
||||
posDict := filepath.Join(dictPath, "pos_dict.utf8")
|
||||
idenDict := filepath.Join(dictPath, "iden_dict.utf8")
|
||||
|
||||
var j *gojieba.Jieba
|
||||
if _, err := os.Stat(jiebaDict); err == nil {
|
||||
// 词典存在,使用本地路径
|
||||
j = gojieba.NewJieba(jiebaDict, hmmModel, posDict, idenDict)
|
||||
} else {
|
||||
// 词典不存在,使用默认路径(go module 缓存)
|
||||
j = gojieba.NewJieba()
|
||||
}
|
||||
|
||||
// 构建 lingua 语言检测器,覆盖所有 75 种语言(含中日韩英等)
|
||||
// MinimumRelativeDistance=0.15:降低检测阈值,提高短文本召回率
|
||||
|
||||
@@ -0,0 +1,31 @@
|
||||
# CppJieba字典
|
||||
|
||||
文件后缀名代表的是词典的编码方式。
|
||||
例如:filename.utf8 采用 UTF-8 编码,filename.gbk 采用 GBK 编码。
|
||||
|
||||
|
||||
## 分词
|
||||
|
||||
### jieba.dict.utf8/gbk
|
||||
|
||||
作为最大概率法(MPSegment: Max Probability)分词所使用的词典。
|
||||
|
||||
### hmm_model.utf8/gbk
|
||||
|
||||
作为隐马尔可夫模型(HMMSegment: Hidden Markov Model)分词所使用的词典。
|
||||
|
||||
__对于MixSegment(混合MPSegment和HMMSegment两者)则同时使用以上两个词典__
|
||||
|
||||
|
||||
## 关键词抽取
|
||||
|
||||
### idf.utf8
|
||||
|
||||
IDF(Inverse Document Frequency,逆文档频率)词典,记录每个词的 IDF 值。
|
||||
在KeywordExtractor中,使用的是经典的TF-IDF算法,所以需要这么一个词典提供IDF信息。
|
||||
|
||||
### stop_words.utf8
|
||||
|
||||
停用词词典
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
+258826
File diff suppressed because it is too large
Load Diff
+348982
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
@@ -0,0 +1,259 @@
|
||||
#初始状态的概率
|
||||
#格式
|
||||
#状态:概率
|
||||
B,a:-4.7623052146
|
||||
B,ad:-6.68006603678
|
||||
B,ag:-3.14e+100
|
||||
B,an:-8.69708322302
|
||||
B,b:-5.01837436211
|
||||
B,bg:-3.14e+100
|
||||
B,c:-3.42388018495
|
||||
B,d:-3.97504752976
|
||||
B,df:-8.88897423083
|
||||
B,dg:-3.14e+100
|
||||
B,e:-8.56355183039
|
||||
B,en:-3.14e+100
|
||||
B,f:-5.49163041848
|
||||
B,g:-3.14e+100
|
||||
B,h:-13.53336513
|
||||
B,i:-6.11578472756
|
||||
B,in:-3.14e+100
|
||||
B,j:-5.05761912847
|
||||
B,jn:-3.14e+100
|
||||
B,k:-3.14e+100
|
||||
B,l:-4.90588358466
|
||||
B,ln:-3.14e+100
|
||||
B,m:-3.6524299819
|
||||
B,mg:-3.14e+100
|
||||
B,mq:-6.7869530014
|
||||
B,n:-1.69662577975
|
||||
B,ng:-3.14e+100
|
||||
B,nr:-2.23104959138
|
||||
B,nrfg:-5.87372217541
|
||||
B,nrt:-4.98564273352
|
||||
B,ns:-2.8228438315
|
||||
B,nt:-4.84609166818
|
||||
B,nz:-3.94698846058
|
||||
B,o:-8.43349870215
|
||||
B,p:-4.20098413209
|
||||
B,q:-6.99812385896
|
||||
B,qe:-3.14e+100
|
||||
B,qg:-3.14e+100
|
||||
B,r:-3.40981877908
|
||||
B,rg:-3.14e+100
|
||||
B,rr:-12.4347528413
|
||||
B,rz:-7.94611647157
|
||||
B,s:-5.52267359084
|
||||
B,t:-3.36474790945
|
||||
B,tg:-3.14e+100
|
||||
B,u:-9.1639172775
|
||||
B,ud:-3.14e+100
|
||||
B,ug:-3.14e+100
|
||||
B,uj:-3.14e+100
|
||||
B,ul:-3.14e+100
|
||||
B,uv:-3.14e+100
|
||||
B,uz:-3.14e+100
|
||||
B,v:-2.67405848743
|
||||
B,vd:-9.04472876024
|
||||
B,vg:-3.14e+100
|
||||
B,vi:-12.4347528413
|
||||
B,vn:-4.33156108902
|
||||
B,vq:-12.1470707689
|
||||
B,w:-3.14e+100
|
||||
B,x:-3.14e+100
|
||||
B,y:-9.84448567586
|
||||
B,yg:-3.14e+100
|
||||
B,z:-7.04568111149
|
||||
B,zg:-3.14e+100
|
||||
E,a:-3.14e+100
|
||||
E,ad:-3.14e+100
|
||||
E,ag:-3.14e+100
|
||||
E,an:-3.14e+100
|
||||
E,b:-3.14e+100
|
||||
E,bg:-3.14e+100
|
||||
E,c:-3.14e+100
|
||||
E,d:-3.14e+100
|
||||
E,df:-3.14e+100
|
||||
E,dg:-3.14e+100
|
||||
E,e:-3.14e+100
|
||||
E,en:-3.14e+100
|
||||
E,f:-3.14e+100
|
||||
E,g:-3.14e+100
|
||||
E,h:-3.14e+100
|
||||
E,i:-3.14e+100
|
||||
E,in:-3.14e+100
|
||||
E,j:-3.14e+100
|
||||
E,jn:-3.14e+100
|
||||
E,k:-3.14e+100
|
||||
E,l:-3.14e+100
|
||||
E,ln:-3.14e+100
|
||||
E,m:-3.14e+100
|
||||
E,mg:-3.14e+100
|
||||
E,mq:-3.14e+100
|
||||
E,n:-3.14e+100
|
||||
E,ng:-3.14e+100
|
||||
E,nr:-3.14e+100
|
||||
E,nrfg:-3.14e+100
|
||||
E,nrt:-3.14e+100
|
||||
E,ns:-3.14e+100
|
||||
E,nt:-3.14e+100
|
||||
E,nz:-3.14e+100
|
||||
E,o:-3.14e+100
|
||||
E,p:-3.14e+100
|
||||
E,q:-3.14e+100
|
||||
E,qe:-3.14e+100
|
||||
E,qg:-3.14e+100
|
||||
E,r:-3.14e+100
|
||||
E,rg:-3.14e+100
|
||||
E,rr:-3.14e+100
|
||||
E,rz:-3.14e+100
|
||||
E,s:-3.14e+100
|
||||
E,t:-3.14e+100
|
||||
E,tg:-3.14e+100
|
||||
E,u:-3.14e+100
|
||||
E,ud:-3.14e+100
|
||||
E,ug:-3.14e+100
|
||||
E,uj:-3.14e+100
|
||||
E,ul:-3.14e+100
|
||||
E,uv:-3.14e+100
|
||||
E,uz:-3.14e+100
|
||||
E,v:-3.14e+100
|
||||
E,vd:-3.14e+100
|
||||
E,vg:-3.14e+100
|
||||
E,vi:-3.14e+100
|
||||
E,vn:-3.14e+100
|
||||
E,vq:-3.14e+100
|
||||
E,w:-3.14e+100
|
||||
E,x:-3.14e+100
|
||||
E,y:-3.14e+100
|
||||
E,yg:-3.14e+100
|
||||
E,z:-3.14e+100
|
||||
E,zg:-3.14e+100
|
||||
M,a:-3.14e+100
|
||||
M,ad:-3.14e+100
|
||||
M,ag:-3.14e+100
|
||||
M,an:-3.14e+100
|
||||
M,b:-3.14e+100
|
||||
M,bg:-3.14e+100
|
||||
M,c:-3.14e+100
|
||||
M,d:-3.14e+100
|
||||
M,df:-3.14e+100
|
||||
M,dg:-3.14e+100
|
||||
M,e:-3.14e+100
|
||||
M,en:-3.14e+100
|
||||
M,f:-3.14e+100
|
||||
M,g:-3.14e+100
|
||||
M,h:-3.14e+100
|
||||
M,i:-3.14e+100
|
||||
M,in:-3.14e+100
|
||||
M,j:-3.14e+100
|
||||
M,jn:-3.14e+100
|
||||
M,k:-3.14e+100
|
||||
M,l:-3.14e+100
|
||||
M,ln:-3.14e+100
|
||||
M,m:-3.14e+100
|
||||
M,mg:-3.14e+100
|
||||
M,mq:-3.14e+100
|
||||
M,n:-3.14e+100
|
||||
M,ng:-3.14e+100
|
||||
M,nr:-3.14e+100
|
||||
M,nrfg:-3.14e+100
|
||||
M,nrt:-3.14e+100
|
||||
M,ns:-3.14e+100
|
||||
M,nt:-3.14e+100
|
||||
M,nz:-3.14e+100
|
||||
M,o:-3.14e+100
|
||||
M,p:-3.14e+100
|
||||
M,q:-3.14e+100
|
||||
M,qe:-3.14e+100
|
||||
M,qg:-3.14e+100
|
||||
M,r:-3.14e+100
|
||||
M,rg:-3.14e+100
|
||||
M,rr:-3.14e+100
|
||||
M,rz:-3.14e+100
|
||||
M,s:-3.14e+100
|
||||
M,t:-3.14e+100
|
||||
M,tg:-3.14e+100
|
||||
M,u:-3.14e+100
|
||||
M,ud:-3.14e+100
|
||||
M,ug:-3.14e+100
|
||||
M,uj:-3.14e+100
|
||||
M,ul:-3.14e+100
|
||||
M,uv:-3.14e+100
|
||||
M,uz:-3.14e+100
|
||||
M,v:-3.14e+100
|
||||
M,vd:-3.14e+100
|
||||
M,vg:-3.14e+100
|
||||
M,vi:-3.14e+100
|
||||
M,vn:-3.14e+100
|
||||
M,vq:-3.14e+100
|
||||
M,w:-3.14e+100
|
||||
M,x:-3.14e+100
|
||||
M,y:-3.14e+100
|
||||
M,yg:-3.14e+100
|
||||
M,z:-3.14e+100
|
||||
M,zg:-3.14e+100
|
||||
S,a:-3.90253968313
|
||||
S,ad:-11.0484584802
|
||||
S,ag:-6.95411391796
|
||||
S,an:-12.8402179494
|
||||
S,b:-6.47288876397
|
||||
S,bg:-3.14e+100
|
||||
S,c:-4.78696679586
|
||||
S,d:-3.90391976418
|
||||
S,df:-3.14e+100
|
||||
S,dg:-8.9483976513
|
||||
S,e:-5.94251300628
|
||||
S,en:-3.14e+100
|
||||
S,f:-5.19482024998
|
||||
S,g:-6.50782681533
|
||||
S,h:-8.65056320738
|
||||
S,i:-3.14e+100
|
||||
S,in:-3.14e+100
|
||||
S,j:-4.91199211964
|
||||
S,jn:-3.14e+100
|
||||
S,k:-6.94032059583
|
||||
S,l:-3.14e+100
|
||||
S,ln:-3.14e+100
|
||||
S,m:-3.26920065212
|
||||
S,mg:-10.8253149289
|
||||
S,mq:-3.14e+100
|
||||
S,n:-3.85514838976
|
||||
S,ng:-4.9134348611
|
||||
S,nr:-4.48366310396
|
||||
S,nrfg:-3.14e+100
|
||||
S,nrt:-3.14e+100
|
||||
S,ns:-3.14e+100
|
||||
S,nt:-12.1470707689
|
||||
S,nz:-3.14e+100
|
||||
S,o:-8.46446092775
|
||||
S,p:-2.98684018136
|
||||
S,q:-4.88865861826
|
||||
S,qe:-3.14e+100
|
||||
S,qg:-3.14e+100
|
||||
S,r:-2.76353367841
|
||||
S,rg:-10.2752685919
|
||||
S,rr:-3.14e+100
|
||||
S,rz:-3.14e+100
|
||||
S,s:-3.14e+100
|
||||
S,t:-3.14e+100
|
||||
S,tg:-6.27284253188
|
||||
S,u:-6.94032059583
|
||||
S,ud:-7.72823016105
|
||||
S,ug:-7.53940370266
|
||||
S,uj:-6.85251045118
|
||||
S,ul:-8.41537131755
|
||||
S,uv:-8.15808672229
|
||||
S,uz:-9.29925862537
|
||||
S,v:-3.05329230341
|
||||
S,vd:-3.14e+100
|
||||
S,vg:-5.94301818437
|
||||
S,vi:-3.14e+100
|
||||
S,vn:-11.4539235883
|
||||
S,vq:-3.14e+100
|
||||
S,w:-3.14e+100
|
||||
S,x:-8.42741965607
|
||||
S,y:-6.19707946995
|
||||
S,yg:-13.53336513
|
||||
S,z:-3.14e+100
|
||||
S,zg:-3.14e+100
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,4 @@
|
||||
云计算
|
||||
韩玉鉴赏
|
||||
蓝翔 nz
|
||||
区块链 10 nz
|
||||
@@ -103,6 +103,7 @@ echo "==> 部署到 $APP_PATH..."
|
||||
mkdir -p "$APP_PATH"
|
||||
cp "$PROJECT_DIR/$APP_NAME" "$APP_PATH/"
|
||||
cp -r "$PROJECT_DIR/dist" "$APP_PATH/"
|
||||
cp -r "$PROJECT_DIR/dict" "$APP_PATH/"
|
||||
|
||||
echo "==> 配置用户..."
|
||||
if id -u www &> /dev/null; then
|
||||
|
||||
Reference in New Issue
Block a user