# README
gse
Go efficient multilingual NLP and text segmentation library; it supports English, Chinese, Japanese and other languages, and integrates with Elasticsearch and Bleve.
Gse is a Go implementation of jieba that also aims to add NLP support and more features.
Features:
- Supports multiple word segmentation modes: common, search engine, full mode, precise mode and HMM mode (see the sketch after this list)
- Supports user and embedded dictionaries, part-of-speech (POS) tagging, segment analysis, stop words and word trimming
- Supports multiple languages: English, Chinese, Japanese and others
- Supports Traditional Chinese
- Supports HMM text segmentation using the Viterbi algorithm
- NLP support via TensorFlow (work in progress)
- Named entity recognition (work in progress)
- Integrates with Elasticsearch and Bleve
- Can run as a JSON-RPC service
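For example, the segmentation modes map to different Cut methods on the segmenter. A minimal sketch, assuming the optional hmm flag toggles HMM handling of words not in the dictionary (the output depends on the dictionaries loaded):

package main

import (
	"fmt"

	"github.com/go-ego/gse"
)

func main() {
	var seg gse.Segmenter
	// Load the default dictionary; see the Use section below for custom dictionaries.
	seg.LoadDict()

	text := "Hello world, 你好世界."
	fmt.Println("precise: ", seg.Cut(text))             // precise mode
	fmt.Println("hmm:     ", seg.Cut(text, true))       // precise mode plus HMM for unknown words
	fmt.Println("full:    ", seg.CutAll(text))          // full mode
	fmt.Println("search:  ", seg.CutSearch(text, true)) // search engine mode
}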
Algorithms:
- The dictionary is implemented with a double-array trie.
- The segmenter uses a shortest-path algorithm based on word frequency and dynamic programming, together with DAG- and HMM-based segmentation (a toy illustration follows this list).
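To illustrate the idea only (this is not gse's code): collect every dictionary word that starts at each offset of the text (the DAG), then use dynamic programming to pick the split with the best total log frequency, which is equivalent to a shortest path over negated scores. A toy sketch with a made-up dictionary:

package main

import (
	"fmt"
	"math"
)

func main() {
	// Hypothetical dictionary: word -> frequency (values made up for illustration).
	freq := map[string]float64{"to": 50, "be": 40, "or": 30, "not": 30, "tobe": 20}
	total := 170.0
	fmt.Println(segment("tobeornot", freq, total))
}

// segment considers every dictionary word starting at each byte offset (the DAG)
// and picks, via dynamic programming, the split that maximizes the sum of
// log-probabilities. Byte offsets are fine for this ASCII-only example.
func segment(text string, freq map[string]float64, total float64) []string {
	n := len(text)
	best := make([]float64, n+1) // best[i] = best score for text[i:]; best[n] = 0
	next := make([]int, n+1)     // next[i] = end offset of the word chosen at i
	for i := n - 1; i >= 0; i-- {
		best[i] = math.Inf(-1)
		for j := i + 1; j <= n; j++ {
			f, ok := freq[text[i:j]]
			if !ok {
				if j-i > 1 {
					continue // not a dictionary word
				}
				f = 1 // give unknown single characters a small score
			}
			if score := math.Log(f/total) + best[j]; score > best[i] {
				best[i], next[i] = score, j
			}
		}
	}
	var out []string
	for i := 0; i < n; i = next[i] {
		out = append(out, text[i:next[i]])
	}
	return out
}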
Text segmentation speed:
- single thread: 9.2 MB/s
- concurrent goroutines: 26.8 MB/s (see the sketch after this list)
- HMM text segmentation, single thread: 3.2 MB/s (2-core, 4-thread MacBook Pro)
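The concurrent figure comes from segmenting chunks of text in parallel goroutines. A minimal sketch, assuming the Segmenter is only written while loading the dictionary and then only read concurrently (as the benchmark above implies); splitting real text this naively may cut words at chunk boundaries:

package main

import (
	"fmt"
	"sync"

	"github.com/go-ego/gse"
)

func main() {
	var seg gse.Segmenter
	seg.LoadDict() // load once, before starting the goroutines

	chunks := []string{"你好世界", "Hello world", "こんにちは世界"}
	results := make([][]string, len(chunks))

	var wg sync.WaitGroup
	for i, c := range chunks {
		wg.Add(1)
		go func(i int, c string) {
			defer wg.Done()
			results[i] = seg.Cut(c, true) // each goroutine only reads the segmenter
		}(i, c)
	}
	wg.Wait()
	fmt.Println(results)
}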
Bindings:
gse-bind binds gse to JavaScript and other languages.
Install / update
With Go module support (Go 1.11+), just import:
import "github.com/go-ego/gse"
Otherwise, to install the gse package, run the command:
go get -u github.com/go-ego/gse
Use
package main

import (
	_ "embed"
	"fmt"

	"github.com/go-ego/gse"
)

//go:embed testdata/test_en2.txt
var testDict string

//go:embed testdata/test_en.txt
var testEn string

var (
	text  = "To be or not to be, that's the question!"
	test1 = "Hiworld, Helloworld!"
)

func main() {
	var seg1 gse.Segmenter
	seg1.DictSep = ","
	err := seg1.LoadDict("./testdata/test_en.txt")
	if err != nil {
		fmt.Println("Load dictionary error: ", err)
	}

	s1 := seg1.Cut(text)
	fmt.Println("seg1 Cut: ", s1)
	// seg1 Cut: [to be or not to be , that's the question!]

	var seg2 gse.Segmenter
	seg2.AlphaNum = true
	seg2.LoadDict("./testdata/test_en_dict3.txt")

	s2 := seg2.Cut(test1)
	fmt.Println("seg2 Cut: ", s2)
	// seg2 Cut: [hi world , hello world !]

	var seg3 gse.Segmenter
	seg3.AlphaNum = true
	seg3.DictSep = ","
	err = seg3.LoadDictEmbed(testDict + "\n" + testEn)
	if err != nil {
		fmt.Println("LoadDictEmbed error: ", err)
	}

	s3 := seg3.Cut(text + test1)
	fmt.Println("seg3 Cut: ", s3)
	// seg3 Cut: [to be or not to be , that's the question! hi world , hello world !]
	// example2()
}
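The dictionary files used above follow a jieba-style layout: one entry per line containing the word, its frequency and an optional POS tag, separated by DictSep when it is set (here ","). A minimal sketch with a hypothetical inline dictionary (entries and frequencies are made up for illustration):

package main

import (
	"fmt"

	"github.com/go-ego/gse"
)

// A hypothetical inline dictionary: word, frequency and an optional POS tag
// per line, separated by DictSep (",").
const dict = "hello,32,n\nworld,20,n\nhello world,10,n"

func main() {
	var seg gse.Segmenter
	seg.DictSep = ","
	seg.AlphaNum = true
	if err := seg.LoadDictEmbed(dict); err != nil {
		fmt.Println("load dictionary error: ", err)
	}

	fmt.Println(seg.Cut("Helloworld!", true))
}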
Example 2:
package main

import (
	"fmt"
	"regexp"

	"github.com/go-ego/gse"
	"github.com/go-ego/gse/hmm/pos"
)

var (
	text = "Hello world, Helloworld. Winter is coming! こんにちは世界, 你好世界."

	new, _ = gse.New("zh,testdata/test_en_dict3.txt", "alpha")

	seg    gse.Segmenter
	posSeg pos.Segmenter
)

func main() {
	// Loading the default dictionary
	seg.LoadDict()
	// Loading the default dictionary with embed
	// seg.LoadDictEmbed()
	//
	// Loading the Simplified Chinese dictionary
	// seg.LoadDict("zh_s")
	// seg.LoadDictEmbed("zh_s")
	//
	// Loading the Traditional Chinese dictionary
	// seg.LoadDict("zh_t")
	//
	// Loading the Japanese dictionary
	// seg.LoadDict("jp")
	//
	// Load the dictionary
	// seg.LoadDict("your gopath"+"/src/github.com/go-ego/gse/data/dict/dictionary.txt")

	cut()
	segCut()
}

func cut() {
	hmm := new.Cut(text, true)
	fmt.Println("cut use hmm: ", hmm)

	hmm = new.CutSearch(text, true)
	fmt.Println("cut search use hmm: ", hmm)
	fmt.Println("analyze: ", new.Analyze(hmm, text))

	hmm = new.CutAll(text)
	fmt.Println("cut all: ", hmm)

	reg := regexp.MustCompile(`(\d+年|\d+月|\d+日|[\p{Latin}]+|[\p{Hangul}]+|\d+\.\d+|[a-zA-Z0-9]+)`)
	text1 := `헬로월드 헬로 서울, 2021年09月10日, 3.14`
	hmm = seg.CutDAG(text1, reg)
	fmt.Println("Cut with hmm and regexp: ", hmm, hmm[0], hmm[6])
}

func analyzeAndTrim(cut []string) {
	a := seg.Analyze(cut, "")
	fmt.Println("analyze the segment: ", a)

	cut = seg.Trim(cut)
	fmt.Println("cut all: ", cut)

	fmt.Println(seg.String(text, true))
	fmt.Println(seg.Slice(text, true))
}

func cutPos() {
	po := seg.Pos(text, true)
	fmt.Println("pos: ", po)
	po = seg.TrimPos(po)
	fmt.Println("trim pos: ", po)

	pos.WithGse(seg)
	po = posSeg.Cut(text, true)
	fmt.Println("pos: ", po)

	po = posSeg.TrimWithPos(po, "zg")
	fmt.Println("trim pos: ", po)
}

func segCut() {
	// Text segmentation
	tb := []byte(text)
	fmt.Println(seg.String(text, true))

	segments := seg.Segment(tb)
	// Handle word segmentation results, search mode
	fmt.Println(gse.ToString(segments, true))
}
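Note that analyzeAndTrim and cutPos above are defined but not called from main; invoke them (for example analyzeAndTrim(seg.Cut(text, true)) and cutPos()) to try segment analysis, trimming and POS tagging.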
Look at a custom dictionary example:
package main

import (
	_ "embed"
	"fmt"

	"github.com/go-ego/gse"
)

//go:embed test_en_dict3.txt
var testDict string

func main() {
	// var seg gse.Segmenter
	// seg.LoadDict("zh, testdata/zh/test_dict.txt, testdata/zh/test_dict1.txt")
	// seg.LoadStop()
	seg, err := gse.NewEmbed("zh, word 20 n"+testDict, "en")
	if err != nil {
		fmt.Println("NewEmbed error: ", err)
	}
	// seg.LoadDictEmbed()
	seg.LoadStopEmbed()

	text1 := "Hello world, こんにちは世界, 你好世界!"
	s1 := seg.Cut(text1, true)
	fmt.Println(s1)
	fmt.Println("trim: ", seg.Trim(s1))
	fmt.Println("stop: ", seg.Stop(s1))
	fmt.Println(seg.String(text1, true))

	segments := seg.Segment([]byte(text1))
	fmt.Println(gse.ToString(segments))
}
Elasticsearch
How to use it with Elasticsearch?
Authors
License
Gse is primarily distributed under the terms of "both the MIT license and the Apache License (Version 2.0)". See LICENSE-APACHE, LICENSE-MIT.
# Functions
- DictPaths gets the dictionary paths.
- FilterEmoji filters emoji from the text.
- FilterHtml filters HTML tags from the text.
- FilterLang filters the text by language.
- FilterSymbol filters symbols from the text.
- FindAllOccs finds the start positions of all occurrences of the search bytes in data.
- GetVersion gets the version of gse.
- IsJp returns true if the character is Japanese.
- Join is an efficient way to splice strings.
- New returns a new gse segmenter.
- NewDict creates a new dictionary trie.
- NewEmbed returns a new gse segmenter using an embedded dictionary.
- Range ranges over text and returns a []string.
- RangeText ranges over text and returns a string.
- SplitNum cuts a string by num into a []string.
- SplitNums cuts a string by num into a string.
- SplitWords splits a string into token words.
- ToPos converts a segments slice to []SegPos.
- ToSlice converts a segments slice to a string slice.
- ToString converts a segments slice to a string. It has two output modes: normal mode (searchMode=false, the default) and search mode (searchMode=true); search mode is used for search engines and outputs more results (see the sketch below).
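A minimal sketch of the two ToString output modes (the exact output depends on the loaded dictionary):

package main

import (
	"fmt"

	"github.com/go-ego/gse"
)

func main() {
	var seg gse.Segmenter
	seg.LoadDict()

	segments := seg.Segment([]byte("Hello world, 你好世界."))
	fmt.Println(gse.ToString(segments))       // normal mode: searchMode defaults to false
	fmt.Println(gse.ToString(segments, true)) // search mode: outputs more results for search engines
}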
# Constants
- RatioWord is the ratio of words and letters.
- RatioWordFull is the full ratio of words and letters.
- Version is the gse version.
# Variables
- StopWordMap is the default stop words map.
- ToLower sets alpha characters to lowercase.
- go:embed data/dict/zh/idf.txt (the embedded IDF data).
# Structs
- AnalyzeToken is the segment analysis info structure.
- Dictionary implements a string double-array trie.
- Prob defines the HMM model struct.
- Segment represents a segment in the text.
- Segmenter defines the segmenter structure.
- SegPos is a POS struct type.
- Token defines a segment token structure.
# Type aliases
- Text is a string type used to parse text.