// Copyright 2013 Hui Chen
// Copyright 2016 ego authors
//
// Licensed under the Apache License, Version 2.0 (the "License"): you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.

/*
Package gse is an efficient text segmentation library for Go
(word segmentation for the Go language).
*/
package gse

import (
	"bufio"
	"fmt"
	"io"
	"log"
	"math"
	"os"
	"path"
	"runtime"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"
)

const (
	version string = "v0.10.0.106, Danube River!"

	// minTokenFrequency is the minimum frequency a token in a dictionary
	// file must have in order to be loaded.
	minTokenFrequency = 2
)

// GetVersion returns the gse version.
func GetVersion() string {
	return version
}

// Segmenter is the segmenter struct.
type Segmenter struct {
	dict *Dictionary
}

// jumper records the forward segmentation jump information at a given
// character during the Viterbi search: the token taken to reach it and
// the shortest path value from the start of the text segment.
type jumper struct {
	minDistance float32
	token       *Token
}

// Dictionary returns the dictionary used by the segmenter.
func (seg *Segmenter) Dictionary() *Dictionary {
	return seg.dict
}

// getCurrentFilePath returns the path of the current source file.
func getCurrentFilePath() string {
	_, filePath, _, _ := runtime.Caller(1)
	return filePath
}

// Read reads a dictionary file and loads its tokens.
func (seg *Segmenter) Read(file string) error {
	log.Printf("Load the gse dictionary: \"%s\" ", file)
	dictFile, err := os.Open(file)
	if err != nil {
		log.Printf("Could not load dictionaries: \"%s\", %v \n", file, err)
		return err
	}
	defer dictFile.Close()

	reader := bufio.NewReader(dictFile)
	var (
		text      string
		freqText  string
		frequency int
		pos       string
	)

	// Read the dictionary line by line.
	line := 0
	for {
		line++
		size, fsErr := fmt.Fscanln(reader, &text, &freqText, &pos)
		if fsErr != nil {
			if fsErr == io.EOF {
				// End of file
				break
			}

			if size > 0 {
				log.Printf("File '%v' line \"%v\" read error: %v, skip",
					file, line, fsErr.Error())
			} else {
				log.Printf("File '%v' line \"%v\" is empty, read error: %v, skip",
					file, line, fsErr.Error())
			}
		}

		if size == 0 {
			// End of file or an invalid line; skip it.
			continue
		} else if size < 2 {
			// Invalid line
			continue
		} else if size == 2 {
			// Use an empty string when the line has no part-of-speech tag.
			pos = ""
		}

		// Parse the frequency.
		frequency, err = strconv.Atoi(freqText)
		if err != nil {
			continue
		}

		// Filter out tokens whose frequency is too small.
		if frequency < minTokenFrequency {
			continue
		}

		// Lower the frequency of single-character tokens.
		if len([]rune(text)) < 2 {
			frequency = 2
		}

		// Add the token to the dictionary.
		words := splitTextToWords([]byte(text))
		token := Token{text: words, frequency: frequency, pos: pos}
		seg.dict.addToken(token)
	}

	return nil
}

// DictPaths returns the dictionary file paths for the given dictionary
// directory and file specification.
func DictPaths(dictDir, filePath string) (files []string) {
	var dictPath string

	if filePath == "en" {
		return
	}

	if filePath == "zh" {
		dictPath = path.Join(dictDir, "dict/dictionary.txt")
		files = []string{dictPath}
		return
	}

	if filePath == "jp" {
		dictPath = path.Join(dictDir, "dict/jp/dict.txt")
		files = []string{dictPath}
		return
	}

	fileName := strings.Split(filePath, ",")
	for i := 0; i < len(fileName); i++ {
		if fileName[i] == "jp" {
			dictPath = path.Join(dictDir, "dict/jp/dict.txt")
		}

		if fileName[i] == "zh" {
			dictPath = path.Join(dictDir, "dict/dictionary.txt")
		}

		// Names other than the built-in languages are treated as
		// user-supplied dictionary paths.
		isCustomDict := fileName[i] != "en" && fileName[i] != "zh" &&
			fileName[i] != "jp" && fileName[i] != "ti"
		if isCustomDict {
			dictPath = fileName[i]
		}

		if dictPath != "" {
			files = append(files, dictPath)
		}
	}

	log.Println("Dict files path: ", files)
	return
}
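
// For example, with the rules above ("user_dict.txt" is a hypothetical
// user-supplied file):
//
//	DictPaths("/data", "zh")               // ["/data/dict/dictionary.txt"]
//	DictPaths("/data", "jp,user_dict.txt") // ["/data/dict/jp/dict.txt", "user_dict.txt"]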

// IsJp reports whether segText contains a Japanese character
// (hiragana or katakana).
func IsJp(segText string) bool {
	for _, r := range segText {
		jp := unicode.Is(unicode.Scripts["Hiragana"], r) ||
			unicode.Is(unicode.Scripts["Katakana"], r)
		if jp {
			return true
		}
	}
	return false
}
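
// For example:
//
//	IsJp("とうきょう") // true: hiragana
//	IsJp("東京")     // false: kanji (Han script) is not matched here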

// SegToken computes the path value of every token and pre-computes the
// fine-grained sub-segments used by search mode.
func (seg *Segmenter) SegToken() {
	// Compute the path value of each token; see the Token struct comments
	// for its meaning.
	logTotalFrequency := float32(math.Log2(float64(seg.dict.totalFrequency)))
	for i := range seg.dict.tokens {
		token := &seg.dict.tokens[i]
		token.distance = logTotalFrequency - float32(math.Log2(float64(token.frequency)))
	}

	// Sub-segment each token for search engine mode; see the Token struct
	// comments for how this mode is used.
	for i := range seg.dict.tokens {
		token := &seg.dict.tokens[i]
		segments := seg.segmentWords(token.text, true)

		// Count the sub-tokens that need to be added.
		numTokensToAdd := 0
		for iToken := 0; iToken < len(segments); iToken++ {
			// TODO: this deserves further thought, in particular when the
			// dictionary contains English compound words.
			if len(segments[iToken].token.text) > 0 {
				hasJp := false
				if len(segments[iToken].token.text) == 1 {
					segText := string(segments[iToken].token.text[0])
					hasJp = IsJp(segText)
				}

				if !hasJp {
					numTokensToAdd++
				}
			}
		}
		token.segments = make([]*Segment, numTokensToAdd)

		// Add the sub-tokens.
		iSegmentsToAdd := 0
		for iToken := 0; iToken < len(segments); iToken++ {
			if len(segments[iToken].token.text) > 0 {
				hasJp := false
				if len(segments[iToken].token.text) == 1 {
					segText := string(segments[iToken].token.text[0])
					hasJp = IsJp(segText)
				}

				if !hasJp {
					token.segments[iSegmentsToAdd] = &segments[iToken]
					iSegmentsToAdd++
				}
			}
		}
	}
}
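
// The path value assigned above is
//
//	distance = log2(totalFrequency) - log2(frequency) = log2(totalFrequency / frequency)
//
// so frequent tokens get shorter distances. With illustrative numbers: if
// totalFrequency is 2^20, a token with frequency 1024 gets distance 10 and
// a token with frequency 2 gets distance 19; the Viterbi search in
// segmentWords prefers the segmentation with the smaller distance sum.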

// LoadDict loads the dictionary from one or more files.
//
// The format of a dictionary is one token per line:
//
//	token text, frequency, part of speech
//
// Multiple dictionary files can be loaded, with the file names separated
// by ",", for example: "user_dictionary.txt,common_dictionary.txt".
// Dictionaries listed first take priority: when a token appears in both
// the user dictionary and the common dictionary, the user dictionary
// entry is used.
func (seg *Segmenter) LoadDict(files ...string) error {
	seg.dict = NewDict()

	var (
		dictDir  = path.Join(path.Dir(getCurrentFilePath()), "data")
		dictPath string
	)

	if len(files) > 0 {
		dictFiles := DictPaths(dictDir, files[0])
		if len(dictFiles) > 0 {
			for i := 0; i < len(dictFiles); i++ {
				err := seg.Read(dictFiles[i])
				if err != nil {
					return err
				}
			}
		}
	}

	if len(files) == 0 {
		dictPath = path.Join(dictDir, "dict/dictionary.txt")
		err := seg.Read(dictPath)
		if err != nil {
			return err
		}
	}

	seg.SegToken()
	log.Println("Gse dictionary loading finished.")

	return nil
}
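
// A minimal usage sketch (assuming the bundled dictionary data is installed
// alongside the package source, as getCurrentFilePath expects):
//
//	var seg Segmenter
//	if err := seg.LoadDict(); err != nil {
//		log.Fatal(err)
//	}
//	segs := seg.Segment([]byte("中文分词"))
//	fmt.Println(ToString(segs))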

// Segment segments a text.
//
// Input:
//
//	bytes	UTF8 text as a byte slice
//
// Output:
//
//	[]Segment	the segmentation result
func (seg *Segmenter) Segment(bytes []byte) []Segment {
	return seg.internalSegment(bytes, false)
}

// ModeSegment segments a text, using search mode if searchMode is true.
func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment {
	var mode bool
	if len(searchMode) > 0 {
		mode = searchMode[0]
	}

	return seg.internalSegment(bytes, mode)
}

// Slice segments a text with ModeSegment and returns a []string,
// using search mode if searchMode is true.
func (seg *Segmenter) Slice(bytes []byte, searchMode ...bool) []string {
	segs := seg.ModeSegment(bytes, searchMode...)
	return ToSlice(segs, searchMode...)
}

// String segments a text with ModeSegment and returns a string,
// using search mode if searchMode is true.
func (seg *Segmenter) String(bytes []byte, searchMode ...bool) string {
	segs := seg.ModeSegment(bytes, searchMode...)
	return ToString(segs, searchMode...)
}
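
// For example, given a Segmenter seg loaded as in the sketch above (the
// exact output depends on the loaded dictionary):
//
//	words := seg.Slice([]byte("中文分词"), true) // search mode
//	// words might be []string{"中文", "分词"} plus finer sub-segments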

func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
	// Handle the special case of empty input.
	if len(bytes) == 0 {
		return nil
	}

	// Split the text into characters.
	text := splitTextToWords(bytes)

	return seg.segmentWords(text, searchMode)
}

func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
	// In search mode, a single-character token cannot be divided further.
	if searchMode && len(text) == 1 {
		return nil
	}

	// jumpers holds the forward jump information at each character: the
	// token of the jump and the shortest path value from the start of the
	// text segment to this character.
	jumpers := make([]jumper, len(text))

	if seg.dict == nil {
		return nil
	}

	tokens := make([]*Token, seg.dict.maxTokenLen)
	for current := 0; current < len(text); current++ {
		// Find the shortest path at the previous character, as the base
		// for the path values that follow.
		var baseDistance float32
		if current == 0 {
			// The base distance is zero at the start of the text.
			baseDistance = 0
		} else {
			baseDistance = jumpers[current-1].minDistance
		}

		// Find all tokens that start at the current character.
		numTokens := seg.dict.lookupTokens(
			text[current:minInt(current+seg.dict.maxTokenLen, len(text))], tokens)

		// For every candidate token, update the jump information at the
		// character where the token ends.
		for iToken := 0; iToken < numTokens; iToken++ {
			location := current + len(tokens[iToken].text) - 1
			if !searchMode || current != 0 || location != len(text)-1 {
				updateJumper(&jumpers[location], baseDistance, tokens[iToken])
			}
		}

		// Add a pseudo token when the current character has no matching token.
		if numTokens == 0 || len(tokens[0].text) > 1 {
			updateJumper(&jumpers[current], baseDistance,
				&Token{text: []Text{text[current]}, frequency: 1, distance: 32, pos: "x"})
		}
	}

	// Scan backwards a first time to count the tokens to add.
	numSeg := 0
	for index := len(text) - 1; index >= 0; {
		location := index - len(jumpers[index].token.text) + 1
		numSeg++
		index = location - 1
	}

	// Scan backwards a second time to add the tokens to the final result.
	outputSegments := make([]Segment, numSeg)
	for index := len(text) - 1; index >= 0; {
		location := index - len(jumpers[index].token.text) + 1
		numSeg--
		outputSegments[numSeg].token = jumpers[index].token
		index = location - 1
	}

	// Compute the byte position of each segment.
	bytePosition := 0
	for iSeg := 0; iSeg < len(outputSegments); iSeg++ {
		outputSegments[iSeg].start = bytePosition
		bytePosition += textSliceByteLen(outputSegments[iSeg].token.text)
		outputSegments[iSeg].end = bytePosition
	}

	return outputSegments
}

// updateJumper updates the jump information when:
//  1. the location has never been visited (jumper.minDistance is zero), or
//  2. the current shortest path at the location is greater than the new one,
//
// setting the shortest path value at the current location to baseDistance
// plus the distance of the new token.
func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
	newDistance := baseDistance + token.distance
	if jumper.minDistance == 0 || jumper.minDistance > newDistance {
		jumper.minDistance = newDistance
		jumper.token = token
	}
}

// minInt returns the smaller of two integers.
func minInt(a, b int) int {
	if a > b {
		return b
	}
	return a
}

// maxInt returns the larger of two integers.
func maxInt(a, b int) int {
	if a > b {
		return a
	}
	return b
}

// splitTextToWords splits a text into characters.
func splitTextToWords(text Text) []Text {
	output := make([]Text, 0, len(text)/3)
	current := 0
	inAlphanumeric := true
	alphanumericStart := 0
	for current < len(text) {
		r, size := utf8.DecodeRune(text[current:])
		if size <= 2 && (unicode.IsLetter(r) || unicode.IsNumber(r)) {
			// The current rune is a Latin letter or a digit (not CJK).
			if !inAlphanumeric {
				alphanumericStart = current
				inAlphanumeric = true
			}
		} else {
			if inAlphanumeric {
				inAlphanumeric = false
				if current != 0 {
					output = append(output, toLower(text[alphanumericStart:current]))
				}
			}

			output = append(output, text[current:current+size])
		}

		current += size
	}

	// Handle the case where the text ends with an alphanumeric run.
	if inAlphanumeric {
		if current != 0 {
			output = append(output, toLower(text[alphanumericStart:current]))
		}
	}

	return output
}
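
// For example (a sketch of the behaviour implemented above):
//
//	splitTextToWords([]byte("Hello世界"))
//	// → ["hello", "世", "界"]: the Latin run is grouped and lowercased,
//	// while each CJK character becomes its own element.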

// toLower converts ASCII letters to lower case.
func toLower(text []byte) []byte {
	output := make([]byte, len(text))
	for i, t := range text {
		if t >= 'A' && t <= 'Z' {
			output[i] = t - 'A' + 'a'
		} else {
			output[i] = t
		}
	}
	return output
}
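
// For example, toLower([]byte("Gse-NLP")) returns "gse-nlp"; only ASCII
// 'A' to 'Z' are mapped, non-ASCII bytes pass through unchanged.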