// Copyright 2013 Hui Chen
// Copyright 2016 ego authors
//
// Licensed under the Apache License, Version 2.0 (the "License"): you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.

/*
Package gse is Go efficient text segmentation (Chinese word segmentation in Go).
*/
package gse

import (
	"bufio"
	"fmt"
	"io"
	"log"
	"math"
	"os"
	"path"
	"runtime"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"
)

const (
	version string = "v0.10.0.106, Danube River!"

	// minTokenFrequency is the minimum frequency a token in the
	// dictionary file must have in order to be loaded.
	minTokenFrequency = 2
)

// GetVersion gets the gse version.
func GetVersion() string {
	return version
}

// Segmenter is the segmenter structure.
type Segmenter struct {
	dict *Dictionary
}

// jumper records the forward jump information at a character
// during the Viterbi algorithm.
type jumper struct {
	minDistance float32
	token       *Token
}

// Dictionary returns the dictionary used by the segmenter.
func (seg *Segmenter) Dictionary() *Dictionary {
	return seg.dict
}

// getCurrentFilePath gets the current file path.
func getCurrentFilePath() string {
	_, filePath, _, _ := runtime.Caller(1)
	return filePath
}

// Read reads the dictionary file.
func (seg *Segmenter) Read(file string) error {
	log.Printf("Load the gse dictionary: \"%s\" ", file)
	dictFile, err := os.Open(file)
	if err != nil {
		log.Printf("Could not load dictionaries: \"%s\", %v \n", file, err)
		return err
	}
	defer dictFile.Close()

	reader := bufio.NewReader(dictFile)
	var (
		text      string
		freqText  string
		frequency int
		pos       string
	)

	// Read the tokens line by line.
	line := 0
	for {
		line++
		size, fsErr := fmt.Fscanln(reader, &text, &freqText, &pos)
		if fsErr != nil {
			if fsErr == io.EOF {
				// End of file
				break
			}

			if size > 0 {
				log.Printf("File '%v' line \"%v\" read error: %v, skip",
					file, line, fsErr.Error())
			} else {
				log.Printf("File '%v' line \"%v\" is empty, read error: %v, skip",
					file, line, fsErr.Error())
			}
		}

		if size == 0 {
			// End of file or an invalid line
			continue
		} else if size < 2 {
			// Invalid line
			continue
		} else if size == 2 {
			// Use an empty string when there is no part-of-speech tag.
			pos = ""
		}

		// Parse the frequency.
		var err error
		frequency, err = strconv.Atoi(freqText)
		if err != nil {
			continue
		}

		// Filter out tokens whose frequency is too small.
		if frequency < minTokenFrequency {
			continue
		}

		// Lower the frequency of single-rune tokens instead of dropping them.
		if len([]rune(text)) < 2 {
			frequency = 2
		}

		// Add the token to the dictionary.
		words := splitTextToWords([]byte(text))
		token := Token{text: words, frequency: frequency, pos: pos}
		seg.dict.addToken(token)
	}

	return nil
}

// DictPaths gets the dictionary file paths.
func DictPaths(dictDir, filePath string) (files []string) {
	var dictPath string

	if filePath == "en" {
		return
	}

	if filePath == "zh" {
		dictPath = path.Join(dictDir, "dict/dictionary.txt")
		files = []string{dictPath}
		return
	}

	if filePath == "jp" {
		dictPath = path.Join(dictDir, "dict/jp/dict.txt")
		files = []string{dictPath}
		return
	}

	fileName := strings.Split(filePath, ",")
	for i := 0; i < len(fileName); i++ {
		if fileName[i] == "jp" {
			dictPath = path.Join(dictDir, "dict/jp/dict.txt")
		}

		if fileName[i] == "zh" {
			dictPath = path.Join(dictDir, "dict/dictionary.txt")
		}

		dictName := fileName[i] != "en" && fileName[i] != "zh" &&
			fileName[i] != "jp" && fileName[i] != "ti"
		if dictName {
			dictPath = fileName[i]
		}

		if dictPath != "" {
			files = append(files, dictPath)
		}
	}

	log.Println("Dict files path: ", files)
	return
}
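
// A minimal sketch of the dictionary file format Read expects and of how
// DictPaths resolves its second argument. The token lines, frequencies and
// the file name my_dict.txt below are made-up examples for illustration:
//
//	// dictionary file, one token per line: text frequency [part-of-speech]
//	//
//	//	中国 4423 ns
//	//	人口 1037 n
//
//	paths := DictPaths("/path/to/data", "zh,my_dict.txt")
//	// paths == []string{"/path/to/data/dict/dictionary.txt", "my_dict.txt"}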
// IsJp returns true if the text contains a Japanese character
// (Hiragana or Katakana).
func IsJp(segText string) bool {
	for _, r := range segText {
		jp := unicode.Is(unicode.Scripts["Hiragana"], r) ||
			unicode.Is(unicode.Scripts["Katakana"], r)
		if jp {
			return true
		}
	}
	return false
}

// SegToken adds segmenter tokens: it computes the path value of every
// token and splits each token into sub-tokens for the search-engine mode.
func (seg *Segmenter) SegToken() {
	// Compute the path value of every token; see the comments on the
	// Token structure for the meaning of the path value.
	logTotalFrequency := float32(math.Log2(float64(seg.dict.totalFrequency)))
	for i := range seg.dict.tokens {
		token := &seg.dict.tokens[i]
		token.distance = logTotalFrequency - float32(math.Log2(float64(token.frequency)))
	}

	// Split every token into finer-grained sub-tokens for the search-engine
	// mode; see the comments on the Token structure for how this mode is used.
	for i := range seg.dict.tokens {
		token := &seg.dict.tokens[i]
		segments := seg.segmentWords(token.text, true)

		// Count the sub-tokens that need to be added.
		numTokensToAdd := 0
		for iToken := 0; iToken < len(segments); iToken++ {
			// TODO: this deserves further study, in particular when the
			// dictionary contains English compound words.
			if len(segments[iToken].token.text) > 0 {
				hasJp := false
				if len(segments[iToken].token.text) == 1 {
					segText := string(segments[iToken].token.text[0])
					hasJp = IsJp(segText)
				}

				if !hasJp {
					numTokensToAdd++
				}
			}
		}
		token.segments = make([]*Segment, numTokensToAdd)

		// Add the sub-tokens.
		iSegmentsToAdd := 0
		for iToken := 0; iToken < len(segments); iToken++ {
			if len(segments[iToken].token.text) > 0 {
				hasJp := false
				if len(segments[iToken].token.text) == 1 {
					segText := string(segments[iToken].token.text[0])
					hasJp = IsJp(segText)
				}

				if !hasJp {
					token.segments[iSegmentsToAdd] = &segments[iToken]
					iSegmentsToAdd++
				}
			}
		}
	}
}
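
// The path value computed above is the negative base-2 log probability of
// a token: distance = log2(totalFrequency) - log2(frequency). A minimal
// sketch of the same computation outside the segmenter (the frequencies
// here are made-up assumptions for illustration):
//
//	totalFrequency := float64(1 << 20) // assumed corpus total
//	frequency := float64(1 << 10)      // assumed token frequency
//	distance := math.Log2(totalFrequency) - math.Log2(frequency)
//	// distance == 10: rarer tokens get larger distances, so the
//	// shortest-path search in segmentWords prefers frequent tokens.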
// LoadDict loads the dictionary from the file.
//
// The format of the dictionary is (one token per line):
//
//	token text, frequency, part of speech
//
// Multiple dictionary files can be loaded, with the file names separated
// by ","; dictionaries listed first take priority when loading tokens,
// for example: "user_dictionary.txt,common_dictionary.txt".
// When a token appears both in the user dictionary and in the common
// dictionary, the user dictionary takes priority.
func (seg *Segmenter) LoadDict(files ...string) error {
	seg.dict = NewDict()

	var (
		dictDir  = path.Join(path.Dir(getCurrentFilePath()), "data")
		dictPath string
	)

	if len(files) > 0 {
		dictFiles := DictPaths(dictDir, files[0])
		if len(dictFiles) > 0 {
			for i := 0; i < len(dictFiles); i++ {
				err := seg.Read(dictFiles[i])
				if err != nil {
					return err
				}
			}
		}
	}

	if len(files) == 0 {
		dictPath = path.Join(dictDir, "dict/dictionary.txt")
		err := seg.Read(dictPath)
		if err != nil {
			return err
		}
	}

	seg.SegToken()
	log.Println("Gse dictionary loading finished.")

	return nil
}

// Segment segments the text.
//
// Input:
//
//	bytes	the byte array of a UTF-8 text
//
// Output:
//
//	[]Segment	the segmented tokens
func (seg *Segmenter) Segment(bytes []byte) []Segment {
	return seg.internalSegment(bytes, false)
}

// ModeSegment segments the text, using search mode if searchMode is true.
func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment {
	var mode bool
	if len(searchMode) > 0 {
		mode = searchMode[0]
	}

	return seg.internalSegment(bytes, mode)
}

// Slice segments with ModeSegment and returns a []string,
// using search mode if searchMode is true.
func (seg *Segmenter) Slice(bytes []byte, searchMode ...bool) []string {
	segs := seg.ModeSegment(bytes, searchMode...)
	return ToSlice(segs, searchMode...)
}

// String segments with ModeSegment and returns a string,
// using search mode if searchMode is true.
func (seg *Segmenter) String(bytes []byte, searchMode ...bool) string {
	segs := seg.ModeSegment(bytes, searchMode...)
	return ToString(segs, searchMode...)
}
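
// A minimal usage sketch of the API above; the sample sentence is
// illustrative only, and the default dictionary shipped with the package
// is assumed to be present:
//
//	var seg Segmenter
//	if err := seg.LoadDict(); err != nil { // default Chinese dictionary
//		log.Fatal(err)
//	}
//
//	text := []byte("中国人口")
//	fmt.Println(seg.String(text))      // normal mode
//	fmt.Println(seg.Slice(text, true)) // search mode, finer-grained tokens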
func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
	// Handle the special case.
	if len(bytes) == 0 {
		return nil
	}

	// Split the text into characters.
	text := splitTextToWords(bytes)

	return seg.segmentWords(text, searchMode)
}

func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
	// In search mode a single-character token cannot be split any further.
	if searchMode && len(text) == 1 {
		return nil
	}

	// jumpers defines the forward jump information at every character:
	// the token of the jump and the shortest path value from the
	// beginning of the text segment to this character.
	jumpers := make([]jumper, len(text))

	if seg.dict == nil {
		return nil
	}

	tokens := make([]*Token, seg.dict.maxTokenLen)
	for current := 0; current < len(text); current++ {
		// Find the shortest path at the previous character,
		// used to compute the subsequent path values.
		var baseDistance float32
		if current == 0 {
			// The base distance is zero at the beginning of the text.
			baseDistance = 0
		} else {
			baseDistance = jumpers[current-1].minDistance
		}

		// Look up all the tokens that start at the current character.
		numTokens := seg.dict.lookupTokens(
			text[current:minInt(current+seg.dict.maxTokenLen, len(text))],
			tokens)

		// For every possible token, update the jump information
		// at the character where the token ends.
		for iToken := 0; iToken < numTokens; iToken++ {
			location := current + len(tokens[iToken].text) - 1
			if !searchMode || current != 0 || location != len(text)-1 {
				updateJumper(&jumpers[location], baseDistance, tokens[iToken])
			}
		}

		// Add a pseudo token when no token corresponds to the current character.
		if numTokens == 0 || len(tokens[0].text) > 1 {
			updateJumper(&jumpers[current], baseDistance,
				&Token{text: []Text{text[current]}, frequency: 1, distance: 32, pos: "x"})
		}
	}

	// Scan backwards a first time to count the tokens to be added.
	numSeg := 0
	for index := len(text) - 1; index >= 0; {
		location := index - len(jumpers[index].token.text) + 1
		numSeg++
		index = location - 1
	}

	// Scan backwards a second time to add the tokens to the final result.
	outputSegments := make([]Segment, numSeg)
	for index := len(text) - 1; index >= 0; {
		location := index - len(jumpers[index].token.text) + 1
		numSeg--
		outputSegments[numSeg].token = jumpers[index].token
		index = location - 1
	}

	// Compute the byte position of every token.
	bytePosition := 0
	for iSeg := 0; iSeg < len(outputSegments); iSeg++ {
		outputSegments[iSeg].start = bytePosition
		bytePosition += textSliceByteLen(outputSegments[iSeg].token.text)
		outputSegments[iSeg].end = bytePosition
	}

	return outputSegments
}

// updateJumper updates the jump information when:
//  1. the position has never been visited (jumper.minDistance is zero), or
//  2. the current shortest path at the position is longer than the new one,
//
// setting the shortest path value at the current position to baseDistance
// plus the distance (log probability) of the new token.
func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
	newDistance := baseDistance + token.distance
	if jumper.minDistance == 0 || jumper.minDistance > newDistance {
		jumper.minDistance = newDistance
		jumper.token = token
	}
}

// minInt returns the smaller of two integers.
func minInt(a, b int) int {
	if a > b {
		return b
	}
	return a
}

// maxInt returns the larger of two integers.
func maxInt(a, b int) int {
	if a > b {
		return a
	}
	return b
}

// splitTextToWords splits the text into characters.
func splitTextToWords(text Text) []Text {
	output := make([]Text, 0, len(text)/3)
	current := 0
	inAlphanumeric := true
	alphanumericStart := 0
	for current < len(text) {
		r, size := utf8.DecodeRune(text[current:])
		if size <= 2 && (unicode.IsLetter(r) || unicode.IsNumber(r)) {
			// The current rune is a Latin letter or a digit
			// (not a CJK character).
			if !inAlphanumeric {
				alphanumericStart = current
				inAlphanumeric = true
			}
		} else {
			if inAlphanumeric {
				inAlphanumeric = false
				if current != 0 {
					output = append(output, toLower(text[alphanumericStart:current]))
				}
			}
			output = append(output, text[current:current+size])
		}
		current += size
	}

	// Handle the case where the last character is alphanumeric.
	if inAlphanumeric {
		if current != 0 {
			output = append(output, toLower(text[alphanumericStart:current]))
		}
	}

	return output
}

// toLower converts an English word to lower case.
func toLower(text []byte) []byte {
	output := make([]byte, len(text))
	for i, t := range text {
		if t >= 'A' && t <= 'Z' {
			output[i] = t - 'A' + 'a'
		} else {
			output[i] = t
		}
	}
	return output
}
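
// A minimal sketch of what splitTextToWords produces (the sample input is
// illustrative only): consecutive Latin letters and digits are grouped into
// a single lower-cased element, while each CJK character becomes its own
// element.
//
//	words := splitTextToWords([]byte("Gse分词"))
//	// words == []Text{[]byte("gse"), []byte("分"), []byte("词")}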