segmenter.go

// Copyright 2013 Hui Chen
// Copyright 2016 ego authors
//
// Licensed under the Apache License, Version 2.0 (the "License"): you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.

/*
Package gse is Go efficient text segmentation (Chinese word segmentation in Go).
*/
package gse

import (
    "bufio"
    "fmt"
    "io"
    "log"
    "math"
    "os"
    "path"
    "runtime"
    "strconv"
    "strings"
    "unicode"
    "unicode/utf8"
)

const (
    version           string = "v0.10.0.106, Danube River!"
    minTokenFrequency        = 2 // only load tokens with a frequency >= this value from the dictionary file
)

// GetVersion gets the gse version.
func GetVersion() string {
    return version
}

// Segmenter is the word segmenter struct.
type Segmenter struct {
    dict *Dictionary
}

// jumper records the forward segmentation jump information at a character
// in the Viterbi algorithm: the token of the jump and the shortest path
// distance from the start of the text segment to this character.
type jumper struct {
    minDistance float32
    token       *Token
}

// Dictionary returns the dictionary used by the segmenter.
func (seg *Segmenter) Dictionary() *Dictionary {
    return seg.dict
}

// getCurrentFilePath gets the current file path.
func getCurrentFilePath() string {
    _, filePath, _, _ := runtime.Caller(1)
    return filePath
}

// Read reads the dictionary file.
func (seg *Segmenter) Read(file string) error {
    log.Printf("Load the gse dictionary: \"%s\" ", file)
    dictFile, err := os.Open(file)
    if err != nil {
        log.Printf("Could not load dictionaries: \"%s\", %v \n", file, err)
        return err
    }
    defer dictFile.Close()

    reader := bufio.NewReader(dictFile)
    var (
        text      string
        freqText  string
        frequency int
        pos       string
    )

    // read the tokens line by line
    line := 0
    for {
        line++
        size, fsErr := fmt.Fscanln(reader, &text, &freqText, &pos)
        if fsErr != nil {
            if fsErr == io.EOF {
                // end of file
                break
            }

            if size > 0 {
                log.Printf("File '%v' line \"%v\" read error: %v, skip",
                    file, line, fsErr.Error())
            } else {
                log.Printf("File '%v' line \"%v\" is empty, read error: %v, skip",
                    file, line, fsErr.Error())
            }
        }

        if size == 0 {
            // end of file or bad line
            continue
        } else if size < 2 {
            // invalid line
            continue
        } else if size == 2 {
            // no part-of-speech tag; use the empty string
            pos = ""
        }

        // parse the token frequency
        var err error
        frequency, err = strconv.Atoi(freqText)
        if err != nil {
            continue
        }

        // filter out tokens whose frequency is too low
        if frequency < minTokenFrequency {
            continue
        }

        // single-rune tokens: lower the frequency instead of filtering them out
        if len([]rune(text)) < 2 {
            frequency = 2
        }

        // add the token to the dictionary
        words := splitTextToWords([]byte(text))
        token := Token{text: words, frequency: frequency, pos: pos}
        seg.dict.addToken(token)
    }

    return nil
}
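
// An illustrative sketch of the line format Read expects (the tokens and
// counts below are made-up examples, not entries from a real dictionary file):
//
//	你好 1234 n
//	世界 987
//
// The part-of-speech column is optional; lines whose frequency is below
// minTokenFrequency are skipped, and single-rune tokens have their
// frequency reset to 2.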

// DictPaths gets the dictionary file paths.
func DictPaths(dictDir, filePath string) (files []string) {
    var dictPath string

    if filePath == "en" {
        return
    }

    if filePath == "zh" {
        dictPath = path.Join(dictDir, "dict/dictionary.txt")
        files = []string{dictPath}
        return
    }

    if filePath == "jp" {
        dictPath = path.Join(dictDir, "dict/jp/dict.txt")
        files = []string{dictPath}
        return
    }

    fileName := strings.Split(filePath, ",")
    for i := 0; i < len(fileName); i++ {
        if fileName[i] == "jp" {
            dictPath = path.Join(dictDir, "dict/jp/dict.txt")
        }

        if fileName[i] == "zh" {
            dictPath = path.Join(dictDir, "dict/dictionary.txt")
        }

        // a name that is not a built-in language code is used as a file path
        dictName := fileName[i] != "en" && fileName[i] != "zh" &&
            fileName[i] != "jp" && fileName[i] != "ti"
        if dictName {
            dictPath = fileName[i]
        }

        if dictPath != "" {
            files = append(files, dictPath)
        }
    }

    log.Println("Dict files path: ", files)

    return
}
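
// For illustration, a hypothetical call and its result (the actual paths
// depend on where the package's data directory lives):
//
//	files := DictPaths("/go/src/gse/data", "zh,jp,my_dict.txt")
//	// files == ["/go/src/gse/data/dict/dictionary.txt",
//	//           "/go/src/gse/data/dict/jp/dict.txt",
//	//           "my_dict.txt"]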

// IsJp reports whether segText contains a Japanese (Hiragana or Katakana) character.
func IsJp(segText string) bool {
    for _, r := range segText {
        jp := unicode.Is(unicode.Scripts["Hiragana"], r) ||
            unicode.Is(unicode.Scripts["Katakana"], r)
        if jp {
            return true
        }
    }

    return false
}
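
// Illustrative checks:
//
//	IsJp("こんにちは") // true: Hiragana
//	IsJp("カタカナ")   // true: Katakana
//	IsJp("中文 abc")   // false: no kana characters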

// SegToken computes each token's path distance and pre-segments every
// token for search-engine mode.
func (seg *Segmenter) SegToken() {
    // calculate the path distance of every token;
    // see the Token struct comments for the meaning of the distance
    logTotalFrequency := float32(math.Log2(float64(seg.dict.totalFrequency)))
    for i := range seg.dict.tokens {
        token := &seg.dict.tokens[i]
        token.distance = logTotalFrequency - float32(math.Log2(float64(token.frequency)))
    }

    // sub-segment every token, for search-engine mode;
    // see the Token struct comments for how that mode is used.
    for i := range seg.dict.tokens {
        token := &seg.dict.tokens[i]
        segments := seg.segmentWords(token.text, true)

        // count the sub-tokens that need to be added
        numTokensToAdd := 0
        for iToken := 0; iToken < len(segments); iToken++ {
            // if len(segments[iToken].token.text) > 1 {
            // skip single-character tokens
            // TODO: this deserves further thought, especially when the
            // dictionary contains English compound words
            if len(segments[iToken].token.text) > 0 {
                hasJp := false
                if len(segments[iToken].token.text) == 1 {
                    segText := string(segments[iToken].token.text[0])
                    hasJp = IsJp(segText)
                }

                if !hasJp {
                    numTokensToAdd++
                }
            }
        }
        token.segments = make([]*Segment, numTokensToAdd)

        // add the sub-tokens
        iSegmentsToAdd := 0
        for iToken := 0; iToken < len(segments); iToken++ {
            // if len(segments[iToken].token.text) > 1 {
            if len(segments[iToken].token.text) > 0 {
                hasJp := false
                if len(segments[iToken].token.text) == 1 {
                    segText := string(segments[iToken].token.text[0])
                    hasJp = IsJp(segText)
                }

                if !hasJp {
                    token.segments[iSegmentsToAdd] = &segments[iToken]
                    iSegmentsToAdd++
                }
            }
        }
    }
}
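
// The distance assigned above is log2(totalFrequency) - log2(frequency) =
// log2(totalFrequency/frequency), so frequent tokens get short distances
// and the shortest-path search in segmentWords prefers them. A made-up
// worked example:
//
//	totalFrequency = 1 << 20, frequency = 1 << 10
//	distance = 20 - 10 = 10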

// LoadDict loads the dictionary from the file(s).
//
// The format of the dictionary is one token per line:
//	token text, frequency, part of speech
//
// Multiple dictionary files can be loaded, with the file names separated
// by ",", e.g. "user_dictionary.txt,common_dictionary.txt".
// The dictionaries listed first load their tokens with priority: when a
// token appears both in the user dictionary and in the common dictionary,
// the user dictionary is given priority.
func (seg *Segmenter) LoadDict(files ...string) error {
    seg.dict = NewDict()

    var (
        dictDir  = path.Join(path.Dir(getCurrentFilePath()), "data")
        dictPath string
    )

    if len(files) > 0 {
        dictFiles := DictPaths(dictDir, files[0])
        if len(dictFiles) > 0 {
            for i := 0; i < len(dictFiles); i++ {
                err := seg.Read(dictFiles[i])
                if err != nil {
                    return err
                }
            }
        }
    }

    if len(files) == 0 {
        dictPath = path.Join(dictDir, "dict/dictionary.txt")
        err := seg.Read(dictPath)
        if err != nil {
            return err
        }
    }

    seg.SegToken()
    log.Println("Gse dictionary loaded.")

    return nil
}
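
// A minimal usage sketch ("my_dict.txt" is a hypothetical custom
// dictionary; error handling elided for brevity):
//
//	var seg Segmenter
//	seg.LoadDict()                   // default: data/dict/dictionary.txt
//	// or, custom dictionary first, then the bundled Chinese dictionary:
//	// seg.LoadDict("my_dict.txt,zh")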

// Segment segments the text.
//
// Input:
//	bytes	UTF-8 text as a byte slice
//
// Output:
//	[]Segment	the resulting segments
func (seg *Segmenter) Segment(bytes []byte) []Segment {
    return seg.internalSegment(bytes, false)
}

// ModeSegment segments using search mode if searchMode is true.
func (seg *Segmenter) ModeSegment(bytes []byte, searchMode ...bool) []Segment {
    var mode bool
    if len(searchMode) > 0 {
        mode = searchMode[0]
    }

    return seg.internalSegment(bytes, mode)
}

// Slice segments with ModeSegment and returns a []string,
// using search mode if searchMode is true.
func (seg *Segmenter) Slice(bytes []byte, searchMode ...bool) []string {
    segs := seg.ModeSegment(bytes, searchMode...)
    return ToSlice(segs, searchMode...)
}

// String segments with ModeSegment and returns a string,
// using search mode if searchMode is true.
func (seg *Segmenter) String(bytes []byte, searchMode ...bool) string {
    segs := seg.ModeSegment(bytes, searchMode...)
    return ToString(segs, searchMode...)
}
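
// A short sketch of the segmentation entry points (the exact split depends
// on the loaded dictionary, so the outputs here are only indicative):
//
//	text := []byte("你好世界")
//	segs := seg.Segment(text)      // []Segment with byte start/end positions
//	words := seg.Slice(text, true) // search mode, as a []string
//	str := seg.String(text)        // normal mode, as a formatted string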

func (seg *Segmenter) internalSegment(bytes []byte, searchMode bool) []Segment {
    // handle the special case of empty input
    if len(bytes) == 0 {
        return nil
    }

    // split the text into characters
    text := splitTextToWords(bytes)
    return seg.segmentWords(text, searchMode)
}

func (seg *Segmenter) segmentWords(text []Text, searchMode bool) []Segment {
    // in search mode a single-character token cannot be segmented further
    if searchMode && len(text) == 1 {
        return nil
    }

    // jumpers defines the forward jump information at each character:
    // the token of the jump and the shortest path distance from the
    // start of the text segment to this character
    jumpers := make([]jumper, len(text))

    if seg.dict == nil {
        return nil
    }

    tokens := make([]*Token, seg.dict.maxTokenLen)
    for current := 0; current < len(text); current++ {
        // find the shortest path at the previous character,
        // used to compute the subsequent path distances
        var baseDistance float32
        if current == 0 {
            // the base distance is zero at the start of the text
            baseDistance = 0
        } else {
            baseDistance = jumpers[current-1].minDistance
        }

        // look up all tokens that start at the current character
        numTokens := seg.dict.lookupTokens(
            text[current:minInt(current+seg.dict.maxTokenLen, len(text))], tokens)

        // for every candidate token, update the jump information
        // at the character where the token ends
        for iToken := 0; iToken < numTokens; iToken++ {
            location := current + len(tokens[iToken].text) - 1
            if !searchMode || current != 0 || location != len(text)-1 {
                updateJumper(&jumpers[location], baseDistance, tokens[iToken])
            }
        }

        // when no token matches the current character, add a pseudo token
        if numTokens == 0 || len(tokens[0].text) > 1 {
            updateJumper(&jumpers[current], baseDistance,
                &Token{text: []Text{text[current]}, frequency: 1, distance: 32, pos: "x"})
        }
    }

    // first backward scan: count the number of segments to output
    numSeg := 0
    for index := len(text) - 1; index >= 0; {
        location := index - len(jumpers[index].token.text) + 1
        numSeg++
        index = location - 1
    }

    // second backward scan: add the segments to the final result
    outputSegments := make([]Segment, numSeg)
    for index := len(text) - 1; index >= 0; {
        location := index - len(jumpers[index].token.text) + 1
        numSeg--
        outputSegments[numSeg].token = jumpers[index].token
        index = location - 1
    }

    // compute the byte position of each segment
    bytePosition := 0
    for iSeg := 0; iSeg < len(outputSegments); iSeg++ {
        outputSegments[iSeg].start = bytePosition
        bytePosition += textSliceByteLen(outputSegments[iSeg].token.text)
        outputSegments[iSeg].end = bytePosition
    }

    return outputSegments
}

// updateJumper updates the jump information when:
//  1. the position has never been visited (jumper.minDistance is zero), or
//  2. the current shortest path at the position is longer than the new one.
// In both cases the shortest path at the position becomes baseDistance
// plus the distance of the new token.
func updateJumper(jumper *jumper, baseDistance float32, token *Token) {
    newDistance := baseDistance + token.distance
    if jumper.minDistance == 0 || jumper.minDistance > newDistance {
        jumper.minDistance = newDistance
        jumper.token = token
    }
}
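
// Note that minDistance == 0 doubles as the "never visited" marker; real
// paths have positive distances since token distances are positive. A
// made-up update sequence for a single position:
//
//	var j jumper
//	updateJumper(&j, 0, &Token{distance: 12}) // unvisited: minDistance 0 -> 12
//	updateJumper(&j, 0, &Token{distance: 9})  // shorter:   minDistance 12 -> 9
//	updateJumper(&j, 0, &Token{distance: 15}) // longer:    unchanged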

// minInt returns the smaller of two ints.
func minInt(a, b int) int {
    if a > b {
        return b
    }

    return a
}

// maxInt returns the larger of two ints.
func maxInt(a, b int) int {
    if a > b {
        return a
    }

    return b
}

// splitTextToWords splits the text into characters (words).
func splitTextToWords(text Text) []Text {
    output := make([]Text, 0, len(text)/3)
    current := 0
    inAlphanumeric := true
    alphanumericStart := 0
    for current < len(text) {
        r, size := utf8.DecodeRune(text[current:])
        if size <= 2 && (unicode.IsLetter(r) || unicode.IsNumber(r)) {
            // the current rune is a Latin letter or a digit (not CJK)
            if !inAlphanumeric {
                alphanumericStart = current
                inAlphanumeric = true
            }
        } else {
            if inAlphanumeric {
                inAlphanumeric = false
                if current != 0 {
                    output = append(output, toLower(text[alphanumericStart:current]))
                }
            }

            output = append(output, text[current:current+size])
        }

        current += size
    }

    // handle the case where the text ends with an alphanumeric run
    if inAlphanumeric {
        if current != 0 {
            output = append(output, toLower(text[alphanumericStart:current]))
        }
    }

    return output
}
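
// An illustrative input/output pair: consecutive alphanumeric bytes are
// grouped into one lower-cased word, while every other rune becomes its
// own character:
//
//	splitTextToWords([]byte("Hello世界"))
//	// -> ["hello", "世", "界"]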

// toLower converts an ASCII (English) word to lower case.
func toLower(text []byte) []byte {
    output := make([]byte, len(text))
    for i, t := range text {
        if t >= 'A' && t <= 'Z' {
            output[i] = t - 'A' + 'a'
        } else {
            output[i] = t
        }
    }

    return output
}