segment.go
// Copyright 2013 Hui Chen
// Copyright 2016 ego authors
//
// Licensed under the Apache License, Version 2.0 (the "License"): you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.

package riot

import (
	// "fmt"
	"strings"

	"github.com/go-ego/gpy"
	"github.com/go-ego/riot/types"
)

// TMap defines the tokens map type map[string][]int
type TMap map[string][]int

// segmenterReq is a segmentation request for one document, consumed by the
// segmenter worker.
type segmenterReq struct {
	docId uint64
	hash  uint32
	data  types.DocData
	// data types.DocumentIndexData
	forceUpdate bool
}

// ForSplitData builds the tokens map and token count for the split
// segment data (segspl).
func (engine *Engine) ForSplitData(strData []string, num int) (TMap, int) {
	var (
		numTokens int
		splitStr  string
	)

	tokensMap := make(map[string][]int)
	for i := 0; i < num; i++ {
		if strData[i] != "" {
			if !engine.stopTokens.IsStopToken(strData[i]) {
				numTokens++
				tokensMap[strData[i]] = append(tokensMap[strData[i]], numTokens)
			}

			splitStr += strData[i]
			if !engine.stopTokens.IsStopToken(splitStr) {
				numTokens++
				tokensMap[splitStr] = append(tokensMap[splitStr], numTokens)
			}

			if engine.initOptions.Using == 6 {
				// more combination
				var splitsStr string
				for s := i + 1; s < len(strData); s++ {
					splitsStr += strData[s]
					if !engine.stopTokens.IsStopToken(splitsStr) {
						numTokens++
						tokensMap[splitsStr] = append(tokensMap[splitsStr], numTokens)
					}
				}
			}
		}
	}

	return tokensMap, numTokens
}
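
// splitData tokenizes the request content according to the configured
// segmentation mode (initOptions.Using) and merges in any tokens supplied
// with the request.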
func (engine *Engine) splitData(request segmenterReq) (TMap, int) {
	var (
		num       int
		numTokens int
	)
	tokensMap := make(map[string][]int)

	if request.data.Content != "" {
		content := strings.ToLower(request.data.Content)

		if engine.initOptions.Using == 3 {
			// use segmenter
			segments := engine.segmenter.ModeSegment([]byte(content),
				engine.initOptions.GseMode)

			for _, segment := range segments {
				token := segment.Token().Text()
				if !engine.stopTokens.IsStopToken(token) {
					tokensMap[token] = append(tokensMap[token], segment.Start())
				}
			}

			numTokens += len(segments)
		}

		if engine.initOptions.Using == 4 {
			tokensMap, numTokens = engine.defaultTokens(content)
		}

		if engine.initOptions.Using != 4 {
			strData := strings.Split(content, "")
			num = len(strData)

			tokenMap, numToken := engine.ForSplitData(strData, num)
			numTokens += numToken
			for key, val := range tokenMap {
				tokensMap[key] = val
			}
		}
	}

	for _, t := range request.data.Tokens {
		if !engine.stopTokens.IsStopToken(t.Text) {
			tokensMap[t.Text] = t.Locations
		}
	}

	numTokens += len(request.data.Tokens)

	return tokensMap, numTokens
}
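
// segmenterData builds the tokens map and token count for an indexing
// request, choosing a strategy based on initOptions.Using and falling back
// to splitData when no dedicated branch applies.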
func (engine *Engine) segmenterData(request segmenterReq) (TMap, int) {
	tokensMap := make(map[string][]int)
	numTokens := 0

	if engine.initOptions.Using == 0 && request.data.Content != "" {
		// Segment the content: when the document body is not empty,
		// prefer keywords obtained by segmenting the content.
		segments := engine.segmenter.ModeSegment([]byte(request.data.Content),
			engine.initOptions.GseMode)

		for _, segment := range segments {
			token := segment.Token().Text()
			if !engine.stopTokens.IsStopToken(token) {
				tokensMap[token] = append(tokensMap[token], segment.Start())
			}
		}

		for _, t := range request.data.Tokens {
			if !engine.stopTokens.IsStopToken(t.Text) {
				tokensMap[t.Text] = t.Locations
			}
		}

		numTokens = len(segments) + len(request.data.Tokens)

		return tokensMap, numTokens
	}

	if engine.initOptions.Using == 1 && request.data.Content != "" {
		// Segment the content: when the document body is not empty,
		// prefer keywords obtained by segmenting the content.
		segments := engine.segmenter.ModeSegment([]byte(request.data.Content),
			engine.initOptions.GseMode)

		for _, segment := range segments {
			token := segment.Token().Text()
			if !engine.stopTokens.IsStopToken(token) {
				tokensMap[token] = append(tokensMap[token], segment.Start())
			}
		}

		numTokens = len(segments)

		return tokensMap, numTokens
	}

	if engine.initOptions.Using == 2 ||
		((engine.initOptions.Using == 1 || engine.initOptions.Using == 3) &&
			request.data.Content == "") {
		for _, t := range request.data.Tokens {
			if !engine.stopTokens.IsStopToken(t.Text) {
				tokensMap[t.Text] = t.Locations
			}
		}

		numTokens = len(request.data.Tokens)

		return tokensMap, numTokens
	}

	tokenMap, lenSplitData := engine.splitData(request)

	return tokenMap, lenSplitData
}
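
// defaultTokens splits the content on spaces and hands the pieces to
// ForSplitData; it is the fallback tokenizer used when gse segmentation
// is not applied.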
func (engine *Engine) defaultTokens(content string) (tokensMap TMap, numTokens int) {
	// split the content on spaces instead of using the segmenter
	tokensMap = make(map[string][]int)
	strData := strings.Split(content, " ")
	num := len(strData)

	// if num == 1 {
	// 	tokensMap[request.data.Content] = []int{1}
	// }

	if num > 0 {
		tokenMap, numToken := engine.ForSplitData(strData, num)

		numTokens += numToken
		for key, val := range tokenMap {
			tokensMap[key] = val
		}
	}

	return
}
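
// segmenterWorker consumes requests from segmenterChan, converts each
// document into keyword indexes, and forwards them to the indexer and
// ranker shards.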
func (engine *Engine) segmenterWorker() {
	for {
		request := <-engine.segmenterChan
		if request.docId == 0 {
			if request.forceUpdate {
				for i := 0; i < engine.initOptions.NumShards; i++ {
					engine.indexerAddDocChans[i] <- indexerAddDocReq{
						forceUpdate: true}
				}
			}
			continue
		}

		shard := engine.getShard(request.hash)
		tokensMap := make(map[string][]int)
		numTokens := 0

		if !(engine.initOptions.NotUseGse && engine.initOptions.Using == 0) {
			tokensMap, numTokens = engine.segmenterData(request)
		} else {
			if request.data.Content != "" {
				content := strings.ToLower(request.data.Content)
				tokensMap, numTokens = engine.defaultTokens(content)
			}

			for _, t := range request.data.Tokens {
				if !engine.stopTokens.IsStopToken(t.Text) {
					tokensMap[t.Text] = t.Locations
				}
			}

			numTokens += len(request.data.Tokens)
		}

		// Add document labels that do not come from segmentation.
		for _, label := range request.data.Labels {
			if !engine.initOptions.NotUseGse {
				if !engine.stopTokens.IsStopToken(label) {
					// If the keyword already exists in the body, skipping this
					// check would lose its location information.
					if _, ok := tokensMap[label]; !ok {
						tokensMap[label] = []int{}
					}
				}
			} else {
				// If the keyword already exists in the body, skipping this
				// check would lose its location information.
				if _, ok := tokensMap[label]; !ok {
					tokensMap[label] = []int{}
				}
			}
		}

		indexerRequest := indexerAddDocReq{
			doc: &types.DocIndex{
				DocId:    request.docId,
				TokenLen: float32(numTokens),
				Keywords: make([]types.KeywordIndex, len(tokensMap)),
			},
			forceUpdate: request.forceUpdate,
		}

		iTokens := 0
		for k, v := range tokensMap {
			indexerRequest.doc.Keywords[iTokens] = types.KeywordIndex{
				Text: k,
				// The frequency of non-segmented labels is set to 0 so they
				// do not take part in the tf-idf calculation.
				Frequency: float32(len(v)),
				Starts:    v}
			iTokens++
		}

		engine.indexerAddDocChans[shard] <- indexerRequest

		if request.forceUpdate {
			for i := 0; i < engine.initOptions.NumShards; i++ {
				if i == shard {
					continue
				}
				engine.indexerAddDocChans[i] <- indexerAddDocReq{forceUpdate: true}
			}
		}

		rankerRequest := rankerAddDocReq{
			// docId: request.docId, fields: request.data.Fields}
			docId: request.docId, fields: request.data.Fields,
			content: request.data.Content, attri: request.data.Attri}

		engine.rankerAddDocChans[shard] <- rankerRequest
	}
}

// PinYin returns the pinyin spellings and initial-letter abbreviations
// of the Chinese text.
func (engine *Engine) PinYin(hans string) []string {
	var (
		str    string
		pyStr  string
		strArr []string

		splitStr string
		// splitArr []string
	)

	//
	splitHans := strings.Split(hans, "")
	for i := 0; i < len(splitHans); i++ {
		if splitHans[i] != "" {
			if !engine.stopTokens.IsStopToken(splitHans[i]) {
				strArr = append(strArr, splitHans[i])
			}

			splitStr += splitHans[i]
		}

		if !engine.stopTokens.IsStopToken(splitStr) {
			strArr = append(strArr, splitStr)
		}
	}

	// Segment the text
	if !engine.initOptions.NotUseGse {
		sehans := engine.Segment(hans)
		for h := 0; h < len(sehans); h++ {
			if !engine.stopTokens.IsStopToken(sehans[h]) {
				strArr = append(strArr, sehans[h])
			}
		}
	}

	//
	// py := pinyin.LazyConvert(sehans[h], nil)
	py := gpy.LazyConvert(hans, nil)
	// log.Println("py...", py)

	for i := 0; i < len(py); i++ {
		// log.Println("py[i]...", py[i])
		pyStr += py[i]
		if !engine.stopTokens.IsStopToken(pyStr) {
			strArr = append(strArr, pyStr)
		}

		if len(py[i]) > 0 {
			str += py[i][0:1]
			if !engine.stopTokens.IsStopToken(str) {
				strArr = append(strArr, str)
			}
		}
	}

	return strArr
}
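
// A rough usage sketch (the variable "searcher" is hypothetical, assumed to
// be an initialized *Engine; the exact output depends on the stop-token list):
//
//	terms := searcher.PinYin("简单")
//	// terms collects the character prefixes ("简", "简单"), the accumulated
//	// pinyin ("jian", "jiandan") and the initial letters ("j", "jd").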