// Copyright 2013 Hui Chen
// Copyright 2016 ego authors
//
// Licensed under the Apache License, Version 2.0 (the "License"): you may
// not use this file except in compliance with the License. You may obtain
// a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
// License for the specific language governing permissions and limitations
// under the License.

package riot

import (
	"strings"

	"github.com/go-ego/gpy"
	"github.com/go-ego/riot/types"
)

// TMap defines the tokens map type map[string][]int
type TMap map[string][]int

type segmenterReq struct {
	docId       uint64
	hash        uint32
	data        types.DocData
	forceUpdate bool
}

// ForSplitData builds a token map from pre-split segment data and
// returns the map together with the number of tokens collected.
func (engine *Engine) ForSplitData(strData []string, num int) (TMap, int) {
	var (
		numTokens int
		splitStr  string
	)

	tokensMap := make(map[string][]int)
	for i := 0; i < num; i++ {
		if strData[i] != "" {
			if !engine.stopTokens.IsStopToken(strData[i]) {
				numTokens++
				tokensMap[strData[i]] = append(tokensMap[strData[i]], numTokens)
			}

			splitStr += strData[i]
			if !engine.stopTokens.IsStopToken(splitStr) {
				numTokens++
				tokensMap[splitStr] = append(tokensMap[splitStr], numTokens)
			}

			if engine.initOptions.Using == 6 {
				// more combinations: also index every suffix
				// concatenation that starts after position i
				var splitsStr string
				for s := i + 1; s < len(strData); s++ {
					splitsStr += strData[s]
					if !engine.stopTokens.IsStopToken(splitsStr) {
						numTokens++
						tokensMap[splitsStr] = append(tokensMap[splitsStr], numTokens)
					}
				}
			}
		}
	}

	return tokensMap, numTokens
}
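
// As an illustration (a hedged sketch; the exact result depends on the
// configured stop tokens, and this call is illustrative, not taken from a test):
//
//	tm, n := engine.ForSplitData([]string{"a", "b"}, 2)
//	// With Using != 6 and no stop tokens, tm is
//	// {"a": [1, 2], "b": [3], "ab": [4]} and n == 4: the first element
//	// is counted twice, once as a token and once as the one-element
//	// cumulative prefix.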

// splitData tokenizes the request content, merging segmenter output
// (when enabled) with split-based tokens, and returns the token map
// together with the token count.
func (engine *Engine) splitData(request segmenterReq) (TMap, int) {
	var (
		num       int
		numTokens int
	)
	tokensMap := make(map[string][]int)

	if request.data.Content != "" {
		content := strings.ToLower(request.data.Content)

		if engine.initOptions.Using == 3 {
			// use the segmenter
			segments := engine.segmenter.ModeSegment([]byte(content),
				engine.initOptions.GseMode)

			for _, segment := range segments {
				token := segment.Token().Text()
				if !engine.stopTokens.IsStopToken(token) {
					tokensMap[token] = append(tokensMap[token], segment.Start())
				}
			}

			numTokens += len(segments)
		}

		if engine.initOptions.Using == 4 {
			tokensMap, numTokens = engine.defaultTokens(content)
		}

		if engine.initOptions.Using != 4 {
			strData := strings.Split(content, "")
			num = len(strData)

			tokenMap, numToken := engine.ForSplitData(strData, num)
			numTokens += numToken

			// merge into the result map, overwriting duplicate keys
			for key, val := range tokenMap {
				tokensMap[key] = val
			}
		}
	}

	for _, t := range request.data.Tokens {
		if !engine.stopTokens.IsStopToken(t.Text) {
			tokensMap[t.Text] = t.Locations
		}
	}
	numTokens += len(request.data.Tokens)

	return tokensMap, numTokens
}

// segmenterData tokenizes the request according to the configured
// Using mode and returns the token map and token count.
func (engine *Engine) segmenterData(request segmenterReq) (TMap, int) {
	tokensMap := make(map[string][]int)
	numTokens := 0

	if engine.initOptions.Using == 0 && request.data.Content != "" {
		// Segment the content: when the document body is not empty,
		// derive the keywords from the content segmentation first.
		segments := engine.segmenter.ModeSegment([]byte(request.data.Content),
			engine.initOptions.GseMode)

		for _, segment := range segments {
			token := segment.Token().Text()
			if !engine.stopTokens.IsStopToken(token) {
				tokensMap[token] = append(tokensMap[token], segment.Start())
			}
		}

		for _, t := range request.data.Tokens {
			if !engine.stopTokens.IsStopToken(t.Text) {
				tokensMap[t.Text] = t.Locations
			}
		}

		numTokens = len(segments) + len(request.data.Tokens)
		return tokensMap, numTokens
	}

	if engine.initOptions.Using == 1 && request.data.Content != "" {
		// Segment the content: when the document body is not empty,
		// derive the keywords from the content segmentation first.
		segments := engine.segmenter.ModeSegment([]byte(request.data.Content),
			engine.initOptions.GseMode)

		for _, segment := range segments {
			token := segment.Token().Text()
			if !engine.stopTokens.IsStopToken(token) {
				tokensMap[token] = append(tokensMap[token], segment.Start())
			}
		}

		numTokens = len(segments)
		return tokensMap, numTokens
	}

	if engine.initOptions.Using == 2 ||
		((engine.initOptions.Using == 1 || engine.initOptions.Using == 3) &&
			request.data.Content == "") {
		// no content to segment: rely on the request-supplied tokens
		for _, t := range request.data.Tokens {
			if !engine.stopTokens.IsStopToken(t.Text) {
				tokensMap[t.Text] = t.Locations
			}
		}

		numTokens = len(request.data.Tokens)
		return tokensMap, numTokens
	}

	return engine.splitData(request)
}
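
// For reference, the mode dispatch inferred from the branches above (an
// informal summary, not an official table): Using 0 and 1 segment the
// content with gse; Using 2 relies solely on the request-supplied tokens;
// Using 3 combines gse segmentation with the character split in splitData;
// Using 4 splits on spaces via defaultTokens; Using 6 additionally indexes
// suffix combinations in ForSplitData.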

// defaultTokens splits the content on spaces and builds the token map
// without using the segmenter.
func (engine *Engine) defaultTokens(content string) (tokensMap TMap, numTokens int) {
	tokensMap = make(map[string][]int)
	strData := strings.Split(content, " ")
	num := len(strData)

	if num > 0 {
		tokenMap, numToken := engine.ForSplitData(strData, num)
		numTokens += numToken

		for key, val := range tokenMap {
			tokensMap[key] = val
		}
	}

	return
}
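
// For example (a hedged sketch, assuming no stop tokens are configured):
//
//	tm, n := engine.defaultTokens("hello world")
//	// tm has the keys "hello", "world" and "helloworld" (the cumulative
//	// prefix built by ForSplitData); n == 4, because "hello" is counted
//	// both as a word and as the one-word prefix.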

// segmenterWorker pulls requests off segmenterChan, tokenizes them and
// dispatches indexing and ranking requests to the matching shard.
func (engine *Engine) segmenterWorker() {
	for {
		request := <-engine.segmenterChan
		if request.docId == 0 {
			if request.forceUpdate {
				for i := 0; i < engine.initOptions.NumShards; i++ {
					engine.indexerAddDocChans[i] <- indexerAddDocReq{
						forceUpdate: true}
				}
			}
			continue
		}

		shard := engine.getShard(request.hash)
		tokensMap := make(map[string][]int)
		numTokens := 0

		if !(engine.initOptions.NotUseGse && engine.initOptions.Using == 0) {
			tokensMap, numTokens = engine.segmenterData(request)
		} else {
			if request.data.Content != "" {
				content := strings.ToLower(request.data.Content)
				tokensMap, numTokens = engine.defaultTokens(content)
			}

			for _, t := range request.data.Tokens {
				if !engine.stopTokens.IsStopToken(t.Text) {
					tokensMap[t.Text] = t.Locations
				}
			}
			numTokens += len(request.data.Tokens)
		}

		// add document labels that do not come from segmentation
		for _, label := range request.data.Labels {
			if !engine.initOptions.NotUseGse &&
				engine.stopTokens.IsStopToken(label) {
				continue
			}

			// If the keyword already appears in the body, overwriting it
			// here would lose its location information.
			if _, ok := tokensMap[label]; !ok {
				tokensMap[label] = []int{}
			}
		}

		indexerRequest := indexerAddDocReq{
			doc: &types.DocIndex{
				DocId:    request.docId,
				TokenLen: float32(numTokens),
				Keywords: make([]types.KeywordIndex, len(tokensMap)),
			},
			forceUpdate: request.forceUpdate,
		}

		iTokens := 0
		for k, v := range tokensMap {
			indexerRequest.doc.Keywords[iTokens] = types.KeywordIndex{
				Text: k,
				// Labels added without locations have frequency 0 and
				// therefore do not affect the tf-idf computation.
				Frequency: float32(len(v)),
				Starts:    v}
			iTokens++
		}

		engine.indexerAddDocChans[shard] <- indexerRequest

		if request.forceUpdate {
			for i := 0; i < engine.initOptions.NumShards; i++ {
				if i == shard {
					continue
				}
				engine.indexerAddDocChans[i] <- indexerAddDocReq{forceUpdate: true}
			}
		}

		rankerRequest := rankerAddDocReq{
			docId: request.docId, fields: request.data.Fields,
			content: request.data.Content, attri: request.data.Attri}
		engine.rankerAddDocChans[shard] <- rankerRequest
	}
}
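
// A hedged sketch of how a request reaches this worker (the field values
// are illustrative; in riot these requests are built by the engine's
// indexing path, not sent by hand):
//
//	engine.segmenterChan <- segmenterReq{
//		docId:       1,
//		hash:        42,
//		data:        types.DocData{Content: "some text"},
//		forceUpdate: false,
//	}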

// PinYin returns pinyin variants of the Chinese text hans: the single
// characters, cumulative prefixes, segmented words (when gse is enabled),
// cumulative full pinyin and the first-letter abbreviations.
func (engine *Engine) PinYin(hans string) []string {
	var (
		str      string
		pyStr    string
		strArr   []string
		splitStr string
	)

	splitHans := strings.Split(hans, "")
	for i := 0; i < len(splitHans); i++ {
		if splitHans[i] != "" {
			if !engine.stopTokens.IsStopToken(splitHans[i]) {
				strArr = append(strArr, splitHans[i])
			}
			splitStr += splitHans[i]
		}

		if !engine.stopTokens.IsStopToken(splitStr) {
			strArr = append(strArr, splitStr)
		}
	}

	// segment the text when gse is enabled
	if !engine.initOptions.NotUseGse {
		sehans := engine.Segment(hans)
		for h := 0; h < len(sehans); h++ {
			if !engine.stopTokens.IsStopToken(sehans[h]) {
				strArr = append(strArr, sehans[h])
			}
		}
	}

	// convert to pinyin, collecting the cumulative full-pinyin string
	// and the first-letter abbreviation
	py := gpy.LazyConvert(hans, nil)
	for i := 0; i < len(py); i++ {
		pyStr += py[i]
		if !engine.stopTokens.IsStopToken(pyStr) {
			strArr = append(strArr, pyStr)
		}

		if len(py[i]) > 0 {
			str += py[i][0:1]
			if !engine.stopTokens.IsStopToken(str) {
				strArr = append(strArr, str)
			}
		}
	}

	return strArr
}
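
// For example (a hedged sketch; the exact slice depends on the loaded
// dictionary and the configured stop tokens):
//
//	engine.PinYin("中国")
//	// roughly: the characters and prefixes "中", "中国", "国", the
//	// segmented word "中国", the pinyin "zhong", "zhongguo" and the
//	// abbreviations "z", "zg"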
|