// Copyright (c) 2014 Couchbase, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package analysis import ( "reflect" "github.com/blevesearch/bleve/size" ) var reflectStaticSizeTokenLocation int var reflectStaticSizeTokenFreq int func init() { var tl TokenLocation reflectStaticSizeTokenLocation = int(reflect.TypeOf(tl).Size()) var tf TokenFreq reflectStaticSizeTokenFreq = int(reflect.TypeOf(tf).Size()) } // TokenLocation represents one occurrence of a term at a particular location in // a field. Start, End and Position have the same meaning as in analysis.Token. // Field and ArrayPositions identify the field value in the source document. // See document.Field for details. type TokenLocation struct { Field string ArrayPositions []uint64 Start int End int Position int } func (tl *TokenLocation) Size() int { rv := reflectStaticSizeTokenLocation rv += len(tl.ArrayPositions) * size.SizeOfUint64 return rv } // TokenFreq represents all the occurrences of a term in all fields of a // document. type TokenFreq struct { Term []byte Locations []*TokenLocation frequency int } func (tf *TokenFreq) Size() int { rv := reflectStaticSizeTokenFreq rv += len(tf.Term) for _, loc := range tf.Locations { rv += loc.Size() } return rv } func (tf *TokenFreq) Frequency() int { return tf.frequency } // TokenFrequencies maps document terms to their combined frequencies from all // fields. type TokenFrequencies map[string]*TokenFreq func (tfs TokenFrequencies) Size() int { rv := size.SizeOfMap rv += len(tfs) * (size.SizeOfString + size.SizeOfPtr) for k, v := range tfs { rv += len(k) rv += v.Size() } return rv } func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) { // walk the new token frequencies for tfk, tf := range other { // set the remoteField value in incoming token freqs for _, l := range tf.Locations { l.Field = remoteField } existingTf, exists := tfs[tfk] if exists { existingTf.Locations = append(existingTf.Locations, tf.Locations...) existingTf.frequency = existingTf.frequency + tf.frequency } else { tfs[tfk] = &TokenFreq{ Term: tf.Term, frequency: tf.frequency, Locations: make([]*TokenLocation, len(tf.Locations)), } copy(tfs[tfk].Locations, tf.Locations) } } } func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVectors bool) TokenFrequencies { rv := make(map[string]*TokenFreq, len(tokens)) if includeTermVectors { tls := make([]TokenLocation, len(tokens)) tlNext := 0 for _, token := range tokens { tls[tlNext] = TokenLocation{ ArrayPositions: arrayPositions, Start: token.Start, End: token.End, Position: token.Position, } curr, ok := rv[string(token.Term)] if ok { curr.Locations = append(curr.Locations, &tls[tlNext]) curr.frequency++ } else { rv[string(token.Term)] = &TokenFreq{ Term: token.Term, Locations: []*TokenLocation{&tls[tlNext]}, frequency: 1, } } tlNext++ } } else { for _, token := range tokens { curr, exists := rv[string(token.Term)] if exists { curr.frequency++ } else { rv[string(token.Term)] = &TokenFreq{ Term: token.Term, frequency: 1, } } } } return rv }