zardoz/vendor/github.com/lytics/multibayes/tokenize.go

package multibayes

import (
	"bytes"
	"encoding/base64"
	"regexp"
	"strings"

	"github.com/blevesearch/bleve/analysis"
	regexp_tokenizer "github.com/blevesearch/bleve/analysis/tokenizer/regexp"
	"github.com/blevesearch/go-porterstemmer"
)

const (
	tokenSeparator = "_"
)
type ngram struct {
	Tokens [][]byte
}

// String joins the raw token bytes with tokenSeparator. Note that the
// base64 encoding below is commented out, so the result is not symmetric
// with decodeNGram, which still expects base64-encoded tokens.
func (ng *ngram) String() string {
	encoded := make([]string, len(ng.Tokens))
	for i, token := range ng.Tokens {
		encoded[i] = string(token)
		//encoded[i] = base64.StdEncoding.EncodeToString(token) // safer?
	}
	return strings.Join(encoded, tokenSeparator)
}
// decodeNGram splits a serialized ngram on tokenSeparator and
// base64-decodes each piece back into token bytes.
func decodeNGram(s string) (*ngram, error) {
	encodedTokens := strings.Split(s, tokenSeparator)

	tokens := make([][]byte, len(encodedTokens))

	var err error
	for i, encodedToken := range encodedTokens {
		tokens[i], err = base64.StdEncoding.DecodeString(encodedToken)
		if err != nil {
			return nil, err
		}
	}
	return &ngram{tokens}, nil
}
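
// Illustrative sketch (not part of the original library): how ngram
// serialization looks in practice. The token values here are made up.
// Because String does not currently base64-encode (see the note above),
// this returns "hello_world"; decodeNGram would only round-trip it if the
// joined pieces happened to be valid base64.
func exampleNGramString() string {
	ng := ngram{Tokens: [][]byte{[]byte("hello"), []byte("world")}}
	return ng.String() // "hello_world"
}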
type tokenizerConf struct {
	regexp    *regexp.Regexp
	NGramSize int64
}

type tokenizer struct {
	regexp_tokenizer.RegexpTokenizer
	Conf *tokenizerConf
}
func validateConf(tc *tokenizerConf) {
	// Word characters, apostrophes, and hyphens, plus standalone % and $.
	// A-Za-z is spelled out rather than A-z, which would also match a few
	// punctuation characters.
	tc.regexp = regexp.MustCompile(`[0-9A-Za-z_'\-]+|\%|\$`)

	// TODO: We force NGramSize = 1 so as to create disjoint ngrams,
	// which is necessary for the naive assumption of conditional
	// independence among tokens. It would be great to allow ngrams
	// to be greater than 1 and select only disjoint ngrams from the
	// tokenizer.
	tc.NGramSize = 1
}
func newTokenizer(tc *tokenizerConf) (*tokenizer, error) {
	validateConf(tc)
	return &tokenizer{*regexp_tokenizer.NewRegexpTokenizer(tc.regexp), tc}, nil
}
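
// Illustrative sketch (not part of the original library): constructing a
// tokenizer with the default configuration. validateConf overwrites both
// fields, so an empty tokenizerConf is sufficient.
func exampleNewTokenizer() (*tokenizer, error) {
	return newTokenizer(&tokenizerConf{})
}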
// Parse tokenizes a document and groups the resulting tokens into ngrams.
func (t *tokenizer) Parse(doc string) []ngram {
	// maybe use token types for datetimes or something instead of
	// the actual byte slice
	alltokens := t.Tokenize([]byte(strings.ToLower(doc)))

	// drop stopwords, stem everything else, and remember each surviving
	// token's original position
	filtered := make(map[int][]byte)
	for i, token := range alltokens {
		exclude := false
		for _, stop := range stopbytes {
			if bytes.Equal(token.Term, stop) {
				exclude = true
				break
			}
		}
		if exclude {
			continue
		}

		tokenString := porterstemmer.StemString(string(token.Term))
		//tokenBytes := porterstemmer.Stem(token.Term) // takes runes, not bytes
		if token.Type == analysis.Numeric {
			tokenString = "NUMBER"
		} else if token.Type == analysis.DateTime {
			tokenString = "DATE"
		}
		filtered[i] = []byte(tokenString)
	}

	// only consider sequential terms as candidates for ngrams --
	// terms separated by stopwords are ineligible. Note that map iteration
	// order is not deterministic, so this grouping is only reliable while
	// NGramSize is forced to 1, where every token becomes its own unigram
	// regardless of how the runs are split.
	allNGrams := make([]ngram, 0, 100)
	currentTokens := make([][]byte, 0, 100)
	lastObserved := -1
	for i, token := range filtered {
		if (i - 1) != lastObserved {
			ngrams := t.tokensToNGrams(currentTokens)
			allNGrams = append(allNGrams, ngrams...)

			currentTokens = make([][]byte, 0, 100)
		}
		currentTokens = append(currentTokens, token)
		lastObserved = i
	}

	// bring in the last run of tokens
	if len(currentTokens) > 0 {
		ngrams := t.tokensToNGrams(currentTokens)
		allNGrams = append(allNGrams, ngrams...)
	}

	return allNGrams
}
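
// Illustrative sketch (not part of the original library): turning a raw
// document into the ngram feature strings the classifier counts. The input
// handling and return format here are made up for illustration.
func exampleParse(doc string) []string {
	tknzr, err := newTokenizer(&tokenizerConf{})
	if err != nil {
		return nil
	}

	features := make([]string, 0)
	for _, gram := range tknzr.Parse(doc) {
		features = append(features, gram.String()) // e.g. "NUMBER" or a stemmed word
	}
	return features
}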
// tokensToNGrams builds every sequential ngram of length 1 through
// Conf.NGramSize from a run of adjacent tokens.
func (t *tokenizer) tokensToNGrams(tokens [][]byte) []ngram {
	nTokens := int64(len(tokens))

	nNGrams := int64(0)
	for i := int64(1); i <= t.Conf.NGramSize; i++ {
		chosen := choose(nTokens, i)
		nNGrams += chosen
	}

	ngrams := make([]ngram, 0, nNGrams)
	for ngramSize := int64(1); ngramSize <= t.Conf.NGramSize; ngramSize++ {
		nNGramsOfSize := choose(nTokens, ngramSize)

		for i := int64(0); i < nNGramsOfSize; i++ {
			ngrams = append(ngrams, ngram{tokens[i:(i + ngramSize)]})
		}
	}

	return ngrams
}
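
// Illustrative sketch (not part of the original library): with NGramSize = 2,
// the tokens [a b c] yield the unigrams a, b, c plus the sequential bigrams
// a_b and b_c -- choose(3,1) + choose(3,2) = 5 ngrams in total. The embedded
// RegexpTokenizer is left at its zero value since tokensToNGrams never uses it.
func exampleSequentialNGrams() []ngram {
	t := &tokenizer{Conf: &tokenizerConf{NGramSize: 2}}
	tokens := [][]byte{[]byte("a"), []byte("b"), []byte("c")}
	return t.tokensToNGrams(tokens)
}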
// choose is not a binomial coefficient: ngrams must be sequential, so the
// number of ngrams of length k over n tokens is just the number of
// contiguous windows, max(n-k+1, 0).
func choose(n, k int64) int64 {
	return max(n-k+int64(1), 0)
}

func max(x, y int64) int64 {
	if x > y {
		return x
	}
	return y
}