64 lines
1.1 KiB
Go
64 lines
1.1 KiB
Go
package tagengine
|
|
|
|
import (
|
|
"sort"
|
|
"strings"
|
|
)
|
|
|
|
// ignoreTokens is the set of single-character punctuation strings that
// Tokenize discards when one of them makes up an entire n-gram.
// It is filled once by init and only read afterwards.
var ignoreTokens = make(map[string]struct{})

// init populates ignoreTokens with the punctuation characters listed below.
func init() {
	// These on their own are ignored.
	for _, punct := range []string{
		"`", `~`, `!`, `@`, `#`, `%`, `^`, `&`, `*`, `(`, `)`,
		`-`, `_`, `+`, `=`, `[`, `{`, `]`, `}`, `\`, `|`,
		`:`, `;`, `"`, `'`, `,`, `<`, `.`, `>`, `?`, `/`,
	} {
		ignoreTokens[punct] = struct{}{}
	}
}
|
|
|
|
func Tokenize(
|
|
input string,
|
|
maxNgram int,
|
|
) (
|
|
tokens []string,
|
|
) {
|
|
// Avoid duplicate ngrams.
|
|
ignored := map[string]bool{}
|
|
|
|
fields := strings.Fields(input)
|
|
|
|
if len(fields) < maxNgram {
|
|
maxNgram = len(fields)
|
|
}
|
|
|
|
for i := 1; i < maxNgram+1; i++ {
|
|
jMax := len(fields) - i + 1
|
|
|
|
for j := 0; j < jMax; j++ {
|
|
ngram := strings.Join(fields[j:i+j], " ")
|
|
if _, ok := ignoreTokens[ngram]; !ok {
|
|
if _, ok := ignored[ngram]; !ok {
|
|
tokens = append(tokens, ngram)
|
|
ignored[ngram] = true
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
sortTokens(tokens)
|
|
|
|
return tokens
|
|
}
|
|
|
|
// sortTokens sorts tokens in place. Strings with a smaller byte length come
// first; strings of equal byte length are ordered by ascending string
// comparison.
func sortTokens(tokens []string) {
	sort.Slice(tokens, func(i, j int) bool {
		a, b := tokens[i], tokens[j]
		if len(a) != len(b) {
			return len(a) < len(b)
		}
		return a < b
	})
}
|