|
|
- package tagengine
-
- import (
- "sort"
- "strings"
- )
-
// ignoreTokens is the set of single punctuation characters that are
// ignored when one of them makes up an entire token on its own.
var ignoreTokens = map[string]struct{}{
	"`": {}, "~": {}, "!": {}, "@": {}, "#": {}, "%": {}, "^": {}, "&": {}, "*": {}, "(": {}, ")": {},
	"-": {}, "_": {}, "+": {}, "=": {}, "[": {}, "{": {}, "]": {}, "}": {}, "\\": {}, "|": {},
	":": {}, ";": {}, "\"": {}, "'": {}, ",": {}, "<": {}, ".": {}, ">": {}, "?": {}, "/": {},
}
-
- func Tokenize(
- input string,
- maxNgram int,
- ) (
- tokens []string,
- ) {
- // Avoid duplicate ngrams.
- ignored := map[string]bool{}
-
- fields := strings.Fields(input)
-
- if len(fields) < maxNgram {
- maxNgram = len(fields)
- }
-
- for i := 1; i < maxNgram+1; i++ {
- jMax := len(fields) - i + 1
-
- for j := 0; j < jMax; j++ {
- ngram := strings.Join(fields[j:i+j], " ")
- if _, ok := ignoreTokens[ngram]; !ok {
- if _, ok := ignored[ngram]; !ok {
- tokens = append(tokens, ngram)
- ignored[ngram] = true
- }
- }
- }
- }
-
- sortTokens(tokens)
-
- return tokens
- }
-
// sortTokens reorders tokens in place so that strings with fewer bytes come
// first; strings of equal byte length are arranged in ascending
// lexicographic order.
func sortTokens(tokens []string) {
	less := func(a, b int) bool {
		left, right := tokens[a], tokens[b]
		if len(left) == len(right) {
			return left < right
		}
		return len(left) < len(right)
	}
	sort.Slice(tokens, less)
}
|