You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

63 lines
1.1 KiB

  1. package tagengine
  2. import (
  3. "sort"
  4. "strings"
  5. )
  6. var ignoreTokens = map[string]struct{}{}
  7. func init() {
  8. // These on their own are ignored.
  9. tokens := []string{
  10. "`", `~`, `!`, `@`, `#`, `%`, `^`, `&`, `*`, `(`, `)`,
  11. `-`, `_`, `+`, `=`, `[`, `{`, `]`, `}`, `\`, `|`,
  12. `:`, `;`, `"`, `'`, `,`, `<`, `.`, `>`, `?`, `/`,
  13. }
  14. for _, s := range tokens {
  15. ignoreTokens[s] = struct{}{}
  16. }
  17. }
  18. func Tokenize(
  19. input string,
  20. maxNgram int,
  21. ) (
  22. tokens []string,
  23. ) {
  24. // Avoid duplicate ngrams.
  25. ignored := map[string]bool{}
  26. fields := strings.Fields(input)
  27. if len(fields) < maxNgram {
  28. maxNgram = len(fields)
  29. }
  30. for i := 1; i < maxNgram+1; i++ {
  31. jMax := len(fields) - i + 1
  32. for j := 0; j < jMax; j++ {
  33. ngram := strings.Join(fields[j:i+j], " ")
  34. if _, ok := ignoreTokens[ngram]; !ok {
  35. if _, ok := ignored[ngram]; !ok {
  36. tokens = append(tokens, ngram)
  37. ignored[ngram] = true
  38. }
  39. }
  40. }
  41. }
  42. sortTokens(tokens)
  43. return tokens
  44. }
  45. func sortTokens(tokens []string) {
  46. sort.Slice(tokens, func(i, j int) bool {
  47. if len(tokens[i]) != len(tokens[j]) {
  48. return len(tokens[i]) < len(tokens[j])
  49. }
  50. return tokens[i] < tokens[j]
  51. })
  52. }