Forked from subrubia
parent d04606923b
commit 0a77a882f1
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 Suburbia

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,5 @@
module git.crumpington.com/public/tagengine

go 1.17

require golang.org/x/text v0.3.7
@@ -0,0 +1,3 @@
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -0,0 +1,30 @@
package tagengine

import "unicode"

func ngramLength(s string) int {
	N := len(s)
	i := 0
	count := 0

	for {
		// Eat spaces.
		for i < N && unicode.IsSpace(rune(s[i])) {
			i++
		}

		// Done?
		if i == N {
			break
		}

		// Non-space!
		count++

		// Eat non-spaces.
		for i < N && !unicode.IsSpace(rune(s[i])) {
			i++
		}
	}
	return count
}
@@ -0,0 +1,27 @@
package tagengine

import "testing"

func TestNGramLength(t *testing.T) {
	type Case struct {
		Input  string
		Length int
	}

	cases := []Case{
		{"a b c", 3},
		{" xyz\nlkj dflaj a", 4},
		{"a", 1},
		{" a", 1},
		{" a\n", 1},
		{" a ", 1},
		{"\tx\ny\nz q ", 4},
	}

	for _, tc := range cases {
		length := ngramLength(tc.Input)
		if length != tc.Length {
			t.Fatalf("%s: %d != %d", tc.Input, length, tc.Length)
		}
	}
}
@@ -0,0 +1,79 @@
package tagengine

import (
	"fmt"
	"strings"
)

type node struct {
	Token    string
	Matches  []*Rule
	Children map[string]*node
}

func (n *node) AddRule(r *Rule) {
	n.addRule(r, 0)
}

func (n *node) addRule(r *Rule, idx int) {
	if len(r.Includes) == idx {
		n.Matches = append(n.Matches, r)
		return
	}

	token := r.Includes[idx]

	child, ok := n.Children[token]
	if !ok {
		child = &node{
			Token:    token,
			Children: map[string]*node{},
		}
		n.Children[token] = child
	}

	child.addRule(r, idx+1)
}

// Note that tokens must be sorted. This is the case for tokens created by
// the Tokenize function.
func (n *node) Match(tokens []string) (rules []*Rule) {
	return n.match(tokens, rules)
}

func (n *node) match(tokens []string, rules []*Rule) []*Rule {
	// Check for a match.
	if n.Matches != nil {
		rules = append(rules, n.Matches...)
	}

	if len(tokens) == 0 {
		return rules
	}

	// Attempt to match children.
	for i := 0; i < len(tokens); i++ {
		token := tokens[i]
		if child, ok := n.Children[token]; ok {
			rules = child.match(tokens[i+1:], rules)
		}
	}

	return rules
}

func (n *node) Dump() {
	n.dump(0)
}

func (n *node) dump(depth int) {
	indent := strings.Repeat(" ", 2*depth)
	tag := ""
	for _, m := range n.Matches {
		tag += " " + m.Tag
	}
	fmt.Printf("%s%s%s\n", indent, n.Token, tag)
	for _, child := range n.Children {
		child.dump(depth + 1)
	}
}
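A minimal sketch of how the trie is used within the package (the rule here is
illustrative): AddRule stores a rule's sorted Includes as a path, and Match
scans the sorted input tokens for that path as a subsequence.

	r := &Rule{Tag: "cc", Includes: []string{"coca", "cola"}}
	root := &node{Token: "/", Children: map[string]*node{}}
	root.AddRule(r)
	matched := root.Match([]string{"coca", "cola", "zero"}) // → []*Rule{r}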
@@ -0,0 +1,151 @@
package tagengine

type Rule struct {
	Tag      string
	Includes []string
	Excludes []string
	Blocks   []string // List of blocked tags.

	MatchCount int
	FirstCount int

	score int

	excludes map[string]struct{}
}

func NewRule(tag string) Rule {
	return Rule{Tag: tag}
}

func (r Rule) Inc(l ...string) Rule {
	return Rule{
		Tag:      r.Tag,
		Includes: l,
		Excludes: r.Excludes,
		Blocks:   r.Blocks,
	}
}

func (r Rule) Exc(l ...string) Rule {
	return Rule{
		Tag:      r.Tag,
		Includes: r.Includes,
		Excludes: l,
		Blocks:   r.Blocks,
	}
}

func (r Rule) Block(l ...string) Rule {
	return Rule{
		Tag:      r.Tag,
		Includes: r.Includes,
		Excludes: r.Excludes,
		Blocks:   l,
	}
}

func (rule *Rule) normalize() {
	sanitize := newSanitizer()

	for i, token := range rule.Includes {
		rule.Includes[i] = sanitize(token)
	}
	for i, token := range rule.Excludes {
		rule.Excludes[i] = sanitize(token)
	}

	sortTokens(rule.Includes)
	sortTokens(rule.Excludes)

	rule.excludes = map[string]struct{}{}
	for _, s := range rule.Excludes {
		rule.excludes[s] = struct{}{}
	}

	rule.score = rule.computeScore()
}

func (r Rule) maxNGram() int {
	max := 0
	for _, s := range r.Includes {
		n := ngramLength(s)
		if n > max {
			max = n
		}
	}
	for _, s := range r.Excludes {
		n := ngramLength(s)
		if n > max {
			max = n
		}
	}

	return max
}

func (r Rule) isExcluded(tokens []string) bool {
	// This is most often the case.
	if len(r.excludes) == 0 {
		return false
	}

	for _, s := range tokens {
		if _, ok := r.excludes[s]; ok {
			return true
		}
	}
	return false
}

func (r Rule) computeScore() (score int) {
	for _, token := range r.Includes {
		n := ngramLength(token)
		score += n * (n + 1) / 2
	}
	return score
}

func ruleLess(lhs, rhs *Rule) bool {
	// If scores differ, sort by score.
	if lhs.score != rhs.score {
		return lhs.score < rhs.score
	}

	// If include depth differs, sort by depth.
	lDepth := len(lhs.Includes)
	rDepth := len(rhs.Includes)

	if lDepth != rDepth {
		return lDepth < rDepth
	}

	// If exclude depth differs, sort by depth.
	lDepth = len(lhs.Excludes)
	rDepth = len(rhs.Excludes)

	if lDepth != rDepth {
		return lDepth < rDepth
	}

	// Sort alphabetically by includes.
	for i := range lhs.Includes {
		if lhs.Includes[i] != rhs.Includes[i] {
			return lhs.Includes[i] < rhs.Includes[i]
		}
	}

	// Sort alphabetically by excludes.
	for i := range lhs.Excludes {
		if lhs.Excludes[i] != rhs.Excludes[i] {
			return lhs.Excludes[i] < rhs.Excludes[i]
		}
	}

	// Sort by tag.
	if lhs.Tag != rhs.Tag {
		return lhs.Tag < rhs.Tag
	}

	return false
}
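A worked example of computeScore, following the n·(n+1)/2 formula above: a
rule whose single include is the bigram "coca cola" scores 2·3/2 = 3, while a
rule with the two unigram includes "coca" and "cola" scores 1 + 1 = 2. Longer
phrase matches therefore outrank mere co-occurrence both in ruleLess ordering
and in Match's confidence weighting below.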
@@ -0,0 +1,58 @@
package tagengine

// A RuleGroup can be converted into a list of rules. Each rule will point to
// the same tag, and have the same exclude and block sets.
type RuleGroup struct {
	Tag      string
	Includes [][]string
	Excludes []string
	Blocks   []string
}

func NewRuleGroup(tag string) RuleGroup {
	return RuleGroup{
		Tag:      tag,
		Includes: [][]string{},
		Excludes: []string{},
		Blocks:   []string{},
	}
}

func (g RuleGroup) Inc(l ...string) RuleGroup {
	return RuleGroup{
		Tag:      g.Tag,
		Includes: append(g.Includes, l),
		Excludes: g.Excludes,
		Blocks:   g.Blocks,
	}
}

func (g RuleGroup) Exc(l ...string) RuleGroup {
	return RuleGroup{
		Tag:      g.Tag,
		Includes: g.Includes,
		Excludes: l,
		Blocks:   g.Blocks,
	}
}

func (g RuleGroup) Block(l ...string) RuleGroup {
	return RuleGroup{
		Tag:      g.Tag,
		Includes: g.Includes,
		Excludes: g.Excludes,
		Blocks:   l,
	}
}

func (rg RuleGroup) ToList() (l []Rule) {
	for _, includes := range rg.Includes {
		l = append(l, Rule{
			Tag:      rg.Tag,
			Excludes: rg.Excludes,
			Blocks:   rg.Blocks,
			Includes: includes,
		})
	}
	return
}
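A usage sketch for RuleGroup (the tag and tokens here are hypothetical): each
Inc call adds one alternative include set, and ToList expands the group into
one Rule per alternative.

	rules := NewRuleGroup("coke").
		Inc("coca cola").
		Inc("cocacola").
		Exc("pepsi").
		ToList()
	// len(rules) == 2; both carry Tag "coke" and Excludes ["pepsi"].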
@@ -0,0 +1,181 @@
package tagengine

import (
	"sort"
)

type RuleSet struct {
	root     *node
	maxNgram int
	sanitize func(...string) string
	rules    []*Rule
}

func NewRuleSet() *RuleSet {
	return &RuleSet{
		root: &node{
			Token:    "/",
			Children: map[string]*node{},
		},
		sanitize: newSanitizer(),
		rules:    []*Rule{},
	}
}

func NewRuleSetFromList(rules []Rule) *RuleSet {
	rs := NewRuleSet()
	rs.AddRule(rules...)
	return rs
}

func (t *RuleSet) Add(ruleOrGroup ...interface{}) {
	for _, ix := range ruleOrGroup {
		switch x := ix.(type) {
		case Rule:
			t.AddRule(x)
		case RuleGroup:
			t.AddRuleGroup(x)
		default:
			panic("Add expects either Rule or RuleGroup objects.")
		}
	}
}

func (t *RuleSet) AddRule(rules ...Rule) {
	for _, rule := range rules {
		rule := rule

		// Make sure rule is well-formed.
		rule.normalize()

		// Update maxNgram.
		N := rule.maxNGram()
		if N > t.maxNgram {
			t.maxNgram = N
		}

		t.rules = append(t.rules, &rule)
		t.root.AddRule(&rule)
	}
}

func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {
	for _, rg := range ruleGroups {
		t.AddRule(rg.ToList()...)
	}
}

// MatchRules will return a list of all matching rules. The rules are sorted by
// the match's "score". The best match will be first.
func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
	input = t.sanitize(input)
	tokens := Tokenize(input, t.maxNgram)

	rules = t.root.Match(tokens)
	if len(rules) == 0 {
		return rules
	}

	// Check excludes.
	l := rules[:0]
	for _, r := range rules {
		if !r.isExcluded(tokens) {
			l = append(l, r)
		}
	}

	rules = l

	// Sort rules descending.
	sort.Slice(rules, func(i, j int) bool {
		return ruleLess(rules[j], rules[i])
	})

	// Update rule stats.
	if len(rules) > 0 {
		rules[0].FirstCount++
		for _, r := range rules {
			r.MatchCount++
		}
	}

	return rules
}

type Match struct {
	Tag        string
	Confidence float64 // In the range (0,1].
}

// Match returns a list of matches with confidence.
func (t *RuleSet) Match(input string) []Match {
	rules := t.MatchRules(input)
	if len(rules) == 0 {
		return []Match{}
	}
	if len(rules) == 1 {
		return []Match{{
			Tag:        rules[0].Tag,
			Confidence: 1,
		}}
	}

	// Create list of blocked tags.
	blocks := map[string]struct{}{}
	for _, rule := range rules {
		for _, tag := range rule.Blocks {
			blocks[tag] = struct{}{}
		}
	}

	// Remove rules for blocked tags.
	iOut := 0
	for _, rule := range rules {
		if _, ok := blocks[rule.Tag]; ok {
			continue
		}
		rules[iOut] = rule
		iOut++
	}
	rules = rules[:iOut]

	// Matches by index.
	matches := map[string]int{}
	out := []Match{}
	sum := float64(0)

	for _, rule := range rules {
		idx, ok := matches[rule.Tag]
		if !ok {
			idx = len(matches)
			matches[rule.Tag] = idx
			out = append(out, Match{Tag: rule.Tag})
		}
		out[idx].Confidence += float64(rule.score)
		sum += float64(rule.score)
	}

	for i := range out {
		out[i].Confidence /= sum
	}

	return out
}

// ListRules returns rules used in the ruleset sorted by the rules'
// FirstCount. This is the number of times the given rule was the best match to
// an input.
func (t *RuleSet) ListRules() []*Rule {
	sort.Slice(t.rules, func(i, j int) bool {
		if t.rules[j].FirstCount != t.rules[i].FirstCount {
			return t.rules[j].FirstCount < t.rules[i].FirstCount
		}

		if t.rules[j].MatchCount != t.rules[i].MatchCount {
			return t.rules[j].MatchCount < t.rules[i].MatchCount
		}

		return t.rules[j].Tag < t.rules[i].Tag
	})
	return t.rules
}
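A usage sketch for the full pipeline (assumes newSanitizer, defined elsewhere
in this package, lower-cases input and normalizes punctuation, as the tests
below rely on; the tags here are hypothetical):

	rs := NewRuleSet()
	rs.Add(
		NewRule("soda").Inc("coca cola"),
		NewRule("diet").Inc("zero"),
	)
	matches := rs.Match("coca-cola zero")
	// Rule scores are 3 ("coca cola") and 1 ("zero"), so:
	// matches == []Match{{"soda", 0.75}, {"diet", 0.25}}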
@@ -0,0 +1,84 @@
package tagengine

import (
	"reflect"
	"testing"
)

func TestRulesSet(t *testing.T) {
	rs := NewRuleSet()
	rs.AddRule(Rule{
		Tag:      "cc/2",
		Includes: []string{"cola", "coca"},
	})
	rs.AddRule(Rule{
		Tag:      "cc/0",
		Includes: []string{"coca cola"},
	})
	rs.AddRule(Rule{
		Tag:      "cz/2",
		Includes: []string{"coca", "zero"},
	})
	rs.AddRule(Rule{
		Tag:      "cc0/3",
		Includes: []string{"zero", "coca", "cola"},
	})
	rs.AddRule(Rule{
		Tag:      "cc0/3.1",
		Includes: []string{"coca", "cola", "zero"},
		Excludes: []string{"pepsi"},
	})
	rs.AddRule(Rule{
		Tag:      "spa",
		Includes: []string{"spa"},
		Blocks:   []string{"cc/0", "cc0/3", "cc0/3.1"},
	})

	type TestCase struct {
		Input   string
		Matches []Match
	}

	cases := []TestCase{
		{
			Input: "coca-cola zero",
			Matches: []Match{
				{"cc0/3.1", 0.3},
				{"cc0/3", 0.3},
				{"cz/2", 0.2},
				{"cc/2", 0.2},
			},
		}, {
			Input: "coca cola",
			Matches: []Match{
				{"cc/0", 0.6},
				{"cc/2", 0.4},
			},
		}, {
			Input: "coca cola zero pepsi",
			Matches: []Match{
				{"cc0/3", 0.3},
				{"cc/0", 0.3},
				{"cz/2", 0.2},
				{"cc/2", 0.2},
			},
		}, {
			Input:   "fanta orange",
			Matches: []Match{},
		}, {
			Input: "coca-cola zero / fanta / spa",
			Matches: []Match{
				{"cz/2", 0.4},
				{"cc/2", 0.4},
				{"spa", 0.2},
			},
		},
	}

	for _, tc := range cases {
		matches := rs.Match(tc.Input)
		if !reflect.DeepEqual(matches, tc.Matches) {
			t.Fatalf("%v != %v", matches, tc.Matches)
		}
	}
}
@@ -0,0 +1,63 @@
package tagengine

import (
	"sort"
	"strings"
)

var ignoreTokens = map[string]struct{}{}

func init() {
	// These on their own are ignored.
	tokens := []string{
		"`", `~`, `!`, `@`, `#`, `%`, `^`, `&`, `*`, `(`, `)`,
		`-`, `_`, `+`, `=`, `[`, `{`, `]`, `}`, `\`, `|`,
		`:`, `;`, `"`, `'`, `,`, `<`, `.`, `>`, `?`, `/`,
	}
	for _, s := range tokens {
		ignoreTokens[s] = struct{}{}
	}
}

func Tokenize(
	input string,
	maxNgram int,
) (
	tokens []string,
) {
	// Track emitted ngrams to avoid duplicates.
	seen := map[string]bool{}

	fields := strings.Fields(input)

	if len(fields) < maxNgram {
		maxNgram = len(fields)
	}

	for i := 1; i < maxNgram+1; i++ {
		jMax := len(fields) - i + 1

		for j := 0; j < jMax; j++ {
			ngram := strings.Join(fields[j:i+j], " ")
			if _, ok := ignoreTokens[ngram]; !ok {
				if !seen[ngram] {
					tokens = append(tokens, ngram)
					seen[ngram] = true
				}
			}
		}
	}

	sortTokens(tokens)

	return tokens
}

func sortTokens(tokens []string) {
	sort.Slice(tokens, func(i, j int) bool {
		if len(tokens[i]) != len(tokens[j]) {
			return len(tokens[i]) < len(tokens[j])
		}
		return tokens[i] < tokens[j]
	})
}
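For example (a sketch; the output order follows sortTokens, shortest first,
then lexicographic):

	Tokenize("coca cola zero", 2)
	// → ["coca", "cola", "zero", "coca cola", "cola zero"]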
@@ -0,0 +1,55 @@
package tagengine

import (
	"reflect"
	"testing"
)

func TestTokenize(t *testing.T) {
	type Case struct {
		Input    string
		MaxNgram int
		Output   []string
	}

	cases := []Case{
		{
			Input:    "a bb c d",
			MaxNgram: 3,
			Output: []string{
				"a", "c", "d", "bb",
				"c d", "a bb", "bb c",
				"a bb c", "bb c d",
			},
		}, {
			Input:    "a b",
			MaxNgram: 3,
			Output: []string{
				"a", "b", "a b",
			},
		}, {
			Input:    "- b c d",
			MaxNgram: 3,
			Output: []string{
				"b", "c", "d",
				"- b", "b c", "c d",
				"- b c", "b c d",
			},
		}, {
			Input:    "a a b c d c d",
			MaxNgram: 3,
			Output: []string{
				"a", "b", "c", "d",
				"a a", "a b", "b c", "c d", "d c",
				"a a b", "a b c", "b c d", "c d c", "d c d",
			},
		},
	}

	for _, tc := range cases {
		output := Tokenize(tc.Input, tc.MaxNgram)
		if !reflect.DeepEqual(output, tc.Output) {
			t.Fatalf("%s: %#v", tc.Input, output)
		}
	}
}