Forked from subrubia

2021-09-09 12:25:53 +02:00 · 2021-09-09 12:25:53 +02:00 · 0a77a882f1
parent d04606923b
commit 0a77a882f1
14 changed files with 920 additions and 0 deletions
--- a/21
+++ b/21
@ -0,0 +1,21 @@
 MIT License
 Copyright (c) 2020 Suburbia
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:
 The above copyright notice and this permission notice shall be included in all
 copies or substantial portions of the Software.
 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
--- a/go.mod
+++ b/go.mod
@ -0,0 +1,5 @@
 module git.crumpington.com/public/tagengine
 go 1.17
 require golang.org/x/text v0.3.7
--- a/go.sum
+++ b/go.sum
@ -0,0 +1,3 @@
 golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
 golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
--- a/ngram.go
+++ b/ngram.go
@ -0,0 +1,30 @@
 package tagengine
 import "unicode"
 // ngramLength returns the number of whitespace-separated words in s, i.e. the
 // "n" of the n-gram that s represents. Whitespace is any rune for which
 // unicode.IsSpace reports true. Leading, trailing and repeated whitespace is
 // ignored, so "" and "   " both have length 0.
 func ngramLength(s string) int {
 	count := 0
 	inWord := false
 	// Range over decoded runes rather than raw bytes: a UTF-8 continuation
 	// byte such as 0x85 or 0xA0, converted to a rune on its own, looks like
 	// U+0085 or U+00A0 (both whitespace) and would split a word such as
 	// "àb" (0xC3 0xA0 0x62) in two.
 	for _, r := range s {
 		if unicode.IsSpace(r) {
 			inWord = false
 			continue
 		}
 		if !inWord {
 			// First rune of a new word.
 			inWord = true
 			count++
 		}
 	}
 	return count
 }
--- a/ngram_test.go
+++ b/ngram_test.go
@ -0,0 +1,31 @@
 package tagengine
 import (
 	"log"
 	"testing"
 )
 // TestNGramLength checks ngramLength against inputs with single, repeated,
 // leading and trailing whitespace (spaces, tabs and newlines).
 func TestNGramLength(t *testing.T) {
 	cases := []struct {
 		input string
 		want  int
 	}{
 		{"a b c", 3},
 		{"  xyz\nlkj  dflaj a", 4},
 		{"a", 1},
 		{" a", 1},
 		{"a", 1},
 		{" a\n", 1},
 		{" a ", 1},
 		{"\tx\ny\nz q ", 4},
 	}
 	for i := range cases {
 		tc := cases[i]
 		got := ngramLength(tc.input)
 		if got == tc.want {
 			continue
 		}
 		// NOTE(review): log.Fatalf exits the whole test binary instead of
 		// failing just this test; t.Fatalf would be the usual choice, but
 		// switching requires dropping the "log" import as well.
 		log.Fatalf("%s: %d != %d", tc.input, got, tc.want)
 	}
 }
--- a/node.go
+++ b/node.go
@ -0,0 +1,79 @@
 package tagengine
 import (
 	"fmt"
 	"strings"
 )
 // node is one vertex of the trie in which rules are stored. A rule is filed
 // under the path spelled by its Includes tokens, one trie level per token.
 type node struct {
 	// Token is the include token this node was created for (the root of a
 	// RuleSet uses "/").
 	Token    string
 	// Matches holds the rules whose Includes end exactly at this node.
 	Matches  []*Rule
 	// Children maps the next include token to the corresponding child node.
 	// It must be non-nil before rules are added through this node.
 	Children map[string]*node
 }
 // AddRule files r in the trie rooted at n, following r.Includes from its
 // first token.
 func (n *node) AddRule(r *Rule) {
 	n.addRule(r, 0)
 }
 // addRule walks down the trie along r.Includes[idx:], creating any missing
 // child nodes on the way, and records r on the node where the path ends.
 func (n *node) addRule(r *Rule, idx int) {
 	cur := n
 	for idx != len(r.Includes) {
 		token := r.Includes[idx]
 		next, exists := cur.Children[token]
 		if !exists {
 			next = &node{Token: token, Children: make(map[string]*node)}
 			cur.Children[token] = next
 		}
 		cur = next
 		idx++
 	}
 	cur.Matches = append(cur.Matches, r)
 }
 // Match returns every rule stored in the trie whose Includes can all be found
 // in tokens. The tokens must be sorted the way sortTokens orders them, which
 // is what the Tokenize function produces.
 func (n *node) Match(tokens []string) (rules []*Rule) {
 	return n.match(tokens, nil)
 }
 // match appends to rules the rules stored on n, then descends into every
 // child reachable via one of tokens, handing each child only the tokens that
 // come after the one just consumed.
 func (n *node) match(tokens []string, rules []*Rule) []*Rule {
 	// Rules ending at this node match regardless of the remaining tokens.
 	rules = append(rules, n.Matches...)
 	for i, token := range tokens {
 		child, found := n.Children[token]
 		if !found {
 			continue
 		}
 		rules = child.match(tokens[i+1:], rules)
 	}
 	return rules
 }
 // Dump prints the trie rooted at n to standard output, one node per line.
 func (n *node) Dump() {
 	n.dump(0)
 }
 // dump prints n indented by two spaces per depth level, followed by the tags
 // of the rules stored on it, and then prints its children one level deeper.
 // Children are visited in map iteration order.
 func (n *node) dump(depth int) {
 	var tags strings.Builder
 	for _, rule := range n.Matches {
 		tags.WriteString(" ")
 		tags.WriteString(rule.Tag)
 	}
 	fmt.Printf("%s%s%s\n", strings.Repeat(" ", 2*depth), n.Token, tags.String())
 	for _, child := range n.Children {
 		child.dump(depth + 1)
 	}
 }
--- a/rule.go
+++ b/rule.go
@ -0,0 +1,151 @@
 package tagengine
 // Rule assigns Tag to inputs that contain all of the Includes tokens and none
 // of the Excludes tokens.
 type Rule struct {
 	Tag      string
 	Includes []string
 	Excludes []string
 	Blocks   []string // List of blocked tags.
 	// MatchCount is incremented each time the rule is returned by
 	// RuleSet.MatchRules; FirstCount each time it is the first rule returned.
 	MatchCount int
 	FirstCount int
 	// score and excludes are derived from Includes and Excludes by normalize.
 	score int
 	excludes map[string]struct{}
 }
 // NewRule returns a rule for tag with no includes, excludes or blocks.
 func NewRule(tag string) Rule {
 	var r Rule
 	r.Tag = tag
 	return r
 }
 // Inc returns a new rule with the receiver's tag, excludes and blocks and
 // with l as its includes. Counters and derived fields start out zeroed.
 func (r Rule) Inc(l ...string) Rule {
 	next := Rule{Tag: r.Tag, Excludes: r.Excludes, Blocks: r.Blocks}
 	next.Includes = l
 	return next
 }
 // Exc returns a new rule with the receiver's tag, includes and blocks and
 // with l as its excludes. Counters and derived fields start out zeroed.
 func (r Rule) Exc(l ...string) Rule {
 	next := Rule{Tag: r.Tag, Includes: r.Includes, Blocks: r.Blocks}
 	next.Excludes = l
 	return next
 }
 // Block returns a new rule with the receiver's tag, includes and excludes
 // and with l as its blocked tags. Counters and derived fields start out
 // zeroed.
 func (r Rule) Block(l ...string) Rule {
 	next := Rule{Tag: r.Tag, Includes: r.Includes, Excludes: r.Excludes}
 	next.Blocks = l
 	return next
 }
 // normalize brings the rule into the canonical form used for matching: every
 // include and exclude token is sanitized, both lists are ordered with
 // sortTokens, the exclude lookup set is rebuilt and the score is recomputed.
 func (rule *Rule) normalize() {
 	sanitize := newSanitizer()
 	// Work on private copies so that the slices handed in by the caller (and
 	// slices shared between several rules, such as the Excludes of a
 	// RuleGroup) are never rewritten or reordered in place. Empty slices are
 	// left alone, which keeps their nil/non-nil state unchanged.
 	if len(rule.Includes) > 0 {
 		rule.Includes = append([]string(nil), rule.Includes...)
 	}
 	if len(rule.Excludes) > 0 {
 		rule.Excludes = append([]string(nil), rule.Excludes...)
 	}
 	for i, token := range rule.Includes {
 		rule.Includes[i] = sanitize(token)
 	}
 	for i, token := range rule.Excludes {
 		rule.Excludes[i] = sanitize(token)
 	}
 	sortTokens(rule.Includes)
 	sortTokens(rule.Excludes)
 	rule.excludes = make(map[string]struct{}, len(rule.Excludes))
 	for _, s := range rule.Excludes {
 		rule.excludes[s] = struct{}{}
 	}
 	rule.score = rule.computeScore()
 }
 // maxNGram returns the largest word count (see ngramLength) found among the
 // rule's include and exclude tokens, or 0 if it has none.
 func (r Rule) maxNGram() int {
 	longest := 0
 	for _, list := range [][]string{r.Includes, r.Excludes} {
 		for _, token := range list {
 			if n := ngramLength(token); n > longest {
 				longest = n
 			}
 		}
 	}
 	return longest
 }
 // isExcluded reports whether any of tokens is in the rule's exclude set.
 func (r Rule) isExcluded(tokens []string) bool {
 	// Most rules have no excludes, in which case nothing needs scanning.
 	if len(r.excludes) > 0 {
 		for _, token := range tokens {
 			if _, hit := r.excludes[token]; hit {
 				return true
 			}
 		}
 	}
 	return false
 }
 // computeScore returns the rule's score: each include token with n words
 // contributes n*(n+1)/2, so longer n-grams weigh more than the same number
 // of separate single words.
 func (r Rule) computeScore() (score int) {
 	for i := range r.Includes {
 		words := ngramLength(r.Includes[i])
 		score += words * (words + 1) / 2
 	}
 	return score
 }
 // ruleLess reports whether lhs orders before rhs. Rules are compared by, in
 // order: score, number of includes, number of excludes, include tokens,
 // exclude tokens, and finally tag.
 func ruleLess(lhs, rhs *Rule) bool {
 	switch {
 	case lhs.score != rhs.score:
 		return lhs.score < rhs.score
 	case len(lhs.Includes) != len(rhs.Includes):
 		return len(lhs.Includes) < len(rhs.Includes)
 	case len(lhs.Excludes) != len(rhs.Excludes):
 		return len(lhs.Excludes) < len(rhs.Excludes)
 	}
 	// Both lists have equal lengths from here on, so indexing rhs is safe.
 	// Sort alphabetically by includes.
 	for i, inc := range lhs.Includes {
 		if other := rhs.Includes[i]; inc != other {
 			return inc < other
 		}
 	}
 	// Sort alphabetically by excludes.
 	for i, exc := range lhs.Excludes {
 		if other := rhs.Excludes[i]; exc != other {
 			return exc < other
 		}
 	}
 	// Sort by tag; identical rules are not less than each other.
 	return lhs.Tag < rhs.Tag
 }
--- a/rulegroup.go
+++ b/rulegroup.go
@ -0,0 +1,57 @@
 package tagengine
 // A RuleGroup can be converted into a list of rules. Each rule will point to
 // the same tag, and have the same exclude set. Every entry of Includes
 // becomes the include list of one rule.
 type RuleGroup struct {
 	Tag      string
 	Includes [][]string
 	Excludes []string
 	Blocks   []string
 }
 // NewRuleGroup returns an empty rule group for tag.
 func NewRuleGroup(tag string) RuleGroup {
 	return RuleGroup{
 		Tag:      tag,
 		Includes: [][]string{},
 		Excludes: []string{},
 		Blocks:   []string{},
 	}
 }
 // Inc returns a copy of the group with l added as one more include list.
 func (g RuleGroup) Inc(l ...string) RuleGroup {
 	// Copy before appending: appending straight onto g.Includes may write
 	// into spare capacity shared with other groups derived from the same
 	// base, letting one Inc call overwrite the include list added by another.
 	includes := make([][]string, len(g.Includes), len(g.Includes)+1)
 	copy(includes, g.Includes)
 	return RuleGroup{
 		Tag:      g.Tag,
 		Includes: append(includes, l),
 		Excludes: g.Excludes,
 		Blocks:   g.Blocks,
 	}
 }
 // Exc returns a copy of the group with l as its exclude list, replacing any
 // previous excludes.
 func (g RuleGroup) Exc(l ...string) RuleGroup {
 	return RuleGroup{
 		Tag:      g.Tag,
 		Includes: g.Includes,
 		Excludes: l,
 		Blocks:   g.Blocks,
 	}
 }
 // Block returns a copy of the group with l as its list of blocked tags,
 // replacing any previous blocks.
 func (g RuleGroup) Block(l ...string) RuleGroup {
 	return RuleGroup{
 		Tag:      g.Tag,
 		Includes: g.Includes,
 		Excludes: g.Excludes,
 		Blocks:   l,
 	}
 }
 // ToList expands the group into one Rule per entry of Includes. Every rule
 // carries the group's tag, excludes and blocked tags. The result is nil when
 // the group has no include lists.
 func (rg RuleGroup) ToList() (l []Rule) {
 	for _, includes := range rg.Includes {
 		l = append(l, Rule{
 			Tag:      rg.Tag,
 			Includes: includes,
 			Excludes: rg.Excludes,
 			// Blocks must be carried over as well; otherwise calling
 			// RuleGroup.Block has no effect on the generated rules.
 			Blocks: rg.Blocks,
 		})
 	}
 	return l
 }
--- a/ruleset.go
+++ b/ruleset.go
@ -0,0 +1,181 @@
 package tagengine
 import (
 	"sort"
 )
 // RuleSet holds a collection of rules and matches input strings against them.
 type RuleSet struct {
 	// root is the top of the trie in which all added rules are filed.
 	root     *node
 	// maxNgram is the largest n-gram length used by any added rule; inputs
 	// are tokenized into n-grams of up to this many words.
 	maxNgram int
 	// sanitize cleans an input string before it is tokenized.
 	sanitize func(...string) string
 	// rules lists every added rule; ListRules sorts it in place.
 	rules    []*Rule
 }
 // NewRuleSet returns an empty rule set, ready to have rules added.
 func NewRuleSet() *RuleSet {
 	root := &node{Token: "/", Children: make(map[string]*node)}
 	set := &RuleSet{
 		root:     root,
 		sanitize: newSanitizer(),
 		rules:    make([]*Rule, 0),
 	}
 	return set
 }
 // NewRuleSetFromList returns a rule set containing the given rules.
 func NewRuleSetFromList(rules []Rule) *RuleSet {
 	set := NewRuleSet()
 	set.AddRule(rules...)
 	return set
 }
 // Add adds each argument, which must be a Rule or a RuleGroup value, to the
 // rule set. It panics on an argument of any other type.
 func (t *RuleSet) Add(ruleOrGroup ...interface{}) {
 	for _, item := range ruleOrGroup {
 		if rule, ok := item.(Rule); ok {
 			t.AddRule(rule)
 			continue
 		}
 		if group, ok := item.(RuleGroup); ok {
 			t.AddRuleGroup(group)
 			continue
 		}
 		panic("Add expects either Rule or RuleGroup objects.")
 	}
 }
 // AddRule normalizes a copy of each given Rule struct, files it in the trie
 // and records it in the set's rule list. The set's maximum n-gram length
 // grows if a rule uses longer n-grams than any rule added before.
 func (t *RuleSet) AddRule(rules ...Rule) {
 	for i := range rules {
 		// Declared inside the loop so every stored pointer refers to its
 		// own Rule value.
 		added := rules[i]
 		// Make sure the rule is well-formed.
 		added.normalize()
 		if n := added.maxNGram(); n > t.maxNgram {
 			t.maxNgram = n
 		}
 		ptr := &added
 		t.rules = append(t.rules, ptr)
 		t.root.AddRule(ptr)
 	}
 }
 // AddRuleGroup adds every rule produced by each group's ToList.
 func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {
 	for i := range ruleGroups {
 		t.AddRule(ruleGroups[i].ToList()...)
 	}
 }
 // MatchRules returns all rules matching input, ordered from best to worst
 // according to ruleLess. Rules with an exclude token present in the input
 // are dropped. As a side effect the FirstCount of the best rule and the
 // MatchCount of every returned rule are incremented.
 func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
 	tokens := Tokenize(t.sanitize(input), t.maxNgram)
 	rules = t.root.Match(tokens)
 	if len(rules) == 0 {
 		return rules
 	}
 	// Drop excluded rules, compacting the slice in place.
 	kept := 0
 	for _, r := range rules {
 		if r.isExcluded(tokens) {
 			continue
 		}
 		rules[kept] = r
 		kept++
 	}
 	rules = rules[:kept]
 	// Descending order: i sorts before j when j is "less" than i.
 	sort.Slice(rules, func(i, j int) bool {
 		return ruleLess(rules[j], rules[i])
 	})
 	if len(rules) == 0 {
 		return rules
 	}
 	// Update rule stats.
 	rules[0].FirstCount++
 	for _, r := range rules {
 		r.MatchCount++
 	}
 	return rules
 }
 // Match is one result of RuleSet.Match: a tag together with the share of the
 // total rule score that was contributed by rules for that tag.
 type Match struct {
 	Tag        string
 	Confidence float64 // In the range (0,1].
 }
 // Match returns the tags matching input, each with a confidence. Tags appear
 // in the order in which their best rule appears in MatchRules. A tag's
 // confidence is the sum of the scores of its matching rules divided by the
 // sum of the scores of all matching rules that were not blocked.
 //
 // No match yields an empty, non-nil slice. A single matching rule is returned
 // with confidence 1 without consulting blocks. Otherwise every tag listed in
 // the Blocks of any matching rule is removed before confidences are computed.
 func (t *RuleSet) Match(input string) []Match {
 	rules := t.MatchRules(input)
 	if len(rules) == 0 {
 		return []Match{}
 	}
 	if len(rules) == 1 {
 		return []Match{{
 			Tag:        rules[0].Tag,
 			Confidence: 1,
 		}}
 	}
 	// Create list of blocked tags.
 	blocks := map[string]struct{}{}
 	for _, rule := range rules {
 		for _, tag := range rule.Blocks {
 			blocks[tag] = struct{}{}
 		}
 	}
 	// Remove rules for blocked tags, compacting the slice in place.
 	iOut := 0
 	for _, rule := range rules {
 		if _, ok := blocks[rule.Tag]; ok {
 			continue
 		}
 		rules[iOut] = rule
 		iOut++
 	}
 	rules = rules[:iOut]
 	// Aggregate per tag; matches maps a tag to its index in out and hits.
 	matches := map[string]int{}
 	out := []Match{}
 	hits := []int{} // Number of rules seen per tag, parallel to out.
 	sum := float64(0)
 	for _, rule := range rules {
 		idx, ok := matches[rule.Tag]
 		if !ok {
 			idx = len(matches)
 			matches[rule.Tag] = idx
 			out = append(out, Match{Tag: rule.Tag})
 			hits = append(hits, 0)
 		}
 		hits[idx]++
 		out[idx].Confidence += float64(rule.score)
 		sum += float64(rule.score)
 	}
 	if sum == 0 {
 		// Every remaining rule has score 0 (e.g. rules without includes).
 		// Dividing by the sum would produce NaN, so weigh all rules equally
 		// instead. The loop body only runs when rules is non-empty.
 		for i := range out {
 			out[i].Confidence = float64(hits[i]) / float64(len(rules))
 		}
 		return out
 	}
 	for i := range out {
 		out[i].Confidence /= sum
 	}
 	return out
 }
 // ListRules returns the rules of the set ordered by descending FirstCount,
 // i.e. by how often a rule was the best match for an input. Ties are broken
 // by descending MatchCount and then by descending Tag. The set's own rule
 // slice is sorted in place and returned.
 func (t *RuleSet) ListRules() []*Rule {
 	sort.Slice(t.rules, func(i, j int) bool {
 		a, b := t.rules[i], t.rules[j]
 		switch {
 		case a.FirstCount != b.FirstCount:
 			return a.FirstCount > b.FirstCount
 		case a.MatchCount != b.MatchCount:
 			return a.MatchCount > b.MatchCount
 		default:
 			return a.Tag > b.Tag
 		}
 	})
 	return t.rules
 }
--- a/ruleset_test.go
+++ b/ruleset_test.go
@ -0,0 +1,84 @@
 package tagengine
 import (
 	"reflect"
 	"testing"
 )
 // TestRulesSet builds a small rule set and checks the tags and confidences
 // returned by RuleSet.Match, covering n-gram includes, excludes, blocked
 // tags and inputs without any match.
 func TestRulesSet(t *testing.T) {
 	rs := NewRuleSet()
 	rs.AddRule(
 		Rule{
 			Tag:      "cc/2",
 			Includes: []string{"cola", "coca"},
 		},
 		Rule{
 			Tag:      "cc/0",
 			Includes: []string{"coca cola"},
 		},
 		Rule{
 			Tag:      "cz/2",
 			Includes: []string{"coca", "zero"},
 		},
 		Rule{
 			Tag:      "cc0/3",
 			Includes: []string{"zero", "coca", "cola"},
 		},
 		Rule{
 			Tag:      "cc0/3.1",
 			Includes: []string{"coca", "cola", "zero"},
 			Excludes: []string{"pepsi"},
 		},
 		Rule{
 			Tag:      "spa",
 			Includes: []string{"spa"},
 			Blocks:   []string{"cc/0", "cc0/3", "cc0/3.1"},
 		},
 	)
 	cases := []struct {
 		input string
 		want  []Match
 	}{
 		{
 			input: "coca-cola zero",
 			want: []Match{
 				{"cc0/3.1", 0.3},
 				{"cc0/3", 0.3},
 				{"cz/2", 0.2},
 				{"cc/2", 0.2},
 			},
 		},
 		{
 			input: "coca cola",
 			want: []Match{
 				{"cc/0", 0.6},
 				{"cc/2", 0.4},
 			},
 		},
 		{
 			// "pepsi" excludes cc0/3.1.
 			input: "coca cola zero pepsi",
 			want: []Match{
 				{"cc0/3", 0.3},
 				{"cc/0", 0.3},
 				{"cz/2", 0.2},
 				{"cc/2", 0.2},
 			},
 		},
 		{
 			input: "fanta orange",
 			want:  []Match{},
 		},
 		{
 			// The "spa" rule blocks cc/0, cc0/3 and cc0/3.1.
 			input: "coca-cola zero / fanta / spa",
 			want: []Match{
 				{"cz/2", 0.4},
 				{"cc/2", 0.4},
 				{"spa", 0.2},
 			},
 		},
 	}
 	for i := range cases {
 		got := rs.Match(cases[i].input)
 		if reflect.DeepEqual(got, cases[i].want) {
 			continue
 		}
 		t.Fatalf("%v != %v", got, cases[i].want)
 	}
 }
--- a/sanitize.go
+++ b/sanitize.go
@ -0,0 +1,125 @@
 package tagengine
 import (
 	"strings"
 	"unicode"
 	"golang.org/x/text/runes"
 	"golang.org/x/text/transform"
 	"golang.org/x/text/unicode/norm"
 )
 // newSanitizer returns a function that joins its arguments with spaces and
 // cleans the result: lowercase, "ß" spelled "ss", apostrophe-s reduced to
 // "s", diacritics stripped, digits separated from non-digits, punctuation
 // surrounded by spaces and runs of whitespace collapsed.
 func newSanitizer() func(...string) string {
 	// Decompose (NFD), drop the nonspacing marks (Mn), then recompose (NFC).
 	stripMarks := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
 	// Applied one after another, in this order: expand "ß", then remove the
 	// apostrophe variants in front of an "s".
 	rewrites := [][2]string{
 		{"ß", "ss"},
 		{"'s", "s"},
 		{"`s", "s"},
 		{"´s", "s"},
 	}
 	return func(l ...string) string {
 		s := strings.ToLower(strings.Join(l, " "))
 		for _, rw := range rewrites {
 			s = strings.ReplaceAll(s, rw[0], rw[1])
 		}
 		// Remove diacritics; on error the text is kept as it is.
 		if stripped, _, err := transform.String(stripMarks, s); err == nil {
 			s = stripped
 		}
 		// Clean spaces.
 		return collapseSpaces(addSpaces(spaceNumbers(s)))
 	}
 }
 // spaceNumbers inserts a space wherever s switches between an ASCII digit
 // and any other rune, so "abc123xyz" becomes "abc 123 xyz".
 func spaceNumbers(s string) string {
 	var out strings.Builder
 	prevDigit := false
 	// Range over runes; i is 0 only for the first one.
 	for i, c := range s {
 		curDigit := '0' <= c && c <= '9'
 		if i > 0 && curDigit != prevDigit {
 			out.WriteByte(' ')
 		}
 		prevDigit = curDigit
 		out.WriteRune(c)
 	}
 	return out.String()
 }
 // addSpaces puts a space on both sides of every ASCII punctuation character
 // listed below, so that each ends up as a separate word. Other runes are
 // copied unchanged.
 func addSpaces(s string) string {
 	const spaced = "`~!@#%^&*()" + "-_+=[{]}\\|" + ":;\"',<.>?/"
 	var out strings.Builder
 	// Range over runes.
 	for _, r := range s {
 		if !strings.ContainsRune(spaced, r) {
 			out.WriteRune(r)
 			continue
 		}
 		out.WriteByte(' ')
 		out.WriteRune(r)
 		out.WriteByte(' ')
 	}
 	return out.String()
 }
 // collapseSpaces removes leading and trailing whitespace from s and replaces
 // every remaining run of whitespace with a single space character.
 func collapseSpaces(s string) string {
 	var out strings.Builder
 	gap := false // A whitespace run is pending since the last written rune.
 	// Range over the runes of the trimmed string.
 	for _, r := range strings.TrimSpace(s) {
 		if unicode.IsSpace(r) {
 			gap = true
 			continue
 		}
 		if gap {
 			out.WriteByte(' ')
 			gap = false
 		}
 		out.WriteRune(r)
 	}
 	return out.String()
 }
--- a/sanitize_test.go
+++ b/sanitize_test.go
@ -0,0 +1,35 @@
 package tagengine
 import "testing"
 // TestSanitize runs the sanitizer over inputs with digits, punctuation,
 // apostrophes, diacritics, "ß" and irregular whitespace.
 func TestSanitize(t *testing.T) {
 	cases := []struct {
 		in   string
 		want string
 	}{
 		{"", ""},
 		{"123abc", "123 abc"},
 		{"abc123", "abc 123"},
 		{"abc123xyz", "abc 123 xyz"},
 		{"1f2", "1 f 2"},
 		{" abc", "abc"},
 		{" ; KitKat/m&m's (bÖttle) @ ", "; kitkat / m & ms ( bottle ) @"},
 		{" Pott`s gin   königs beer;SOJU  ", "potts gin konigs beer ; soju"},
 		{"brot & brötchen", "brot & brotchen"},
 		{"Gâteau au fromage blanc, Stück", "gateau au fromage blanc , stuck"},
 		{"Maisels Weisse Weißbier 0,5l", "maisels weisse weissbier 0 , 5 l"},
 		{"Maisels´s Weisse - Hefeweizen 0,5l", "maiselss weisse - hefeweizen 0 , 5 l"},
 		{"€", "€"},
 	}
 	sanitize := newSanitizer()
 	for i := range cases {
 		got := sanitize(cases[i].in)
 		if got == cases[i].want {
 			continue
 		}
 		t.Fatalf("%v != %v", got, cases[i].want)
 	}
 }
--- a/tokenize.go
+++ b/tokenize.go
@ -0,0 +1,63 @@
 package tagengine
 import (
 	"sort"
 	"strings"
 )
 // ignoreTokens is the set of punctuation words that are ignored when they
 // appear on their own: Tokenize never emits them as single-word tokens.
 var ignoreTokens = map[string]struct{}{
 	"`": {}, "~": {}, "!": {}, "@": {}, "#": {}, "%": {}, "^": {}, "&": {}, "*": {}, "(": {}, ")": {},
 	"-": {}, "_": {}, "+": {}, "=": {}, "[": {}, "{": {}, "]": {}, "}": {}, "\\": {}, "|": {},
 	":": {}, ";": {}, "\"": {}, "'": {}, ",": {}, "<": {}, ".": {}, ">": {}, "?": {}, "/": {},
 }
 // Tokenize splits input on whitespace and returns every distinct n-gram of
 // one up to maxNgram consecutive words, each joined with single spaces.
 // N-grams found in ignoreTokens are skipped. The result is ordered by
 // sortTokens.
 func Tokenize(input string, maxNgram int) (tokens []string) {
 	words := strings.Fields(input)
 	// An n-gram cannot be longer than the input itself.
 	limit := maxNgram
 	if len(words) < limit {
 		limit = len(words)
 	}
 	// Avoid duplicate ngrams.
 	seen := make(map[string]struct{})
 	for size := 1; size <= limit; size++ {
 		for start := 0; start+size <= len(words); start++ {
 			ngram := strings.Join(words[start:start+size], " ")
 			if _, skip := ignoreTokens[ngram]; skip {
 				continue
 			}
 			if _, dup := seen[ngram]; dup {
 				continue
 			}
 			seen[ngram] = struct{}{}
 			tokens = append(tokens, ngram)
 		}
 	}
 	sortTokens(tokens)
 	return tokens
 }
 // sortTokens sorts tokens in place, shorter strings (by byte length) first
 // and strings of equal length in lexical order.
 func sortTokens(tokens []string) {
 	byLengthThenText := func(i, j int) bool {
 		a, b := tokens[i], tokens[j]
 		if len(a) == len(b) {
 			return a < b
 		}
 		return len(a) < len(b)
 	}
 	sort.Slice(tokens, byLengthThenText)
 }
--- a/tokenize_test.go
+++ b/tokenize_test.go
@ -0,0 +1,55 @@
 package tagengine
 import (
 	"reflect"
 	"testing"
 )
 // TestTokenize checks the n-grams produced by Tokenize, including their
 // order, the handling of inputs shorter than the maximum n-gram length,
 // ignored punctuation tokens and duplicate n-grams.
 func TestTokenize(t *testing.T) {
 	cases := []struct {
 		input    string
 		maxNgram int
 		want     []string
 	}{
 		{
 			input:    "a bb c d",
 			maxNgram: 3,
 			want: []string{
 				"a", "c", "d", "bb",
 				"c d", "a bb", "bb c",
 				"a bb c", "bb c d",
 			},
 		},
 		{
 			input:    "a b",
 			maxNgram: 3,
 			want: []string{
 				"a", "b", "a b",
 			},
 		},
 		{
 			// A lone "-" is ignored, but n-grams containing it are kept.
 			input:    "- b c d",
 			maxNgram: 3,
 			want: []string{
 				"b", "c", "d",
 				"- b", "b c", "c d",
 				"- b c", "b c d",
 			},
 		},
 		{
 			input:    "a a b c d c d",
 			maxNgram: 3,
 			want: []string{
 				"a", "b", "c", "d",
 				"a a", "a b", "b c", "c d", "d c",
 				"a a b", "a b c", "b c d", "c d c", "d c d",
 			},
 		},
 	}
 	for i := range cases {
 		tc := cases[i]
 		got := Tokenize(tc.input, tc.maxNgram)
 		if reflect.DeepEqual(got, tc.want) {
 			continue
 		}
 		t.Fatalf("%s: %#v", tc.input, got)
 	}
 }