Forked from subrubia
parent d04606923b
commit 0a77a882f1
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 Suburbia

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -0,0 +1,5 @@
module git.crumpington.com/public/tagengine

go 1.17

require golang.org/x/text v0.3.7
@@ -0,0 +1,3 @@
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
@@ -0,0 +1,30 @@
package tagengine

import "unicode"

func ngramLength(s string) int {
	N := len(s)
	i := 0
	count := 0

	for {
		// Eat spaces.
		for i < N && unicode.IsSpace(rune(s[i])) {
			i++
		}

		// Done?
		if i == N {
			break
		}

		// Non-space!
		count++

		// Eat non-spaces.
		for i < N && !unicode.IsSpace(rune(s[i])) {
			i++
		}
	}
	return count
}
@@ -0,0 +1,27 @@
package tagengine

import "testing"

func TestNGramLength(t *testing.T) {
	type Case struct {
		Input  string
		Length int
	}

	cases := []Case{
		{"a b c", 3},
		{" xyz\nlkj dflaj a", 4},
		{"a", 1},
		{" a", 1},
		{" a\n", 1},
		{" a ", 1},
		{"\tx\ny\nz q ", 4},
	}

	for _, tc := range cases {
		length := ngramLength(tc.Input)
		if length != tc.Length {
			t.Fatalf("%s: %d != %d", tc.Input, length, tc.Length)
		}
	}
}
@@ -0,0 +1,79 @@
package tagengine

import (
	"fmt"
	"strings"
)

type node struct {
	Token    string
	Matches  []*Rule
	Children map[string]*node
}

func (n *node) AddRule(r *Rule) {
	n.addRule(r, 0)
}

func (n *node) addRule(r *Rule, idx int) {
	if len(r.Includes) == idx {
		n.Matches = append(n.Matches, r)
		return
	}

	token := r.Includes[idx]

	child, ok := n.Children[token]
	if !ok {
		child = &node{
			Token:    token,
			Children: map[string]*node{},
		}
		n.Children[token] = child
	}

	child.addRule(r, idx+1)
}

// Note that tokens must be sorted. This is the case for tokens created by
// the Tokenize function.
func (n *node) Match(tokens []string) (rules []*Rule) {
	return n.match(tokens, rules)
}

func (n *node) match(tokens []string, rules []*Rule) []*Rule {
	// Check for a match.
	if n.Matches != nil {
		rules = append(rules, n.Matches...)
	}

	if len(tokens) == 0 {
		return rules
	}

	// Attempt to match children.
	for i := 0; i < len(tokens); i++ {
		token := tokens[i]
		if child, ok := n.Children[token]; ok {
			rules = child.match(tokens[i+1:], rules)
		}
	}

	return rules
}

func (n *node) Dump() {
	n.dump(0)
}

func (n *node) dump(depth int) {
	indent := strings.Repeat(" ", 2*depth)
	tag := ""
	for _, m := range n.Matches {
		tag += " " + m.Tag
	}
	fmt.Printf("%s%s%s\n", indent, n.Token, tag)
	for _, child := range n.Children {
		child.dump(depth + 1)
	}
}
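A minimal sketch of how the trie is used within the package (the rule here is
illustrative): AddRule stores a rule's sorted Includes as a path, and Match
scans the sorted input tokens for that path as a subsequence.

	r := &Rule{Tag: "cc", Includes: []string{"coca", "cola"}}
	root := &node{Token: "/", Children: map[string]*node{}}
	root.AddRule(r)
	matched := root.Match([]string{"coca", "cola", "zero"}) // → []*Rule{r}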
@@ -0,0 +1,151 @@
package tagengine

type Rule struct {
	Tag      string
	Includes []string
	Excludes []string
	Blocks   []string // List of blocked tags.

	MatchCount int
	FirstCount int

	score int

	excludes map[string]struct{}
}

func NewRule(tag string) Rule {
	return Rule{Tag: tag}
}

func (r Rule) Inc(l ...string) Rule {
	return Rule{
		Tag:      r.Tag,
		Includes: l,
		Excludes: r.Excludes,
		Blocks:   r.Blocks,
	}
}

func (r Rule) Exc(l ...string) Rule {
	return Rule{
		Tag:      r.Tag,
		Includes: r.Includes,
		Excludes: l,
		Blocks:   r.Blocks,
	}
}

func (r Rule) Block(l ...string) Rule {
	return Rule{
		Tag:      r.Tag,
		Includes: r.Includes,
		Excludes: r.Excludes,
		Blocks:   l,
	}
}

func (rule *Rule) normalize() {
	sanitize := newSanitizer()

	for i, token := range rule.Includes {
		rule.Includes[i] = sanitize(token)
	}
	for i, token := range rule.Excludes {
		rule.Excludes[i] = sanitize(token)
	}

	sortTokens(rule.Includes)
	sortTokens(rule.Excludes)

	rule.excludes = map[string]struct{}{}
	for _, s := range rule.Excludes {
		rule.excludes[s] = struct{}{}
	}

	rule.score = rule.computeScore()
}

func (r Rule) maxNGram() int {
	max := 0
	for _, s := range r.Includes {
		n := ngramLength(s)
		if n > max {
			max = n
		}
	}
	for _, s := range r.Excludes {
		n := ngramLength(s)
		if n > max {
			max = n
		}
	}

	return max
}

func (r Rule) isExcluded(tokens []string) bool {
	// This is most often the case.
	if len(r.excludes) == 0 {
		return false
	}

	for _, s := range tokens {
		if _, ok := r.excludes[s]; ok {
			return true
		}
	}
	return false
}

func (r Rule) computeScore() (score int) {
	for _, token := range r.Includes {
		n := ngramLength(token)
		score += n * (n + 1) / 2
	}
	return score
}

func ruleLess(lhs, rhs *Rule) bool {
	// If scores differ, sort by score.
	if lhs.score != rhs.score {
		return lhs.score < rhs.score
	}

	// If include depth differs, sort by depth.
	lDepth := len(lhs.Includes)
	rDepth := len(rhs.Includes)

	if lDepth != rDepth {
		return lDepth < rDepth
	}

	// If exclude depth differs, sort by depth.
	lDepth = len(lhs.Excludes)
	rDepth = len(rhs.Excludes)

	if lDepth != rDepth {
		return lDepth < rDepth
	}

	// Sort alphabetically by includes.
	for i := range lhs.Includes {
		if lhs.Includes[i] != rhs.Includes[i] {
			return lhs.Includes[i] < rhs.Includes[i]
		}
	}

	// Sort alphabetically by excludes.
	for i := range lhs.Excludes {
		if lhs.Excludes[i] != rhs.Excludes[i] {
			return lhs.Excludes[i] < rhs.Excludes[i]
		}
	}

	// Sort by tag.
	if lhs.Tag != rhs.Tag {
		return lhs.Tag < rhs.Tag
	}

	return false
}
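A worked example of computeScore, following the n·(n+1)/2 formula above: a
rule whose single include is the bigram "coca cola" scores 2·3/2 = 3, while a
rule with the two unigram includes "coca" and "cola" scores 1 + 1 = 2. Longer
phrase matches therefore outrank mere co-occurrence both in ruleLess ordering
and in Match's confidence weighting below.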
@@ -0,0 +1,58 @@
package tagengine

// A RuleGroup can be converted into a list of rules. Each rule will point to
// the same tag, and have the same exclude and block sets.
type RuleGroup struct {
	Tag      string
	Includes [][]string
	Excludes []string
	Blocks   []string
}

func NewRuleGroup(tag string) RuleGroup {
	return RuleGroup{
		Tag:      tag,
		Includes: [][]string{},
		Excludes: []string{},
		Blocks:   []string{},
	}
}

func (g RuleGroup) Inc(l ...string) RuleGroup {
	return RuleGroup{
		Tag:      g.Tag,
		Includes: append(g.Includes, l),
		Excludes: g.Excludes,
		Blocks:   g.Blocks,
	}
}

func (g RuleGroup) Exc(l ...string) RuleGroup {
	return RuleGroup{
		Tag:      g.Tag,
		Includes: g.Includes,
		Excludes: l,
		Blocks:   g.Blocks,
	}
}

func (g RuleGroup) Block(l ...string) RuleGroup {
	return RuleGroup{
		Tag:      g.Tag,
		Includes: g.Includes,
		Excludes: g.Excludes,
		Blocks:   l,
	}
}

func (rg RuleGroup) ToList() (l []Rule) {
	for _, includes := range rg.Includes {
		l = append(l, Rule{
			Tag:      rg.Tag,
			Excludes: rg.Excludes,
			Blocks:   rg.Blocks,
			Includes: includes,
		})
	}
	return
}
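A usage sketch for RuleGroup (the tag and tokens here are hypothetical): each
Inc call adds one alternative include set, and ToList expands the group into
one Rule per alternative.

	rules := NewRuleGroup("coke").
		Inc("coca cola").
		Inc("cocacola").
		Exc("pepsi").
		ToList()
	// len(rules) == 2; both carry Tag "coke" and Excludes ["pepsi"].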
@@ -0,0 +1,181 @@
package tagengine

import (
	"sort"
)

type RuleSet struct {
	root     *node
	maxNgram int
	sanitize func(...string) string
	rules    []*Rule
}

func NewRuleSet() *RuleSet {
	return &RuleSet{
		root: &node{
			Token:    "/",
			Children: map[string]*node{},
		},
		sanitize: newSanitizer(),
		rules:    []*Rule{},
	}
}

func NewRuleSetFromList(rules []Rule) *RuleSet {
	rs := NewRuleSet()
	rs.AddRule(rules...)
	return rs
}

func (t *RuleSet) Add(ruleOrGroup ...interface{}) {
	for _, ix := range ruleOrGroup {
		switch x := ix.(type) {
		case Rule:
			t.AddRule(x)
		case RuleGroup:
			t.AddRuleGroup(x)
		default:
			panic("Add expects either Rule or RuleGroup objects.")
		}
	}
}

func (t *RuleSet) AddRule(rules ...Rule) {
	for _, rule := range rules {
		rule := rule

		// Make sure rule is well-formed.
		rule.normalize()

		// Update maxNgram.
		N := rule.maxNGram()
		if N > t.maxNgram {
			t.maxNgram = N
		}

		t.rules = append(t.rules, &rule)
		t.root.AddRule(&rule)
	}
}

func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {
	for _, rg := range ruleGroups {
		t.AddRule(rg.ToList()...)
	}
}

// MatchRules will return a list of all matching rules. The rules are sorted by
// the match's "score". The best match will be first.
func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
	input = t.sanitize(input)
	tokens := Tokenize(input, t.maxNgram)

	rules = t.root.Match(tokens)
	if len(rules) == 0 {
		return rules
	}

	// Check excludes.
	l := rules[:0]
	for _, r := range rules {
		if !r.isExcluded(tokens) {
			l = append(l, r)
		}
	}

	rules = l

	// Sort rules descending.
	sort.Slice(rules, func(i, j int) bool {
		return ruleLess(rules[j], rules[i])
	})

	// Update rule stats.
	if len(rules) > 0 {
		rules[0].FirstCount++
		for _, r := range rules {
			r.MatchCount++
		}
	}

	return rules
}

type Match struct {
	Tag        string
	Confidence float64 // In the range (0,1].
}

// Match returns a list of matches with confidence.
func (t *RuleSet) Match(input string) []Match {
	rules := t.MatchRules(input)
	if len(rules) == 0 {
		return []Match{}
	}
	if len(rules) == 1 {
		return []Match{{
			Tag:        rules[0].Tag,
			Confidence: 1,
		}}
	}

	// Create list of blocked tags.
	blocks := map[string]struct{}{}
	for _, rule := range rules {
		for _, tag := range rule.Blocks {
			blocks[tag] = struct{}{}
		}
	}

	// Remove rules for blocked tags.
	iOut := 0
	for _, rule := range rules {
		if _, ok := blocks[rule.Tag]; ok {
			continue
		}
		rules[iOut] = rule
		iOut++
	}
	rules = rules[:iOut]

	// Matches by index.
	matches := map[string]int{}
	out := []Match{}
	sum := float64(0)

	for _, rule := range rules {
		idx, ok := matches[rule.Tag]
		if !ok {
			idx = len(matches)
			matches[rule.Tag] = idx
			out = append(out, Match{Tag: rule.Tag})
		}
		out[idx].Confidence += float64(rule.score)
		sum += float64(rule.score)
	}

	for i := range out {
		out[i].Confidence /= sum
	}

	return out
}

// ListRules returns rules used in the ruleset sorted by the rules'
// FirstCount. This is the number of times the given rule was the best match to
// an input.
func (t *RuleSet) ListRules() []*Rule {
	sort.Slice(t.rules, func(i, j int) bool {
		if t.rules[j].FirstCount != t.rules[i].FirstCount {
			return t.rules[j].FirstCount < t.rules[i].FirstCount
		}

		if t.rules[j].MatchCount != t.rules[i].MatchCount {
			return t.rules[j].MatchCount < t.rules[i].MatchCount
		}

		return t.rules[j].Tag < t.rules[i].Tag
	})
	return t.rules
}
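A usage sketch for the full pipeline (assumes newSanitizer, defined elsewhere
in this package, lower-cases input and normalizes punctuation, as the tests
below rely on; the tags here are hypothetical):

	rs := NewRuleSet()
	rs.Add(
		NewRule("soda").Inc("coca cola"),
		NewRule("diet").Inc("zero"),
	)
	matches := rs.Match("coca-cola zero")
	// Rule scores are 3 ("coca cola") and 1 ("zero"), so:
	// matches == []Match{{"soda", 0.75}, {"diet", 0.25}}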
@@ -0,0 +1,84 @@
package tagengine

import (
	"reflect"
	"testing"
)

func TestRulesSet(t *testing.T) {
	rs := NewRuleSet()
	rs.AddRule(Rule{
		Tag:      "cc/2",
		Includes: []string{"cola", "coca"},
	})
	rs.AddRule(Rule{
		Tag:      "cc/0",
		Includes: []string{"coca cola"},
	})
	rs.AddRule(Rule{
		Tag:      "cz/2",
		Includes: []string{"coca", "zero"},
	})
	rs.AddRule(Rule{
		Tag:      "cc0/3",
		Includes: []string{"zero", "coca", "cola"},
	})
	rs.AddRule(Rule{
		Tag:      "cc0/3.1",
		Includes: []string{"coca", "cola", "zero"},
		Excludes: []string{"pepsi"},
	})
	rs.AddRule(Rule{
		Tag:      "spa",
		Includes: []string{"spa"},
		Blocks:   []string{"cc/0", "cc0/3", "cc0/3.1"},
	})

	type TestCase struct {
		Input   string
		Matches []Match
	}

	cases := []TestCase{
		{
			Input: "coca-cola zero",
			Matches: []Match{
				{"cc0/3.1", 0.3},
				{"cc0/3", 0.3},
				{"cz/2", 0.2},
				{"cc/2", 0.2},
			},
		}, {
			Input: "coca cola",
			Matches: []Match{
				{"cc/0", 0.6},
				{"cc/2", 0.4},
			},
		}, {
			Input: "coca cola zero pepsi",
			Matches: []Match{
				{"cc0/3", 0.3},
				{"cc/0", 0.3},
				{"cz/2", 0.2},
				{"cc/2", 0.2},
			},
		}, {
			Input:   "fanta orange",
			Matches: []Match{},
		}, {
			Input: "coca-cola zero / fanta / spa",
			Matches: []Match{
				{"cz/2", 0.4},
				{"cc/2", 0.4},
				{"spa", 0.2},
			},
		},
	}

	for _, tc := range cases {
		matches := rs.Match(tc.Input)
		if !reflect.DeepEqual(matches, tc.Matches) {
			t.Fatalf("%v != %v", matches, tc.Matches)
		}
	}
}
@@ -0,0 +1,63 @@
package tagengine

import (
	"sort"
	"strings"
)

var ignoreTokens = map[string]struct{}{}

func init() {
	// These on their own are ignored.
	tokens := []string{
		"`", `~`, `!`, `@`, `#`, `%`, `^`, `&`, `*`, `(`, `)`,
		`-`, `_`, `+`, `=`, `[`, `{`, `]`, `}`, `\`, `|`,
		`:`, `;`, `"`, `'`, `,`, `<`, `.`, `>`, `?`, `/`,
	}
	for _, s := range tokens {
		ignoreTokens[s] = struct{}{}
	}
}

func Tokenize(
	input string,
	maxNgram int,
) (
	tokens []string,
) {
	// Track emitted ngrams to avoid duplicates.
	seen := map[string]bool{}

	fields := strings.Fields(input)

	if len(fields) < maxNgram {
		maxNgram = len(fields)
	}

	for i := 1; i < maxNgram+1; i++ {
		jMax := len(fields) - i + 1

		for j := 0; j < jMax; j++ {
			ngram := strings.Join(fields[j:i+j], " ")
			if _, ok := ignoreTokens[ngram]; !ok {
				if !seen[ngram] {
					tokens = append(tokens, ngram)
					seen[ngram] = true
				}
			}
		}
	}

	sortTokens(tokens)

	return tokens
}

func sortTokens(tokens []string) {
	sort.Slice(tokens, func(i, j int) bool {
		if len(tokens[i]) != len(tokens[j]) {
			return len(tokens[i]) < len(tokens[j])
		}
		return tokens[i] < tokens[j]
	})
}
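For example (a sketch; the output order follows sortTokens, shortest first,
then lexicographic):

	Tokenize("coca cola zero", 2)
	// → ["coca", "cola", "zero", "coca cola", "cola zero"]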
@@ -0,0 +1,55 @@
package tagengine

import (
	"reflect"
	"testing"
)

func TestTokenize(t *testing.T) {
	type Case struct {
		Input    string
		MaxNgram int
		Output   []string
	}

	cases := []Case{
		{
			Input:    "a bb c d",
			MaxNgram: 3,
			Output: []string{
				"a", "c", "d", "bb",
				"c d", "a bb", "bb c",
				"a bb c", "bb c d",
			},
		}, {
			Input:    "a b",
			MaxNgram: 3,
			Output: []string{
				"a", "b", "a b",
			},
		}, {
			Input:    "- b c d",
			MaxNgram: 3,
			Output: []string{
				"b", "c", "d",
				"- b", "b c", "c d",
				"- b c", "b c d",
			},
		}, {
			Input:    "a a b c d c d",
			MaxNgram: 3,
			Output: []string{
				"a", "b", "c", "d",
				"a a", "a b", "b c", "c d", "d c",
				"a a b", "a b c", "b c d", "c d c", "d c d",
			},
		},
	}

	for _, tc := range cases {
		output := Tokenize(tc.Input, tc.MaxNgram)
		if !reflect.DeepEqual(output, tc.Output) {
			t.Fatalf("%s: %#v", tc.Input, output)
		}
	}
}