wip
tagengine/README.md (new file)

# tagengine
tagengine/go.mod (new file)

module git.crumpington.com/lib/tagengine

go 1.23.2
tagengine/go.sum (new file, empty)
tagengine/ngram.go (new file)

package tagengine

import "unicode"

// ngramLength returns the number of whitespace-separated tokens in s. It
// scans byte by byte, so it assumes ASCII whitespace separators.
func ngramLength(s string) int {
	N := len(s)
	i := 0
	count := 0

	for {
		// Eat spaces.
		for i < N && unicode.IsSpace(rune(s[i])) {
			i++
		}

		// Done?
		if i == N {
			break
		}

		// Found the start of a token.
		count++

		// Eat non-spaces.
		for i < N && !unicode.IsSpace(rune(s[i])) {
			i++
		}
	}

	return count
}
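For illustration, a minimal sketch of ngramLength's behavior, written as a hypothetical example test (not part of this commit; assumes fmt is imported):

func ExampleNgramLength() {
	fmt.Println(ngramLength("coca cola zero"))
	fmt.Println(ngramLength(" \tx y\nz "))
	// Output:
	// 3
	// 3
}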
tagengine/ngram_test.go (new file)

package tagengine

import "testing"

func TestNGramLength(t *testing.T) {
	type Case struct {
		Input  string
		Length int
	}

	cases := []Case{
		{"a b c", 3},
		{" xyz\nlkj dflaj a", 4},
		{"a", 1},
		{" a", 1},
		{"a", 1},
		{" a\n", 1},
		{" a ", 1},
		{"\tx\ny\nz q ", 4},
	}

	for _, tc := range cases {
		length := ngramLength(tc.Input)
		if length != tc.Length {
			t.Fatalf("%s: %d != %d", tc.Input, length, tc.Length)
		}
	}
}
tagengine/node.go (new file)

package tagengine

import (
	"fmt"
	"strings"
)

type node struct {
	Token    string
	Matches  []*Rule // If a list of tokens reaches this node, it matches these.
	Children map[string]*node
}

func (n *node) AddRule(r *Rule) {
	n.addRule(r, 0)
}

func (n *node) addRule(r *Rule, idx int) {
	if len(r.Includes) == idx {
		n.Matches = append(n.Matches, r)
		return
	}

	token := r.Includes[idx]

	child, ok := n.Children[token]
	if !ok {
		child = &node{
			Token:    token,
			Children: map[string]*node{},
		}
		n.Children[token] = child
	}

	child.addRule(r, idx+1)
}

// Match returns all rules reachable from the given tokens. Note that tokens
// must be sorted. This is the case for tokens created by the Tokenize
// function.
func (n *node) Match(tokens []string) (rules []*Rule) {
	return n.match(tokens, rules)
}

func (n *node) match(tokens []string, rules []*Rule) []*Rule {
	// Check for a match.
	if n.Matches != nil {
		rules = append(rules, n.Matches...)
	}

	if len(tokens) == 0 {
		return rules
	}

	// Attempt to match children.
	for i := 0; i < len(tokens); i++ {
		token := tokens[i]
		if child, ok := n.Children[token]; ok {
			rules = child.match(tokens[i+1:], rules)
		}
	}

	return rules
}

func (n *node) Dump() {
	n.dump(0)
}

func (n *node) dump(depth int) {
	indent := strings.Repeat(" ", 2*depth)
	tag := ""
	for _, m := range n.Matches {
		tag += " " + m.Tag
	}
	fmt.Printf("%s%s%s\n", indent, n.Token, tag)
	for _, child := range n.Children {
		child.dump(depth + 1)
	}
}
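Rules are stored as a chain of their sorted include tokens, and match walks every subsequence of the incoming token list, so extra tokens between includes don't prevent a match. A hypothetical sketch (helper name invented; assumes fmt is imported):

func exampleNodeMatch() {
	root := &node{Token: "/", Children: map[string]*node{}}
	rule := &Rule{Tag: "cc", Includes: []string{"coca", "cola"}} // already sorted

	root.AddRule(rule)

	fmt.Println(len(root.Match([]string{"coca", "cola", "zero"}))) // 1: subsequence match
	fmt.Println(len(root.Match([]string{"cola"})))                 // 0: "coca" missing
}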
tagengine/rule.go (new file)

package tagengine

type Rule struct {
	// The purpose of a Rule is to attach its Tag to matching text.
	Tag string

	// Includes is a list of strings that must all be found in the input in
	// order to match.
	Includes []string

	// Excludes is a list of strings that can exclude a match for this rule.
	Excludes []string

	// Blocks: If this rule is matched, then it will block matches of any
	// tags listed here.
	Blocks []string

	// The Score encodes the complexity of the Rule. A higher score indicates
	// a more specific match. A Rule with more includes, or with multi-word
	// includes, should have a higher Score than a Rule with fewer or less
	// complex includes.
	Score int

	excludes map[string]struct{}
}

func NewRule(tag string) Rule {
	return Rule{Tag: tag}
}

func (r Rule) Inc(l ...string) Rule {
	return Rule{
		Tag:      r.Tag,
		Includes: append(r.Includes, l...),
		Excludes: r.Excludes,
		Blocks:   r.Blocks,
	}
}

func (r Rule) Exc(l ...string) Rule {
	return Rule{
		Tag:      r.Tag,
		Includes: r.Includes,
		Excludes: append(r.Excludes, l...),
		Blocks:   r.Blocks,
	}
}

func (r Rule) Block(l ...string) Rule {
	return Rule{
		Tag:      r.Tag,
		Includes: r.Includes,
		Excludes: r.Excludes,
		Blocks:   append(r.Blocks, l...),
	}
}

func (rule *Rule) normalize(sanitize func(string) string) {
	for i, token := range rule.Includes {
		rule.Includes[i] = sanitize(token)
	}
	for i, token := range rule.Excludes {
		rule.Excludes[i] = sanitize(token)
	}

	sortTokens(rule.Includes)
	sortTokens(rule.Excludes)

	rule.excludes = map[string]struct{}{}
	for _, s := range rule.Excludes {
		rule.excludes[s] = struct{}{}
	}

	rule.Score = rule.computeScore()
}

func (r Rule) maxNGram() int {
	max := 0
	for _, s := range r.Includes {
		n := ngramLength(s)
		if n > max {
			max = n
		}
	}
	for _, s := range r.Excludes {
		n := ngramLength(s)
		if n > max {
			max = n
		}
	}

	return max
}

func (r Rule) isExcluded(tokens []string) bool {
	// This is most often the case.
	if len(r.excludes) == 0 {
		return false
	}

	for _, s := range tokens {
		if _, ok := r.excludes[s]; ok {
			return true
		}
	}
	return false
}

func (r Rule) computeScore() (score int) {
	for _, token := range r.Includes {
		n := ngramLength(token)
		score += n * (n + 1) / 2
	}
	return score
}

func ruleLess(lhs, rhs *Rule) bool {
	// If scores differ, sort by score.
	if lhs.Score != rhs.Score {
		return lhs.Score < rhs.Score
	}

	// If include depth differs, sort by depth.
	lDepth := len(lhs.Includes)
	rDepth := len(rhs.Includes)

	if lDepth != rDepth {
		return lDepth < rDepth
	}

	// If exclude depth differs, sort by depth.
	lDepth = len(lhs.Excludes)
	rDepth = len(rhs.Excludes)

	if lDepth != rDepth {
		return lDepth < rDepth
	}

	// Sort alphabetically by includes.
	for i := range lhs.Includes {
		if lhs.Includes[i] != rhs.Includes[i] {
			return lhs.Includes[i] < rhs.Includes[i]
		}
	}

	// Sort alphabetically by excludes.
	for i := range lhs.Excludes {
		if lhs.Excludes[i] != rhs.Excludes[i] {
			return lhs.Excludes[i] < rhs.Excludes[i]
		}
	}

	// Sort by tag.
	if lhs.Tag != rhs.Tag {
		return lhs.Tag < rhs.Tag
	}

	return false
}
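computeScore weights an include of n tokens by the triangular number n(n+1)/2, so a single three-word include (score 6) outscores three separate one-word includes (score 3). A hypothetical sketch (helper name invented; assumes fmt is imported):

func exampleScore() {
	r := Rule{Tag: "cc0", Includes: []string{"coca cola zero", "spa"}}
	// "coca cola zero": 3 tokens -> 3*4/2 = 6; "spa": 1 token -> 1.
	fmt.Println(r.computeScore()) // 7
}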
tagengine/rulegroup.go (new file)

package tagengine

// A RuleGroup can be converted into a list of rules. Each rule will point to
// the same tag, and have the same exclude set and blocks.
type RuleGroup struct {
	Tag      string
	Includes [][]string
	Excludes []string
	Blocks   []string
}

func NewRuleGroup(tag string) RuleGroup {
	return RuleGroup{
		Tag:      tag,
		Includes: [][]string{},
		Excludes: []string{},
		Blocks:   []string{},
	}
}

func (g RuleGroup) Inc(l ...string) RuleGroup {
	return RuleGroup{
		Tag:      g.Tag,
		Includes: append(g.Includes, l),
		Excludes: g.Excludes,
		Blocks:   g.Blocks,
	}
}

func (g RuleGroup) Exc(l ...string) RuleGroup {
	return RuleGroup{
		Tag:      g.Tag,
		Includes: g.Includes,
		Excludes: append(g.Excludes, l...),
		Blocks:   g.Blocks,
	}
}

func (g RuleGroup) Block(l ...string) RuleGroup {
	return RuleGroup{
		Tag:      g.Tag,
		Includes: g.Includes,
		Excludes: g.Excludes,
		Blocks:   append(g.Blocks, l...),
	}
}

func (g RuleGroup) ToList() (l []Rule) {
	for _, includes := range g.Includes {
		l = append(l, Rule{
			Tag:      g.Tag,
			Excludes: g.Excludes,
			Includes: includes,
			Blocks:   g.Blocks,
		})
	}
	return
}
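ToList fans a group out into one Rule per Inc call, each sharing the group's tag, excludes, and blocks. A hypothetical sketch (helper name invented; assumes fmt is imported):

func exampleRuleGroup() {
	g := NewRuleGroup("cola").
		Inc("coca cola").
		Inc("pepsi").
		Exc("shirt")

	// Two rules, one per Inc call, each with Tag "cola" and the shared
	// exclude "shirt".
	for _, r := range g.ToList() {
		fmt.Println(r.Tag, r.Includes, r.Excludes)
	}
}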
tagengine/ruleset.go (new file)

package tagengine

import (
	"sort"
)

type RuleSet struct {
	root     *node
	maxNgram int
	sanitize func(string) string
	rules    []*Rule
}

func NewRuleSet() *RuleSet {
	return &RuleSet{
		root: &node{
			Token:    "/",
			Children: map[string]*node{},
		},
		sanitize: BasicSanitizer,
		rules:    []*Rule{},
	}
}

func NewRuleSetFromList(rules []Rule) *RuleSet {
	rs := NewRuleSet()
	rs.AddRule(rules...)
	return rs
}

// Add accepts any mix of Rule and RuleGroup values.
func (t *RuleSet) Add(ruleOrGroup ...interface{}) {
	for _, ix := range ruleOrGroup {
		switch x := ix.(type) {
		case Rule:
			t.AddRule(x)
		case RuleGroup:
			t.AddRuleGroup(x)
		default:
			panic("Add expects either Rule or RuleGroup objects.")
		}
	}
}

func (t *RuleSet) AddRule(rules ...Rule) {
	for _, rule := range rules {
		rule := rule // Copy: we store a pointer to the loop variable.

		// Make sure rule is well-formed.
		rule.normalize(t.sanitize)

		// Update maxNgram.
		N := rule.maxNGram()
		if N > t.maxNgram {
			t.maxNgram = N
		}

		t.rules = append(t.rules, &rule)
		t.root.AddRule(&rule)
	}
}

func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {
	for _, rg := range ruleGroups {
		t.AddRule(rg.ToList()...)
	}
}

// MatchRules returns a list of all matching rules, sorted by the match's
// Score. The best match is first.
func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
	input = t.sanitize(input)
	tokens := Tokenize(input, t.maxNgram)

	rules = t.root.Match(tokens)
	if len(rules) == 0 {
		return rules
	}

	// Check excludes.
	l := rules[:0]
	for _, r := range rules {
		if !r.isExcluded(tokens) {
			l = append(l, r)
		}
	}

	rules = l

	// Sort rules descending.
	sort.Slice(rules, func(i, j int) bool {
		return ruleLess(rules[j], rules[i])
	})

	return rules
}

type Match struct {
	Tag string

	// Confidence is used to sort all matches, and is normalized so the sum
	// of Confidence values for all matches is 1. Confidence is relative to
	// the number of matches and the size of matches in terms of number of
	// tokens.
	Confidence float64 // In the range (0,1].
}

// Match returns a list of matches with confidence. This is useful if you'd
// like to find the best matching rule out of all the matched rules.
//
// If you just want to find all matching rules, then use MatchRules.
func (t *RuleSet) Match(input string) []Match {
	rules := t.MatchRules(input)
	if len(rules) == 0 {
		return []Match{}
	}
	if len(rules) == 1 {
		return []Match{{
			Tag:        rules[0].Tag,
			Confidence: 1,
		}}
	}

	// Create list of blocked tags.
	blocks := map[string]struct{}{}
	for _, rule := range rules {
		for _, tag := range rule.Blocks {
			blocks[tag] = struct{}{}
		}
	}

	// Remove rules for blocked tags.
	iOut := 0
	for _, rule := range rules {
		if _, ok := blocks[rule.Tag]; ok {
			continue
		}
		rules[iOut] = rule
		iOut++
	}
	rules = rules[:iOut]

	// Matches by index.
	matches := map[string]int{}
	out := []Match{}
	sum := float64(0)

	for _, rule := range rules {
		idx, ok := matches[rule.Tag]
		if !ok {
			idx = len(matches)
			matches[rule.Tag] = idx
			out = append(out, Match{Tag: rule.Tag})
		}
		out[idx].Confidence += float64(rule.Score)
		sum += float64(rule.Score)
	}

	for i := range out {
		out[i].Confidence /= sum
	}

	return out
}
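Putting it together, a hypothetical end-to-end sketch (helper name invented; assumes fmt is imported). Per the normalization above, the two rules that match here split the confidence evenly:

func exampleRuleSet() {
	rs := NewRuleSet()
	rs.Add(
		NewRule("soda/coke").Inc("coca cola"),
		NewRuleGroup("soda").Inc("coca cola").Inc("pepsi"),
	)

	for _, m := range rs.Match("Coca Cola 0.5L") {
		fmt.Printf("%s %.2f\n", m.Tag, m.Confidence)
	}
	// soda/coke 0.50
	// soda 0.50
}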
tagengine/ruleset_test.go (new file)

package tagengine

import (
	"reflect"
	"testing"
)

func TestRulesSet(t *testing.T) {
	rs := NewRuleSet()
	rs.AddRule(Rule{
		Tag:      "cc/2",
		Includes: []string{"cola", "coca"},
	})
	rs.AddRule(Rule{
		Tag:      "cc/0",
		Includes: []string{"coca cola"},
	})
	rs.AddRule(Rule{
		Tag:      "cz/2",
		Includes: []string{"coca", "zero"},
	})
	rs.AddRule(Rule{
		Tag:      "cc0/3",
		Includes: []string{"zero", "coca", "cola"},
	})
	rs.AddRule(Rule{
		Tag:      "cc0/3.1",
		Includes: []string{"coca", "cola", "zero"},
		Excludes: []string{"pepsi"},
	})
	rs.AddRule(Rule{
		Tag:      "spa",
		Includes: []string{"spa"},
		Blocks:   []string{"cc/0", "cc0/3", "cc0/3.1"},
	})

	type TestCase struct {
		Input   string
		Matches []Match
	}

	cases := []TestCase{
		{
			Input: "coca-cola zero",
			Matches: []Match{
				{"cc0/3.1", 0.3},
				{"cc0/3", 0.3},
				{"cz/2", 0.2},
				{"cc/2", 0.2},
			},
		}, {
			Input: "coca cola",
			Matches: []Match{
				{"cc/0", 0.6},
				{"cc/2", 0.4},
			},
		}, {
			Input: "coca cola zero pepsi",
			Matches: []Match{
				{"cc0/3", 0.3},
				{"cc/0", 0.3},
				{"cz/2", 0.2},
				{"cc/2", 0.2},
			},
		}, {
			Input:   "fanta orange",
			Matches: []Match{},
		}, {
			Input: "coca-cola zero / fanta / spa",
			Matches: []Match{
				{"cz/2", 0.4},
				{"cc/2", 0.4},
				{"spa", 0.2},
			},
		},
	}

	for _, tc := range cases {
		matches := rs.Match(tc.Input)
		if !reflect.DeepEqual(matches, tc.Matches) {
			t.Fatalf("%v != %v", matches, tc.Matches)
		}
	}
}
tagengine/sanitize.go (new file)

package tagengine

import (
	"strings"

	"git.crumpington.com/lib/tagengine/sanitize"
)

// BasicSanitizer is the basic sanitizer:
//   - lower-case
//   - put spaces around numbers
//   - put spaces around punctuation
//   - collapse multiple spaces
func BasicSanitizer(s string) string {
	s = strings.ToLower(s)
	s = sanitize.SpaceNumbers(s)
	s = sanitize.SpacePunctuation(s)
	s = sanitize.CollapseSpaces(s)
	return s
}
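For a concrete trace of the pipeline, consistent with the cases in sanitize_test.go (helper name invented; assumes fmt is imported):

func exampleSanitize() {
	// lower-case -> space the digit run -> space the '-' -> collapse spaces
	fmt.Println(BasicSanitizer("Coca-Cola 330ml"))
	// coca - cola 330 ml
}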
tagengine/sanitize/sanitize.go (new file)

package sanitize

import (
	"strings"
	"unicode"
	"unicode/utf8"
)

// SpaceNumbers inserts a space at each boundary between a run of digits and
// a run of non-digits.
func SpaceNumbers(s string) string {
	if len(s) == 0 {
		return s
	}

	isDigit := func(r rune) bool {
		return r >= '0' && r <= '9'
	}

	b := strings.Builder{}

	first, _ := utf8.DecodeRuneInString(s)
	digit := isDigit(first)

	// Range over runes.
	for _, c := range s {
		thisDigit := isDigit(c)
		if thisDigit != digit {
			b.WriteByte(' ')
			digit = thisDigit
		}
		b.WriteRune(c)
	}

	return b.String()
}

// SpacePunctuation puts spaces around ASCII punctuation characters.
func SpacePunctuation(s string) string {
	needsSpace := func(r rune) bool {
		switch r {
		case '`', '~', '!', '@', '#', '%', '^', '&', '*', '(', ')',
			'-', '_', '+', '=', '[', '{', ']', '}', '\\', '|',
			':', ';', '"', '\'', ',', '<', '.', '>', '?', '/':
			return true
		}
		return false
	}

	b := strings.Builder{}

	// Range over runes.
	for _, r := range s {
		if needsSpace(r) {
			b.WriteRune(' ')
			b.WriteRune(r)
			b.WriteRune(' ')
		} else {
			b.WriteRune(r)
		}
	}

	return b.String()
}

// CollapseSpaces trims the string and collapses each run of whitespace into
// a single space.
func CollapseSpaces(s string) string {
	// Trim leading and trailing spaces.
	s = strings.TrimSpace(s)

	b := strings.Builder{}
	wasSpace := false

	// Range over runes.
	for _, c := range s {
		if unicode.IsSpace(c) {
			wasSpace = true
			continue
		} else if wasSpace {
			wasSpace = false
			b.WriteRune(' ')
		}
		b.WriteRune(c)
	}

	return b.String()
}
tagengine/sanitize_test.go (new file)

package tagengine

import "testing"

func TestSanitize(t *testing.T) {
	sanitize := BasicSanitizer

	type Case struct {
		In  string
		Out string
	}

	cases := []Case{
		{"", ""},
		{"123abc", "123 abc"},
		{"abc123", "abc 123"},
		{"abc123xyz", "abc 123 xyz"},
		{"1f2", "1 f 2"},
		{" abc", "abc"},
		{" ; KitKat/m&m's (bottle) @ ", "; kitkat / m & m ' s ( bottle ) @"},
		{"€", "€"},
	}

	for _, tc := range cases {
		out := sanitize(tc.In)
		if out != tc.Out {
			t.Fatalf("%v != %v", out, tc.Out)
		}
	}
}
tagengine/tokenize.go (new file)

package tagengine

import (
	"sort"
	"strings"
)

var ignoreTokens = map[string]struct{}{}

func init() {
	// These on their own are ignored.
	tokens := []string{
		"`", `~`, `!`, `@`, `#`, `%`, `^`, `&`, `*`, `(`, `)`,
		`-`, `_`, `+`, `=`, `[`, `{`, `]`, `}`, `\`, `|`,
		`:`, `;`, `"`, `'`, `,`, `<`, `.`, `>`, `?`, `/`,
	}
	for _, s := range tokens {
		ignoreTokens[s] = struct{}{}
	}
}

// Tokenize splits input into fields and returns all 1- to maxNgram-grams,
// de-duplicated and sorted (shortest first).
func Tokenize(
	input string,
	maxNgram int,
) (
	tokens []string,
) {
	// Avoid duplicate ngrams.
	seen := map[string]bool{}

	fields := strings.Fields(input)

	if len(fields) < maxNgram {
		maxNgram = len(fields)
	}

	for i := 1; i <= maxNgram; i++ {
		jMax := len(fields) - i + 1

		for j := 0; j < jMax; j++ {
			ngram := strings.Join(fields[j:j+i], " ")
			if _, ok := ignoreTokens[ngram]; !ok {
				if !seen[ngram] {
					tokens = append(tokens, ngram)
					seen[ngram] = true
				}
			}
		}
	}

	sortTokens(tokens)

	return tokens
}

// sortTokens orders tokens by length, then lexicographically.
func sortTokens(tokens []string) {
	sort.Slice(tokens, func(i, j int) bool {
		if len(tokens[i]) != len(tokens[j]) {
			return len(tokens[i]) < len(tokens[j])
		}
		return tokens[i] < tokens[j]
	})
}
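A hypothetical sketch of the output shape: every 1- and 2-gram, shortest first, then lexicographic (helper name invented; assumes fmt is imported):

func exampleTokenize() {
	fmt.Printf("%q\n", Tokenize("coca cola zero", 2))
	// ["coca" "cola" "zero" "coca cola" "cola zero"]
}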
tagengine/tokenize_test.go (new file)

package tagengine

import (
	"reflect"
	"testing"
)

func TestTokenize(t *testing.T) {
	type Case struct {
		Input    string
		MaxNgram int
		Output   []string
	}

	cases := []Case{
		{
			Input:    "a bb c d",
			MaxNgram: 3,
			Output: []string{
				"a", "c", "d", "bb",
				"c d", "a bb", "bb c",
				"a bb c", "bb c d",
			},
		}, {
			Input:    "a b",
			MaxNgram: 3,
			Output: []string{
				"a", "b", "a b",
			},
		}, {
			Input:    "- b c d",
			MaxNgram: 3,
			Output: []string{
				"b", "c", "d",
				"- b", "b c", "c d",
				"- b c", "b c d",
			},
		}, {
			Input:    "a a b c d c d",
			MaxNgram: 3,
			Output: []string{
				"a", "b", "c", "d",
				"a a", "a b", "b c", "c d", "d c",
				"a a b", "a b c", "b c d", "c d c", "d c d",
			},
		},
	}

	for _, tc := range cases {
		output := Tokenize(tc.Input, tc.MaxNgram)
		if !reflect.DeepEqual(output, tc.Output) {
			t.Fatalf("%s: %#v", tc.Input, output)
		}
	}
}