This commit is contained in:
jdl
2024-11-11 06:36:55 +01:00
parent d0587cc585
commit c5419d662e
102 changed files with 4181 additions and 0 deletions

2
tagengine/README.md Normal file
View File

@@ -0,0 +1,2 @@
# tagengine

3
tagengine/go.mod Normal file
View File

@@ -0,0 +1,3 @@
module git.crumpington.com/lib/tagengine
go 1.23.2

0
tagengine/go.sum Normal file
View File

30
tagengine/ngram.go Normal file
View File

@@ -0,0 +1,30 @@
package tagengine
import "unicode"
// ngramLength returns the number of whitespace-separated words in s.
//
// It iterates over runes rather than bytes. The previous byte-wise scan
// passed individual UTF-8 bytes to unicode.IsSpace; a continuation byte
// such as 0xA0 (the second byte of "à" = 0xC3 0xA0) reads as U+00A0
// (non-breaking space) and falsely split a single word in two.
func ngramLength(s string) int {
	count := 0
	inWord := false
	for _, r := range s {
		if unicode.IsSpace(r) {
			// Whitespace ends the current word (if any).
			inWord = false
		} else if !inWord {
			// First non-space rune of a new word.
			inWord = true
			count++
		}
	}
	return count
}

31
tagengine/ngram_test.go Normal file
View File

@@ -0,0 +1,31 @@
package tagengine

import "testing"

// TestNGramLength checks ngramLength against inputs with leading,
// trailing, and interior whitespace of various kinds.
func TestNGramLength(t *testing.T) {
	type Case struct {
		Input  string
		Length int
	}

	cases := []Case{
		{"a b c", 3},
		{" xyz\nlkj dflaj a", 4},
		{"a", 1},
		{" a", 1},
		{"a", 1},
		{" a\n", 1},
		{" a ", 1},
		{"\tx\ny\nz q ", 4},
	}

	for _, tc := range cases {
		// Use t.Fatalf, not log.Fatalf: log.Fatalf calls os.Exit and
		// kills the whole test binary without reporting a test failure.
		if length := ngramLength(tc.Input); length != tc.Length {
			t.Fatalf("%s: %d != %d", tc.Input, length, tc.Length)
		}
	}
}

79
tagengine/node.go Normal file
View File

@@ -0,0 +1,79 @@
package tagengine
import (
"fmt"
"strings"
)
// node is a vertex in the rule trie. The path of tokens from the root to
// a node spells out the (sorted) include set of the rules stored on it.
type node struct {
	Token   string
	Matches []*Rule // If a list of tokens reaches this node, it matches these.
	Children map[string]*node // Next include token -> child node.
}
// AddRule inserts r into the trie keyed by its (already sorted) Includes.
func (n *node) AddRule(r *Rule) {
	n.addRule(r, 0)
}

// addRule walks (and lazily builds) the token path r.Includes[idx:],
// attaching r to the node reached by the final token.
func (n *node) addRule(r *Rule, idx int) {
	cur := n
	for _, token := range r.Includes[idx:] {
		next, found := cur.Children[token]
		if !found {
			next = &node{
				Token:    token,
				Children: map[string]*node{},
			}
			cur.Children[token] = next
		}
		cur = next
	}
	cur.Matches = append(cur.Matches, r)
}
// Note that tokens must be sorted. This is the case for tokens created from
// the tokenize function.
func (n *node) Match(tokens []string) (rules []*Rule) {
	return n.match(tokens, rules)
}

// match accumulates every rule reachable by descending the trie along a
// subsequence of the remaining tokens.
func (n *node) match(tokens []string, rules []*Rule) []*Rule {
	// Rules stored on this node have had their full include set matched.
	rules = append(rules, n.Matches...)

	// Try each remaining token as the next step down the trie.
	for i, token := range tokens {
		if child, found := n.Children[token]; found {
			rules = child.match(tokens[i+1:], rules)
		}
	}
	return rules
}
// Dump prints the trie to stdout for debugging.
func (n *node) Dump() {
	n.dump(0)
}

// dump prints this node at the given depth, followed by its children one
// level deeper. Tags of rules matched at this node follow the token.
func (n *node) dump(depth int) {
	tags := strings.Builder{}
	for _, m := range n.Matches {
		tags.WriteByte(' ')
		tags.WriteString(m.Tag)
	}
	fmt.Printf("%s%s%s\n", strings.Repeat("  ", depth), n.Token, tags.String())
	for _, child := range n.Children {
		child.dump(depth + 1)
	}
}

159
tagengine/rule.go Normal file
View File

@@ -0,0 +1,159 @@
package tagengine
type Rule struct {
	// The purpose of a Rule is to attach its Tag to matching text.
	Tag string

	// Includes is a list of strings that must all be found in the input in
	// order to match.
	Includes []string

	// Excludes is a list of strings that can exclude a match for this rule.
	Excludes []string

	// Blocks: If this rule is matched, then it will block matches of any tags
	// listed here.
	Blocks []string

	// The Score encodes the complexity of the Rule. A higher score indicates a
	// more specific match. A Rule with more includes, or includes with
	// multiple words, should have a higher Score than a Rule with fewer or
	// less complex includes.
	Score int

	// excludes is the Excludes list as a set, built by normalize, for O(1)
	// membership tests in isExcluded.
	excludes map[string]struct{}
}
// NewRule returns a Rule with the given tag and no includes, excludes,
// or blocks.
func NewRule(tag string) Rule {
	return Rule{Tag: tag}
}

// Inc returns a copy of the rule with the given include strings appended.
// Derived fields (Score, excludes) are reset; normalize rebuilds them.
func (r Rule) Inc(l ...string) Rule {
	r.Includes = append(r.Includes, l...)
	r.Score = 0
	r.excludes = nil
	return r
}

// Exc returns a copy of the rule with the given exclude strings appended.
func (r Rule) Exc(l ...string) Rule {
	r.Excludes = append(r.Excludes, l...)
	r.Score = 0
	r.excludes = nil
	return r
}

// Block returns a copy of the rule with the given blocked tags appended.
func (r Rule) Block(l ...string) Rule {
	r.Blocks = append(r.Blocks, l...)
	r.Score = 0
	r.excludes = nil
	return r
}
// normalize prepares the rule for matching: it sanitizes and sorts the
// include/exclude phrases, builds the exclude set, and computes the Score.
func (rule *Rule) normalize(sanitize func(string) string) {
	for i := range rule.Includes {
		rule.Includes[i] = sanitize(rule.Includes[i])
	}
	for i := range rule.Excludes {
		rule.Excludes[i] = sanitize(rule.Excludes[i])
	}

	sortTokens(rule.Includes)
	sortTokens(rule.Excludes)

	excl := make(map[string]struct{}, len(rule.Excludes))
	for _, token := range rule.Excludes {
		excl[token] = struct{}{}
	}
	rule.excludes = excl

	rule.Score = rule.computeScore()
}
// maxNGram reports the largest word count found among all include and
// exclude phrases of the rule.
func (r Rule) maxNGram() int {
	longest := 0
	scan := func(phrases []string) {
		for _, p := range phrases {
			if n := ngramLength(p); n > longest {
				longest = n
			}
		}
	}
	scan(r.Includes)
	scan(r.Excludes)
	return longest
}
// isExcluded reports whether any of the given tokens appears in the
// rule's exclude set.
func (r Rule) isExcluded(tokens []string) bool {
	// Most rules have no excludes; skip the scan entirely.
	if len(r.excludes) == 0 {
		return false
	}
	for _, token := range tokens {
		if _, found := r.excludes[token]; found {
			return true
		}
	}
	return false
}
// computeScore sums a triangular weight n*(n+1)/2 over each include
// phrase, where n is the phrase's word count, so longer phrases and more
// phrases both raise the score.
func (r Rule) computeScore() (score int) {
	for _, phrase := range r.Includes {
		words := ngramLength(phrase)
		score += words * (words + 1) / 2
	}
	return score
}
// ruleLess defines a total order on rules: by Score, then include count,
// then exclude count, then includes lexicographically, then excludes,
// and finally by Tag.
func ruleLess(lhs, rhs *Rule) bool {
	if lhs.Score != rhs.Score {
		return lhs.Score < rhs.Score
	}
	if a, b := len(lhs.Includes), len(rhs.Includes); a != b {
		return a < b
	}
	if a, b := len(lhs.Excludes), len(rhs.Excludes); a != b {
		return a < b
	}
	// Equal lengths here, so index-wise comparison is safe.
	for i, s := range lhs.Includes {
		if s != rhs.Includes[i] {
			return s < rhs.Includes[i]
		}
	}
	for i, s := range lhs.Excludes {
		if s != rhs.Excludes[i] {
			return s < rhs.Excludes[i]
		}
	}
	// Tie-break on the tag; equal tags compare as not-less.
	return lhs.Tag < rhs.Tag
}

58
tagengine/rulegroup.go Normal file
View File

@@ -0,0 +1,58 @@
package tagengine
// A RuleGroup can be converted into a list of rules. Each rule will point to
// the same tag, and have the same exclude set and blocks.
type RuleGroup struct {
	Tag      string
	Includes [][]string // Each inner slice becomes one Rule's include set.
	Excludes []string
	Blocks   []string
}
// NewRuleGroup returns an empty RuleGroup for the given tag.
func NewRuleGroup(tag string) RuleGroup {
	return RuleGroup{
		Tag:      tag,
		Includes: [][]string{},
		Excludes: []string{},
		Blocks:   []string{},
	}
}

// Inc adds one include set: all strings in l must match together.
func (g RuleGroup) Inc(l ...string) RuleGroup {
	g.Includes = append(g.Includes, l)
	return g
}

// Exc appends exclude strings shared by every rule in the group.
func (g RuleGroup) Exc(l ...string) RuleGroup {
	g.Excludes = append(g.Excludes, l...)
	return g
}

// Block appends blocked tags shared by every rule in the group.
func (g RuleGroup) Block(l ...string) RuleGroup {
	g.Blocks = append(g.Blocks, l...)
	return g
}

// ToList expands the group into one Rule per include set.
func (g RuleGroup) ToList() []Rule {
	var rules []Rule
	for _, includes := range g.Includes {
		rules = append(rules, Rule{
			Tag:      g.Tag,
			Includes: includes,
			Excludes: g.Excludes,
			Blocks:   g.Blocks,
		})
	}
	return rules
}

162
tagengine/ruleset.go Normal file
View File

@@ -0,0 +1,162 @@
package tagengine
import (
"sort"
)
// RuleSet is a collection of rules compiled into a token trie for
// matching against sanitized input text.
type RuleSet struct {
	root     *node // Root of the include-token trie.
	maxNgram int   // Largest phrase length (in words) over all rules.
	sanitize func(string) string // Normalizer applied to rule tokens and input.
	rules    []*Rule // All added rules, in insertion order.
}
// NewRuleSet returns an empty RuleSet that sanitizes with BasicSanitizer.
func NewRuleSet() *RuleSet {
	root := &node{
		Token:    "/",
		Children: map[string]*node{},
	}
	return &RuleSet{
		root:     root,
		sanitize: BasicSanitizer,
		rules:    []*Rule{},
	}
}

// NewRuleSetFromList returns a RuleSet pre-populated with the given rules.
func NewRuleSetFromList(rules []Rule) *RuleSet {
	set := NewRuleSet()
	set.AddRule(rules...)
	return set
}
// Add accepts any mix of Rule and RuleGroup values and adds each to the
// set. It panics on any other type.
func (t *RuleSet) Add(ruleOrGroup ...interface{}) {
	for _, item := range ruleOrGroup {
		switch v := item.(type) {
		case Rule:
			t.AddRule(v)
		case RuleGroup:
			t.AddRuleGroup(v)
		default:
			panic("Add expects either Rule or RuleGroup objects.")
		}
	}
}
// AddRule normalizes each rule (sanitize, sort, build exclude set, score)
// and inserts it into both the rule list and the match trie.
func (t *RuleSet) AddRule(rules ...Rule) {
	for _, rule := range rules {
		// Since Go 1.22 (go.mod targets 1.23), loop variables are
		// per-iteration, so taking &rule below yields a distinct pointer
		// each time; the old `rule := rule` shadow was redundant.

		// Make sure rule is well-formed.
		rule.normalize(t.sanitize)

		// Track the longest phrase so Tokenize generates ngrams big
		// enough to reach every rule.
		if n := rule.maxNGram(); n > t.maxNgram {
			t.maxNgram = n
		}

		t.rules = append(t.rules, &rule)
		t.root.AddRule(&rule)
	}
}
// AddRuleGroup expands each group into its constituent rules and adds them.
func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {
	for _, group := range ruleGroups {
		rules := group.ToList()
		t.AddRule(rules...)
	}
}
// MatchRules will return a list of all matching rules. The rules are sorted by
// the match's Score. The best match will be first.
func (t *RuleSet) MatchRules(input string) []*Rule {
	tokens := Tokenize(t.sanitize(input), t.maxNgram)

	matched := t.root.Match(tokens)
	if len(matched) == 0 {
		return matched
	}

	// Drop rules whose exclude set hits one of the tokens; filter in
	// place, reusing the backing array.
	kept := matched[:0]
	for _, rule := range matched {
		if !rule.isExcluded(tokens) {
			kept = append(kept, rule)
		}
	}

	// Sort descending: highest score (best match) first.
	sort.Slice(kept, func(a, b int) bool {
		return ruleLess(kept[b], kept[a])
	})
	return kept
}
// Match pairs a tag with the relative confidence of its match.
type Match struct {
	Tag string

	// Confidence is used to sort all matches, and is normalized so the sum of
	// Confidence values for all matches is 1. Confidence is relative to the
	// number of matches and the size of matches in terms of number of tokens.
	Confidence float64 // In the range (0,1].
}
// Return a list of matches with confidence. This is useful if you'd like to
// find the best matching rule out of all the matched rules.
//
// If you just want to find all matching rules, then use MatchRules.
func (t *RuleSet) Match(input string) []Match {
	rules := t.MatchRules(input)

	// No matches: return an empty (non-nil) slice.
	if len(rules) == 0 {
		return []Match{}
	}

	// A single matching rule trivially gets full confidence.
	if len(rules) == 1 {
		return []Match{{
			Tag:        rules[0].Tag,
			Confidence: 1,
		}}
	}

	// Create list of blocked tags.
	// NOTE(review): blocks are collected from ALL matched rules, including
	// rules that are themselves about to be removed as blocked — blocking
	// is not applied transitively here.
	blocks := map[string]struct{}{}
	for _, rule := range rules {
		for _, tag := range rule.Blocks {
			blocks[tag] = struct{}{}
		}
	}

	// Remove rules for blocked tags (in-place filter preserving order).
	iOut := 0
	for _, rule := range rules {
		if _, ok := blocks[rule.Tag]; ok {
			continue
		}
		rules[iOut] = rule
		iOut++
	}
	rules = rules[:iOut]

	// Matches by index: accumulate each tag's total Score into one Match,
	// preserving first-appearance order (rules arrive best-first from
	// MatchRules).
	matches := map[string]int{}
	out := []Match{}
	sum := float64(0)
	for _, rule := range rules {
		idx, ok := matches[rule.Tag]
		if !ok {
			idx = len(matches)
			matches[rule.Tag] = idx
			out = append(out, Match{Tag: rule.Tag})
		}
		out[idx].Confidence += float64(rule.Score)
		sum += float64(rule.Score)
	}

	// Normalize so all confidences sum to 1.
	for i := range out {
		out[i].Confidence /= sum
	}

	return out
}

84
tagengine/ruleset_test.go Normal file
View File

@@ -0,0 +1,84 @@
package tagengine
import (
"reflect"
"testing"
)
// TestRulesSet exercises Match end-to-end: score-based ranking,
// exclude handling ("pepsi"), and tag blocking ("spa" blocks the
// coca-cola rules). Expected Confidence values are exact because each
// case's scores divide evenly, so reflect.DeepEqual is safe here.
func TestRulesSet(t *testing.T) {
	rs := NewRuleSet()

	rs.AddRule(Rule{
		Tag:      "cc/2",
		Includes: []string{"cola", "coca"},
	})

	rs.AddRule(Rule{
		Tag:      "cc/0",
		Includes: []string{"coca cola"},
	})

	rs.AddRule(Rule{
		Tag:      "cz/2",
		Includes: []string{"coca", "zero"},
	})

	rs.AddRule(Rule{
		Tag:      "cc0/3",
		Includes: []string{"zero", "coca", "cola"},
	})

	rs.AddRule(Rule{
		Tag:      "cc0/3.1",
		Includes: []string{"coca", "cola", "zero"},
		Excludes: []string{"pepsi"},
	})

	rs.AddRule(Rule{
		Tag:      "spa",
		Includes: []string{"spa"},
		Blocks:   []string{"cc/0", "cc0/3", "cc0/3.1"},
	})

	type TestCase struct {
		Input   string
		Matches []Match
	}

	cases := []TestCase{
		{
			Input: "coca-cola zero",
			Matches: []Match{
				{"cc0/3.1", 0.3},
				{"cc0/3", 0.3},
				{"cz/2", 0.2},
				{"cc/2", 0.2},
			},
		}, {
			Input: "coca cola",
			Matches: []Match{
				{"cc/0", 0.6},
				{"cc/2", 0.4},
			},
		}, {
			// "pepsi" excludes cc0/3.1 but not cc0/3.
			Input: "coca cola zero pepsi",
			Matches: []Match{
				{"cc0/3", 0.3},
				{"cc/0", 0.3},
				{"cz/2", 0.2},
				{"cc/2", 0.2},
			},
		}, {
			Input:   "fanta orange",
			Matches: []Match{},
		}, {
			// "spa" blocks cc/0, cc0/3 and cc0/3.1.
			Input: "coca-cola zero / fanta / spa",
			Matches: []Match{
				{"cz/2", 0.4},
				{"cc/2", 0.4},
				{"spa", 0.2},
			},
		},
	}

	for _, tc := range cases {
		matches := rs.Match(tc.Input)
		if !reflect.DeepEqual(matches, tc.Matches) {
			t.Fatalf("%v != %v", matches, tc.Matches)
		}
	}
}

20
tagengine/sanitize.go Normal file
View File

@@ -0,0 +1,20 @@
package tagengine
import (
"strings"
"git.crumpington.com/lib/tagengine/sanitize"
)
// The basic sanitizer:
//   - lower-case
//   - put spaces around numbers
//   - put spaces around punctuation
//   - collapse multiple spaces
func BasicSanitizer(s string) string {
	s = strings.ToLower(s)
	s = sanitize.SpaceNumbers(s)
	s = sanitize.SpacePunctuation(s)
	s = sanitize.CollapseSpaces(s)
	return s
}

View File

@@ -0,0 +1,91 @@
package sanitize
import (
"strings"
"unicode"
)
// SpaceNumbers inserts a single space at every boundary between a run of
// ASCII digits and a run of non-digits, so "abc123" becomes "abc 123".
func SpaceNumbers(s string) string {
	if s == "" {
		return s
	}

	isDigit := func(r rune) bool { return r >= '0' && r <= '9' }

	var b strings.Builder
	prevDigit := false
	for i, r := range s {
		d := isDigit(r)
		if i == 0 {
			// The first rune establishes the initial class; no leading space.
			prevDigit = d
		} else if d != prevDigit {
			b.WriteByte(' ')
			prevDigit = d
		}
		b.WriteRune(r)
	}
	return b.String()
}
// SpacePunctuation surrounds each ASCII punctuation rune with spaces so
// punctuation tokenizes separately from adjacent words.
func SpacePunctuation(s string) string {
	// Exactly the punctuation set the tokenizer treats as standalone tokens.
	const punct = "`~!@#%^&*()-_+=[{]}\\|:;\"',<.>?/"

	var b strings.Builder
	for _, r := range s {
		if strings.ContainsRune(punct, r) {
			b.WriteByte(' ')
			b.WriteRune(r)
			b.WriteByte(' ')
		} else {
			b.WriteRune(r)
		}
	}
	return b.String()
}
// CollapseSpaces trims leading/trailing whitespace and collapses each
// interior run of whitespace to a single ASCII space. strings.Fields
// splits on Unicode whitespace, which matches the rune-by-rune
// unicode.IsSpace scan it replaces.
func CollapseSpaces(s string) string {
	return strings.Join(strings.Fields(s), " ")
}

View File

@@ -0,0 +1,30 @@
package tagengine
import "testing"
// TestSanitize checks BasicSanitizer end-to-end: lower-casing, digit and
// punctuation spacing, whitespace collapsing, and (via "€") that
// multi-byte runes pass through untouched.
func TestSanitize(t *testing.T) {
	sanitize := BasicSanitizer

	type Case struct {
		In  string
		Out string
	}

	cases := []Case{
		{"", ""},
		{"123abc", "123 abc"},
		{"abc123", "abc 123"},
		{"abc123xyz", "abc 123 xyz"},
		{"1f2", "1 f 2"},
		{"  abc", "abc"},
		{" ; KitKat/m&m's  (bottle)  @ ", "; kitkat / m & m ' s ( bottle ) @"},
		{"€", "€"},
	}

	for _, tc := range cases {
		out := sanitize(tc.In)
		if out != tc.Out {
			t.Fatalf("%v != %v", out, tc.Out)
		}
	}
}

63
tagengine/tokenize.go Normal file
View File

@@ -0,0 +1,63 @@
package tagengine
import (
"sort"
"strings"
)
// ignoreTokens is the set of single-character punctuation tokens that are
// dropped when they appear as standalone ngrams during tokenization.
//
// Declared as a map literal instead of being populated in init(): the
// contents are static, and this removes a package-level init side effect.
var ignoreTokens = map[string]struct{}{
	"`": {}, "~": {}, "!": {}, "@": {}, "#": {}, "%": {}, "^": {}, "&": {},
	"*": {}, "(": {}, ")": {}, "-": {}, "_": {}, "+": {}, "=": {}, "[": {},
	"{": {}, "]": {}, "}": {}, `\`: {}, "|": {}, ":": {}, ";": {}, `"`: {},
	"'": {}, ",": {}, "<": {}, ".": {}, ">": {}, "?": {}, "/": {},
}
// Tokenize splits input into whitespace-separated words and returns every
// distinct ngram of 1..maxNgram consecutive words, excluding standalone
// punctuation tokens. The result is sorted by sortTokens (length, then
// lexicographic).
func Tokenize(
	input string,
	maxNgram int,
) (
	tokens []string,
) {
	fields := strings.Fields(input)
	if maxNgram > len(fields) {
		maxNgram = len(fields)
	}

	// Tracks ngrams already emitted so duplicates appear only once.
	seen := map[string]bool{}

	for size := 1; size <= maxNgram; size++ {
		for j := 0; j+size <= len(fields); j++ {
			ngram := strings.Join(fields[j:j+size], " ")
			if _, skip := ignoreTokens[ngram]; skip {
				continue
			}
			if seen[ngram] {
				continue
			}
			seen[ngram] = true
			tokens = append(tokens, ngram)
		}
	}

	sortTokens(tokens)
	return tokens
}
// sortTokens orders tokens by byte length ascending, breaking ties
// lexicographically, so shorter ngrams come first.
func sortTokens(tokens []string) {
	sort.Slice(tokens, func(a, b int) bool {
		ta, tb := tokens[a], tokens[b]
		if len(ta) == len(tb) {
			return ta < tb
		}
		return len(ta) < len(tb)
	})
}

View File

@@ -0,0 +1,55 @@
package tagengine
import (
"reflect"
"testing"
)
// TestTokenize checks ngram generation: expected output is sorted by
// length then lexicographically, duplicates appear once ("a a b c d c d"),
// and maxNgram caps the ngram size. Note "-" alone would be ignored, but
// here it only occurs inside larger ngrams such as "- b".
func TestTokenize(t *testing.T) {
	type Case struct {
		Input    string
		MaxNgram int
		Output   []string
	}

	cases := []Case{
		{
			Input:    "a bb c d",
			MaxNgram: 3,
			Output: []string{
				"a", "c", "d", "bb",
				"c d", "a bb", "bb c",
				"a bb c", "bb c d",
			},
		}, {
			// MaxNgram larger than the word count is clamped.
			Input:    "a b",
			MaxNgram: 3,
			Output: []string{
				"a", "b", "a b",
			},
		}, {
			Input:    "- b c d",
			MaxNgram: 3,
			Output: []string{
				"b", "c", "d",
				"- b", "b c", "c d",
				"- b c", "b c d",
			},
		}, {
			Input:    "a a b c d c d",
			MaxNgram: 3,
			Output: []string{
				"a", "b", "c", "d",
				"a a", "a b", "b c", "c d", "d c",
				"a a b", "a b c", "b c d", "c d c", "d c d",
			},
		},
	}

	for _, tc := range cases {
		output := Tokenize(tc.Input, tc.MaxNgram)
		if !reflect.DeepEqual(output, tc.Output) {
			t.Fatalf("%s: %#v", tc.Input, output)
		}
	}
}