This commit is contained in:
jdl
2024-11-11 06:36:55 +01:00
parent d0587cc585
commit c5419d662e
102 changed files with 4181 additions and 0 deletions

2
tagengine/README.md Normal file
View File

@@ -0,0 +1,2 @@
# tagengine

3
tagengine/go.mod Normal file
View File

@@ -0,0 +1,3 @@
module git.crumpington.com/lib/tagengine
go 1.23.2

0
tagengine/go.sum Normal file
View File

30
tagengine/ngram.go Normal file
View File

@@ -0,0 +1,30 @@
package tagengine
import "unicode"
// ngramLength returns the number of whitespace-separated words in s.
//
// It iterates over runes rather than bytes. The previous byte-wise scan
// passed individual UTF-8 bytes to unicode.IsSpace; a continuation byte
// such as 0xA0 (the second byte of "à" = 0xC3 0xA0) reads as U+00A0
// (non-breaking space) and falsely split a single word in two.
func ngramLength(s string) int {
	count := 0
	inWord := false
	for _, r := range s {
		if unicode.IsSpace(r) {
			// Whitespace ends the current word (if any).
			inWord = false
		} else if !inWord {
			// First non-space rune of a new word.
			inWord = true
			count++
		}
	}
	return count
}

31
tagengine/ngram_test.go Normal file
View File

@@ -0,0 +1,31 @@
package tagengine

import "testing"

// TestNGramLength checks ngramLength against inputs with leading,
// trailing, and interior whitespace of various kinds.
func TestNGramLength(t *testing.T) {
	type Case struct {
		Input  string
		Length int
	}

	cases := []Case{
		{"a b c", 3},
		{" xyz\nlkj dflaj a", 4},
		{"a", 1},
		{" a", 1},
		{"a", 1},
		{" a\n", 1},
		{" a ", 1},
		{"\tx\ny\nz q ", 4},
	}

	for _, tc := range cases {
		// Use t.Fatalf, not log.Fatalf: log.Fatalf calls os.Exit and
		// kills the whole test binary without reporting a test failure.
		if length := ngramLength(tc.Input); length != tc.Length {
			t.Fatalf("%s: %d != %d", tc.Input, length, tc.Length)
		}
	}
}

79
tagengine/node.go Normal file
View File

@@ -0,0 +1,79 @@
package tagengine
import (
"fmt"
"strings"
)
// node is a vertex in the rule trie. The path of tokens from the root to
// a node spells out the (sorted) include set of the rules stored on it.
type node struct {
	Token   string
	Matches []*Rule // If a list of tokens reaches this node, it matches these.
	Children map[string]*node // Next include token -> child node.
}
// AddRule inserts r into the trie keyed by its (already sorted) Includes.
func (n *node) AddRule(r *Rule) {
	n.addRule(r, 0)
}

// addRule walks (and lazily builds) the token path r.Includes[idx:],
// attaching r to the node reached by the final token.
func (n *node) addRule(r *Rule, idx int) {
	cur := n
	for _, token := range r.Includes[idx:] {
		next, found := cur.Children[token]
		if !found {
			next = &node{
				Token:    token,
				Children: map[string]*node{},
			}
			cur.Children[token] = next
		}
		cur = next
	}
	cur.Matches = append(cur.Matches, r)
}
// Note that tokens must be sorted. This is the case for tokens created from
// the tokenize function.
func (n *node) Match(tokens []string) (rules []*Rule) {
	return n.match(tokens, rules)
}

// match accumulates every rule reachable by descending the trie along a
// subsequence of the remaining tokens.
func (n *node) match(tokens []string, rules []*Rule) []*Rule {
	// Rules stored on this node have had their full include set matched.
	rules = append(rules, n.Matches...)

	// Try each remaining token as the next step down the trie.
	for i, token := range tokens {
		if child, found := n.Children[token]; found {
			rules = child.match(tokens[i+1:], rules)
		}
	}
	return rules
}
// Dump prints the trie to stdout for debugging.
func (n *node) Dump() {
	n.dump(0)
}

// dump prints this node at the given depth, followed by its children one
// level deeper. Tags of rules matched at this node follow the token.
func (n *node) dump(depth int) {
	tags := strings.Builder{}
	for _, m := range n.Matches {
		tags.WriteByte(' ')
		tags.WriteString(m.Tag)
	}
	fmt.Printf("%s%s%s\n", strings.Repeat("  ", depth), n.Token, tags.String())
	for _, child := range n.Children {
		child.dump(depth + 1)
	}
}

159
tagengine/rule.go Normal file
View File

@@ -0,0 +1,159 @@
package tagengine
type Rule struct {
	// The purpose of a Rule is to attach its Tag to matching text.
	Tag string

	// Includes is a list of strings that must all be found in the input in
	// order to match.
	Includes []string

	// Excludes is a list of strings that can exclude a match for this rule.
	Excludes []string

	// Blocks: If this rule is matched, then it will block matches of any tags
	// listed here.
	Blocks []string

	// The Score encodes the complexity of the Rule. A higher score indicates a
	// more specific match. A Rule with more includes, or includes with
	// multiple words, should have a higher Score than a Rule with fewer or
	// less complex includes.
	Score int

	// excludes is the Excludes list as a set, built by normalize, for O(1)
	// membership tests in isExcluded.
	excludes map[string]struct{}
}
// NewRule returns a Rule with the given tag and no includes, excludes,
// or blocks.
func NewRule(tag string) Rule {
	return Rule{Tag: tag}
}

// Inc returns a copy of the rule with the given include strings appended.
// Derived fields (Score, excludes) are reset; normalize rebuilds them.
func (r Rule) Inc(l ...string) Rule {
	r.Includes = append(r.Includes, l...)
	r.Score = 0
	r.excludes = nil
	return r
}

// Exc returns a copy of the rule with the given exclude strings appended.
func (r Rule) Exc(l ...string) Rule {
	r.Excludes = append(r.Excludes, l...)
	r.Score = 0
	r.excludes = nil
	return r
}

// Block returns a copy of the rule with the given blocked tags appended.
func (r Rule) Block(l ...string) Rule {
	r.Blocks = append(r.Blocks, l...)
	r.Score = 0
	r.excludes = nil
	return r
}
// normalize prepares the rule for matching: it sanitizes and sorts the
// include/exclude phrases, builds the exclude set, and computes the Score.
func (rule *Rule) normalize(sanitize func(string) string) {
	for i := range rule.Includes {
		rule.Includes[i] = sanitize(rule.Includes[i])
	}
	for i := range rule.Excludes {
		rule.Excludes[i] = sanitize(rule.Excludes[i])
	}

	sortTokens(rule.Includes)
	sortTokens(rule.Excludes)

	excl := make(map[string]struct{}, len(rule.Excludes))
	for _, token := range rule.Excludes {
		excl[token] = struct{}{}
	}
	rule.excludes = excl

	rule.Score = rule.computeScore()
}
// maxNGram reports the largest word count found among all include and
// exclude phrases of the rule.
func (r Rule) maxNGram() int {
	longest := 0
	scan := func(phrases []string) {
		for _, p := range phrases {
			if n := ngramLength(p); n > longest {
				longest = n
			}
		}
	}
	scan(r.Includes)
	scan(r.Excludes)
	return longest
}
// isExcluded reports whether any of the given tokens appears in the
// rule's exclude set.
func (r Rule) isExcluded(tokens []string) bool {
	// Most rules have no excludes; skip the scan entirely.
	if len(r.excludes) == 0 {
		return false
	}
	for _, token := range tokens {
		if _, found := r.excludes[token]; found {
			return true
		}
	}
	return false
}
// computeScore sums a triangular weight n*(n+1)/2 over each include
// phrase, where n is the phrase's word count, so longer phrases and more
// phrases both raise the score.
func (r Rule) computeScore() (score int) {
	for _, phrase := range r.Includes {
		words := ngramLength(phrase)
		score += words * (words + 1) / 2
	}
	return score
}
// ruleLess defines a total order on rules: by Score, then include count,
// then exclude count, then includes lexicographically, then excludes,
// and finally by Tag.
func ruleLess(lhs, rhs *Rule) bool {
	if lhs.Score != rhs.Score {
		return lhs.Score < rhs.Score
	}
	if a, b := len(lhs.Includes), len(rhs.Includes); a != b {
		return a < b
	}
	if a, b := len(lhs.Excludes), len(rhs.Excludes); a != b {
		return a < b
	}
	// Equal lengths here, so index-wise comparison is safe.
	for i, s := range lhs.Includes {
		if s != rhs.Includes[i] {
			return s < rhs.Includes[i]
		}
	}
	for i, s := range lhs.Excludes {
		if s != rhs.Excludes[i] {
			return s < rhs.Excludes[i]
		}
	}
	// Tie-break on the tag; equal tags compare as not-less.
	return lhs.Tag < rhs.Tag
}

58
tagengine/rulegroup.go Normal file
View File

@@ -0,0 +1,58 @@
package tagengine
// A RuleGroup can be converted into a list of rules. Each rule will point to
// the same tag, and have the same exclude set and blocks.
type RuleGroup struct {
	Tag      string
	Includes [][]string // Each inner slice becomes one Rule's include set.
	Excludes []string
	Blocks   []string
}
// NewRuleGroup returns an empty RuleGroup for the given tag.
func NewRuleGroup(tag string) RuleGroup {
	return RuleGroup{
		Tag:      tag,
		Includes: [][]string{},
		Excludes: []string{},
		Blocks:   []string{},
	}
}

// Inc adds one include set: all strings in l must match together.
func (g RuleGroup) Inc(l ...string) RuleGroup {
	g.Includes = append(g.Includes, l)
	return g
}

// Exc appends exclude strings shared by every rule in the group.
func (g RuleGroup) Exc(l ...string) RuleGroup {
	g.Excludes = append(g.Excludes, l...)
	return g
}

// Block appends blocked tags shared by every rule in the group.
func (g RuleGroup) Block(l ...string) RuleGroup {
	g.Blocks = append(g.Blocks, l...)
	return g
}

// ToList expands the group into one Rule per include set.
func (g RuleGroup) ToList() []Rule {
	var rules []Rule
	for _, includes := range g.Includes {
		rules = append(rules, Rule{
			Tag:      g.Tag,
			Includes: includes,
			Excludes: g.Excludes,
			Blocks:   g.Blocks,
		})
	}
	return rules
}

162
tagengine/ruleset.go Normal file
View File

@@ -0,0 +1,162 @@
package tagengine
import (
"sort"
)
// RuleSet is a collection of rules compiled into a token trie for
// matching against sanitized input text.
type RuleSet struct {
	root     *node // Root of the include-token trie.
	maxNgram int   // Largest phrase length (in words) over all rules.
	sanitize func(string) string // Normalizer applied to rule tokens and input.
	rules    []*Rule // All added rules, in insertion order.
}
// NewRuleSet returns an empty RuleSet that sanitizes with BasicSanitizer.
func NewRuleSet() *RuleSet {
	root := &node{
		Token:    "/",
		Children: map[string]*node{},
	}
	return &RuleSet{
		root:     root,
		sanitize: BasicSanitizer,
		rules:    []*Rule{},
	}
}

// NewRuleSetFromList returns a RuleSet pre-populated with the given rules.
func NewRuleSetFromList(rules []Rule) *RuleSet {
	set := NewRuleSet()
	set.AddRule(rules...)
	return set
}
// Add accepts any mix of Rule and RuleGroup values and adds each to the
// set. It panics on any other type.
func (t *RuleSet) Add(ruleOrGroup ...interface{}) {
	for _, item := range ruleOrGroup {
		switch v := item.(type) {
		case Rule:
			t.AddRule(v)
		case RuleGroup:
			t.AddRuleGroup(v)
		default:
			panic("Add expects either Rule or RuleGroup objects.")
		}
	}
}
// AddRule normalizes each rule (sanitize, sort, build exclude set, score)
// and inserts it into both the rule list and the match trie.
func (t *RuleSet) AddRule(rules ...Rule) {
	for _, rule := range rules {
		// Since Go 1.22 (go.mod targets 1.23), loop variables are
		// per-iteration, so taking &rule below yields a distinct pointer
		// each time; the old `rule := rule` shadow was redundant.

		// Make sure rule is well-formed.
		rule.normalize(t.sanitize)

		// Track the longest phrase so Tokenize generates ngrams big
		// enough to reach every rule.
		if n := rule.maxNGram(); n > t.maxNgram {
			t.maxNgram = n
		}

		t.rules = append(t.rules, &rule)
		t.root.AddRule(&rule)
	}
}
// AddRuleGroup expands each group into its constituent rules and adds them.
func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {
	for _, group := range ruleGroups {
		rules := group.ToList()
		t.AddRule(rules...)
	}
}
// MatchRules will return a list of all matching rules. The rules are sorted by
// the match's Score. The best match will be first.
func (t *RuleSet) MatchRules(input string) []*Rule {
	tokens := Tokenize(t.sanitize(input), t.maxNgram)

	matched := t.root.Match(tokens)
	if len(matched) == 0 {
		return matched
	}

	// Drop rules whose exclude set hits one of the tokens; filter in
	// place, reusing the backing array.
	kept := matched[:0]
	for _, rule := range matched {
		if !rule.isExcluded(tokens) {
			kept = append(kept, rule)
		}
	}

	// Sort descending: highest score (best match) first.
	sort.Slice(kept, func(a, b int) bool {
		return ruleLess(kept[b], kept[a])
	})
	return kept
}
// Match pairs a tag with the relative confidence of its match.
type Match struct {
	Tag string

	// Confidence is used to sort all matches, and is normalized so the sum of
	// Confidence values for all matches is 1. Confidence is relative to the
	// number of matches and the size of matches in terms of number of tokens.
	Confidence float64 // In the range (0,1].
}
// Return a list of matches with confidence. This is useful if you'd like to
// find the best matching rule out of all the matched rules.
//
// If you just want to find all matching rules, then use MatchRules.
func (t *RuleSet) Match(input string) []Match {
	rules := t.MatchRules(input)

	// No matches: return an empty (non-nil) slice.
	if len(rules) == 0 {
		return []Match{}
	}

	// A single matching rule trivially gets full confidence.
	if len(rules) == 1 {
		return []Match{{
			Tag:        rules[0].Tag,
			Confidence: 1,
		}}
	}

	// Create list of blocked tags.
	// NOTE(review): blocks are collected from ALL matched rules, including
	// rules that are themselves about to be removed as blocked — blocking
	// is not applied transitively here.
	blocks := map[string]struct{}{}
	for _, rule := range rules {
		for _, tag := range rule.Blocks {
			blocks[tag] = struct{}{}
		}
	}

	// Remove rules for blocked tags (in-place filter preserving order).
	iOut := 0
	for _, rule := range rules {
		if _, ok := blocks[rule.Tag]; ok {
			continue
		}
		rules[iOut] = rule
		iOut++
	}
	rules = rules[:iOut]

	// Matches by index: accumulate each tag's total Score into one Match,
	// preserving first-appearance order (rules arrive best-first from
	// MatchRules).
	matches := map[string]int{}
	out := []Match{}
	sum := float64(0)
	for _, rule := range rules {
		idx, ok := matches[rule.Tag]
		if !ok {
			idx = len(matches)
			matches[rule.Tag] = idx
			out = append(out, Match{Tag: rule.Tag})
		}
		out[idx].Confidence += float64(rule.Score)
		sum += float64(rule.Score)
	}

	// Normalize so all confidences sum to 1.
	for i := range out {
		out[i].Confidence /= sum
	}

	return out
}

84
tagengine/ruleset_test.go Normal file
View File

@@ -0,0 +1,84 @@
package tagengine
import (
"reflect"
"testing"
)
// TestRulesSet exercises Match end-to-end: score-based ranking,
// exclude handling ("pepsi"), and tag blocking ("spa" blocks the
// coca-cola rules). Expected Confidence values are exact because each
// case's scores divide evenly, so reflect.DeepEqual is safe here.
func TestRulesSet(t *testing.T) {
	rs := NewRuleSet()

	rs.AddRule(Rule{
		Tag:      "cc/2",
		Includes: []string{"cola", "coca"},
	})

	rs.AddRule(Rule{
		Tag:      "cc/0",
		Includes: []string{"coca cola"},
	})

	rs.AddRule(Rule{
		Tag:      "cz/2",
		Includes: []string{"coca", "zero"},
	})

	rs.AddRule(Rule{
		Tag:      "cc0/3",
		Includes: []string{"zero", "coca", "cola"},
	})

	rs.AddRule(Rule{
		Tag:      "cc0/3.1",
		Includes: []string{"coca", "cola", "zero"},
		Excludes: []string{"pepsi"},
	})

	rs.AddRule(Rule{
		Tag:      "spa",
		Includes: []string{"spa"},
		Blocks:   []string{"cc/0", "cc0/3", "cc0/3.1"},
	})

	type TestCase struct {
		Input   string
		Matches []Match
	}

	cases := []TestCase{
		{
			Input: "coca-cola zero",
			Matches: []Match{
				{"cc0/3.1", 0.3},
				{"cc0/3", 0.3},
				{"cz/2", 0.2},
				{"cc/2", 0.2},
			},
		}, {
			Input: "coca cola",
			Matches: []Match{
				{"cc/0", 0.6},
				{"cc/2", 0.4},
			},
		}, {
			// "pepsi" excludes cc0/3.1 but not cc0/3.
			Input: "coca cola zero pepsi",
			Matches: []Match{
				{"cc0/3", 0.3},
				{"cc/0", 0.3},
				{"cz/2", 0.2},
				{"cc/2", 0.2},
			},
		}, {
			Input:   "fanta orange",
			Matches: []Match{},
		}, {
			// "spa" blocks cc/0, cc0/3 and cc0/3.1.
			Input: "coca-cola zero / fanta / spa",
			Matches: []Match{
				{"cz/2", 0.4},
				{"cc/2", 0.4},
				{"spa", 0.2},
			},
		},
	}

	for _, tc := range cases {
		matches := rs.Match(tc.Input)
		if !reflect.DeepEqual(matches, tc.Matches) {
			t.Fatalf("%v != %v", matches, tc.Matches)
		}
	}
}

20
tagengine/sanitize.go Normal file
View File

@@ -0,0 +1,20 @@
package tagengine
import (
"strings"
"git.crumpington.com/lib/tagengine/sanitize"
)
// The basic sanitizer:
//   - lower-case
//   - put spaces around numbers
//   - put spaces around punctuation
//   - collapse multiple spaces
func BasicSanitizer(s string) string {
	s = strings.ToLower(s)
	s = sanitize.SpaceNumbers(s)
	s = sanitize.SpacePunctuation(s)
	s = sanitize.CollapseSpaces(s)
	return s
}

View File

@@ -0,0 +1,91 @@
package sanitize
import (
"strings"
"unicode"
)
// SpaceNumbers inserts a single space at every boundary between a run of
// ASCII digits and a run of non-digits, so "abc123" becomes "abc 123".
func SpaceNumbers(s string) string {
	if s == "" {
		return s
	}

	isDigit := func(r rune) bool { return r >= '0' && r <= '9' }

	var b strings.Builder
	prevDigit := false
	for i, r := range s {
		d := isDigit(r)
		if i == 0 {
			// The first rune establishes the initial class; no leading space.
			prevDigit = d
		} else if d != prevDigit {
			b.WriteByte(' ')
			prevDigit = d
		}
		b.WriteRune(r)
	}
	return b.String()
}
// SpacePunctuation surrounds each ASCII punctuation rune with spaces so
// punctuation tokenizes separately from adjacent words.
func SpacePunctuation(s string) string {
	// Exactly the punctuation set the tokenizer treats as standalone tokens.
	const punct = "`~!@#%^&*()-_+=[{]}\\|:;\"',<.>?/"

	var b strings.Builder
	for _, r := range s {
		if strings.ContainsRune(punct, r) {
			b.WriteByte(' ')
			b.WriteRune(r)
			b.WriteByte(' ')
		} else {
			b.WriteRune(r)
		}
	}
	return b.String()
}
// CollapseSpaces trims leading/trailing whitespace and collapses each
// interior run of whitespace to a single ASCII space. strings.Fields
// splits on Unicode whitespace, which matches the rune-by-rune
// unicode.IsSpace scan it replaces.
func CollapseSpaces(s string) string {
	return strings.Join(strings.Fields(s), " ")
}

View File

@@ -0,0 +1,30 @@
package tagengine
import "testing"
// TestSanitize checks BasicSanitizer end-to-end: lower-casing, digit and
// punctuation spacing, whitespace collapsing, and (via "€") that
// multi-byte runes pass through untouched.
func TestSanitize(t *testing.T) {
	sanitize := BasicSanitizer

	type Case struct {
		In  string
		Out string
	}

	cases := []Case{
		{"", ""},
		{"123abc", "123 abc"},
		{"abc123", "abc 123"},
		{"abc123xyz", "abc 123 xyz"},
		{"1f2", "1 f 2"},
		{"  abc", "abc"},
		{" ; KitKat/m&m's  (bottle)  @ ", "; kitkat / m & m ' s ( bottle ) @"},
		{"€", "€"},
	}

	for _, tc := range cases {
		out := sanitize(tc.In)
		if out != tc.Out {
			t.Fatalf("%v != %v", out, tc.Out)
		}
	}
}

63
tagengine/tokenize.go Normal file
View File

@@ -0,0 +1,63 @@
package tagengine
import (
"sort"
"strings"
)
// ignoreTokens is the set of single-character punctuation tokens that are
// dropped when they appear as standalone ngrams during tokenization.
//
// Declared as a map literal instead of being populated in init(): the
// contents are static, and this removes a package-level init side effect.
var ignoreTokens = map[string]struct{}{
	"`": {}, "~": {}, "!": {}, "@": {}, "#": {}, "%": {}, "^": {}, "&": {},
	"*": {}, "(": {}, ")": {}, "-": {}, "_": {}, "+": {}, "=": {}, "[": {},
	"{": {}, "]": {}, "}": {}, `\`: {}, "|": {}, ":": {}, ";": {}, `"`: {},
	"'": {}, ",": {}, "<": {}, ".": {}, ">": {}, "?": {}, "/": {},
}
// Tokenize splits input into whitespace-separated words and returns every
// distinct ngram of 1..maxNgram consecutive words, excluding standalone
// punctuation tokens. The result is sorted by sortTokens (length, then
// lexicographic).
func Tokenize(
	input string,
	maxNgram int,
) (
	tokens []string,
) {
	fields := strings.Fields(input)
	if maxNgram > len(fields) {
		maxNgram = len(fields)
	}

	// Tracks ngrams already emitted so duplicates appear only once.
	seen := map[string]bool{}

	for size := 1; size <= maxNgram; size++ {
		for j := 0; j+size <= len(fields); j++ {
			ngram := strings.Join(fields[j:j+size], " ")
			if _, skip := ignoreTokens[ngram]; skip {
				continue
			}
			if seen[ngram] {
				continue
			}
			seen[ngram] = true
			tokens = append(tokens, ngram)
		}
	}

	sortTokens(tokens)
	return tokens
}
// sortTokens orders tokens by byte length ascending, breaking ties
// lexicographically, so shorter ngrams come first.
func sortTokens(tokens []string) {
	sort.Slice(tokens, func(a, b int) bool {
		ta, tb := tokens[a], tokens[b]
		if len(ta) == len(tb) {
			return ta < tb
		}
		return len(ta) < len(tb)
	})
}

View File

@@ -0,0 +1,55 @@
package tagengine
import (
"reflect"
"testing"
)
// TestTokenize checks ngram generation: expected output is sorted by
// length then lexicographically, duplicates appear once ("a a b c d c d"),
// and maxNgram caps the ngram size. Note "-" alone would be ignored, but
// here it only occurs inside larger ngrams such as "- b".
func TestTokenize(t *testing.T) {
	type Case struct {
		Input    string
		MaxNgram int
		Output   []string
	}

	cases := []Case{
		{
			Input:    "a bb c d",
			MaxNgram: 3,
			Output: []string{
				"a", "c", "d", "bb",
				"c d", "a bb", "bb c",
				"a bb c", "bb c d",
			},
		}, {
			// MaxNgram larger than the word count is clamped.
			Input:    "a b",
			MaxNgram: 3,
			Output: []string{
				"a", "b", "a b",
			},
		}, {
			Input:    "- b c d",
			MaxNgram: 3,
			Output: []string{
				"b", "c", "d",
				"- b", "b c", "c d",
				"- b c", "b c d",
			},
		}, {
			Input:    "a a b c d c d",
			MaxNgram: 3,
			Output: []string{
				"a", "b", "c", "d",
				"a a", "a b", "b c", "c d", "d c",
				"a a b", "a b c", "b c d", "c d c", "d c d",
			},
		},
	}

	for _, tc := range cases {
		output := Tokenize(tc.Input, tc.MaxNgram)
		if !reflect.DeepEqual(output, tc.Output) {
			t.Fatalf("%s: %#v", tc.Input, output)
		}
	}
}