Cleanup for v1.

2023-10-13 13:06:20 +02:00 · 2023-10-13 13:06:20 +02:00 · f4927aaed4
parent 0a77a882f1
commit f4927aaed4
9 changed files with 152 additions and 186 deletions
--- a/go.mod
+++ b/go.mod
@ -1,5 +1,3 @@
 module git.crumpington.com/public/tagengine

-go 1.17
-
-require golang.org/x/text v0.3.7
+go 1.21.1
--- a/go.sum
+++ b/go.sum
@ -1,3 +0,0 @@
-golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
-golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
-golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
--- a/node.go
+++ b/node.go
@ -7,7 +7,7 @@ import (

 type node struct {
 	Token    string
-	Matches  []*Rule
+	Matches  []*Rule // If a list of tokens reaches this node, it matches these.
 	Children map[string]*node
 }

--- a/rule.go
+++ b/rule.go
@ -1,15 +1,25 @@
 package tagengine

 type Rule struct {
-	Tag      string
+	// The purpose of a Rule is to attach its Tag to matching text.
+	Tag string
+
+	// Includes is a list of strings that must be found in the input in order to
+	// match.
 	Includes []string
+
+	// Excludes is a list of strings that can exclude a match for this rule.
 	Excludes []string
-	Blocks   []string // List of blocked tags.

-	MatchCount int
-	FirstCount int
+	// Blocks: If this rule is matched, then it will block matches of any tags
+	// listed here.
+	Blocks []string

-	score int
+	// The Score encodes the complexity of the Rule. A higher score indicates a
+	// more specific match. A Rule with more includes, or includes with multiple
+	// words, should have a higher Score than a Rule with fewer includes or less
+	// complex includes.
+	Score int

 	excludes map[string]struct{}
 }
@ -21,7 +31,7 @@ func NewRule(tag string) Rule {
 func (r Rule) Inc(l ...string) Rule {
 	return Rule{
 		Tag:      r.Tag,
-		Includes: l,
+		Includes: append(r.Includes, l...),
 		Excludes: r.Excludes,
 		Blocks:   r.Blocks,
 	}
@ -31,7 +41,7 @@ func (r Rule) Exc(l ...string) Rule {
 	return Rule{
 		Tag:      r.Tag,
 		Includes: r.Includes,
-		Excludes: l,
+		Excludes: append(r.Excludes, l...),
 		Blocks:   r.Blocks,
 	}
 }
@ -41,13 +51,11 @@ func (r Rule) Block(l ...string) Rule {
 		Tag:      r.Tag,
 		Includes: r.Includes,
 		Excludes: r.Excludes,
-		Blocks:   l,
+		Blocks:   append(r.Blocks, l...),
 	}
 }

-func (rule *Rule) normalize() {
-	sanitize := newSanitizer()
-
+func (rule *Rule) normalize(sanitize func(string) string) {
 	for i, token := range rule.Includes {
 		rule.Includes[i] = sanitize(token)
 	}
@ -63,7 +71,7 @@ func (rule *Rule) normalize() {
 		rule.excludes[s] = struct{}{}
 	}

-	rule.score = rule.computeScore()
+	rule.Score = rule.computeScore()
 }

 func (r Rule) maxNGram() int {
@ -108,8 +116,8 @@ func (r Rule) computeScore() (score int) {

 func ruleLess(lhs, rhs *Rule) bool {
 	// If scores differ, sort by score.
-	if lhs.score != rhs.score {
-		return lhs.score < rhs.score
+	if lhs.Score != rhs.Score {
+		return lhs.Score < rhs.Score
 	}

 	// If include depth differs, sort by depth.
--- a/rulegroup.go
+++ b/rulegroup.go
@ -1,7 +1,7 @@
 package tagengine

 // A RuleGroup can be converted into a list of rules. Each rule will point to
-// the same tag, and have the same exclude set.
+// the same tag, and have the same exclude set and blocks.
 type RuleGroup struct {
 	Tag      string
 	Includes [][]string
@ -31,7 +31,7 @@ func (g RuleGroup) Exc(l ...string) RuleGroup {
 	return RuleGroup{
 		Tag:      g.Tag,
 		Includes: g.Includes,
-		Excludes: l,
+		Excludes: append(g.Excludes, l...),
 		Blocks:   g.Blocks,
 	}
 }
@ -41,16 +41,17 @@ func (g RuleGroup) Block(l ...string) RuleGroup {
 		Tag:      g.Tag,
 		Includes: g.Includes,
 		Excludes: g.Excludes,
-		Blocks:   l,
+		Blocks:   append(g.Blocks, l...),
 	}
 }

-func (rg RuleGroup) ToList() (l []Rule) {
-	for _, includes := range rg.Includes {
+func (g RuleGroup) ToList() (l []Rule) {
+	for _, includes := range g.Includes {
 		l = append(l, Rule{
-			Tag:      rg.Tag,
-			Excludes: rg.Excludes,
+			Tag:      g.Tag,
+			Excludes: g.Excludes,
 			Includes: includes,
+			Blocks:   g.Blocks,
 		})
 	}
 	return
--- a/ruleset.go
+++ b/ruleset.go
@ -7,7 +7,7 @@ import (
 type RuleSet struct {
 	root     *node
 	maxNgram int
-	sanitize func(...string) string
+	sanitize func(string) string
 	rules    []*Rule
 }

@ -17,7 +17,7 @@ func NewRuleSet() *RuleSet {
 			Token:    "/",
 			Children: map[string]*node{},
 		},
-		sanitize: newSanitizer(),
+		sanitize: BasicSanitizer,
 		rules:    []*Rule{},
 	}
 }
@ -46,7 +46,7 @@ func (t *RuleSet) AddRule(rules ...Rule) {
 		rule := rule

 		// Make sure rule is well-formed.
-		rule.normalize()
+		rule.normalize(t.sanitize)

 		// Update maxNgram.
 		N := rule.maxNGram()
@ -66,7 +66,7 @@ func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {
 }

 // MatchRules will return a list of all matching rules. The rules are sorted by
-// the match's "score". The best match will be first.
+// the match's Score. The best match will be first.
 func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
 	input = t.sanitize(input)
 	tokens := Tokenize(input, t.maxNgram)
@ -91,23 +91,22 @@ func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
 		return ruleLess(rules[j], rules[i])
 	})

-	// Update rule stats.
-	if len(rules) > 0 {
-		rules[0].FirstCount++
-		for _, r := range rules {
-			r.MatchCount++
-		}
-	}
-
 	return rules
 }

 type Match struct {
-	Tag        string
+	Tag string
+
+	// Confidence is used to sort all matches, and is normalized so the sum of
+	// Confidence values for all matches is 1. Confidence is relative to the
+	// number of matches and the size of matches in terms of number of tokens.
 	Confidence float64 // In the range (0,1].
 }

-// Return a list of matches with confidence.
+// Match returns a list of matches with confidence. This is useful if you'd
+// like to find the best matching rule out of all the matched rules.
+//
+// If you just want to find all matching rules, then use MatchRules.
 func (t *RuleSet) Match(input string) []Match {
 	rules := t.MatchRules(input)
 	if len(rules) == 0 {
@ -151,8 +150,8 @@ func (t *RuleSet) Match(input string) []Match {
 			matches[rule.Tag] = idx
 			out = append(out, Match{Tag: rule.Tag})
 		}
-		out[idx].Confidence += float64(rule.score)
-		sum += float64(rule.score)
+		out[idx].Confidence += float64(rule.Score)
+		sum += float64(rule.Score)
 	}

 	for i := range out {
@ -161,21 +160,3 @@ func (t *RuleSet) Match(input string) []Match {

 	return out
 }
-
-// ListRules returns rules used in the ruleset sorted by the rules'
-// FirstCount. This is the number of times the given rule was the best match to
-// an input.
-func (t *RuleSet) ListRules() []*Rule {
-	sort.Slice(t.rules, func(i, j int) bool {
-		if t.rules[j].FirstCount != t.rules[i].FirstCount {
-			return t.rules[j].FirstCount < t.rules[i].FirstCount
-		}
-
-		if t.rules[j].MatchCount != t.rules[i].MatchCount {
-			return t.rules[j].MatchCount < t.rules[i].MatchCount
-		}
-
-		return t.rules[j].Tag < t.rules[i].Tag
-	})
-	return t.rules
-}
--- a/sanitize.go
+++ b/sanitize.go
@ -2,124 +2,19 @@ package tagengine

 import (
 	"strings"
-	"unicode"

-	"golang.org/x/text/runes"
-	"golang.org/x/text/transform"
-	"golang.org/x/text/unicode/norm"
+	"git.crumpington.com/public/tagengine/sanitize"
 )

-func newSanitizer() func(...string) string {
-	diactricsFix := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
-
-	return func(l ...string) string {
-
-		s := strings.Join(l, " ")
-
-		// Lowercase.
-		s = strings.ToLower(s)
-
-		// Remove apostrophes.
-		s = strings.ReplaceAll(s, "ß", "ss")
-		s = strings.ReplaceAll(s, "'s", "s")
-		s = strings.ReplaceAll(s, "`s", "s")
-		s = strings.ReplaceAll(s, "´s", "s")
-
-		// Remove diacritics.
-		if out, _, err := transform.String(diactricsFix, s); err == nil {
-			s = out
-		}
-
-		// Clean spaces.
-		s = spaceNumbers(s)
-		s = addSpaces(s)
-		s = collapseSpaces(s)
-
-		return s
-	}
-}
-
-func spaceNumbers(s string) string {
-	if len(s) == 0 {
-		return s
-	}
-
-	isDigit := func(b rune) bool {
-		switch b {
-		case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
-			return true
-		}
-		return false
-	}
-
-	b := strings.Builder{}
-
-	var first rune
-	for _, c := range s {
-		first = c
-		break
-	}
-
-	digit := isDigit(first)
-
-	// Range over runes.
-	for _, c := range s {
-		thisDigit := isDigit(c)
-		if thisDigit != digit {
-			b.WriteByte(' ')
-			digit = thisDigit
-		}
-		b.WriteRune(c)
-	}
-
-	return b.String()
-}
-
-func addSpaces(s string) string {
-	needsSpace := func(r rune) bool {
-		switch r {
-		case '`', '~', '!', '@', '#', '%', '^', '&', '*', '(', ')',
-			'-', '_', '+', '=', '[', '{', ']', '}', '\\', '|',
-			':', ';', '"', '\'', ',', '<', '.', '>', '?', '/':
-			return true
-		}
-		return false
-	}
-
-	b := strings.Builder{}
-
-	// Range over runes.
-	for _, r := range s {
-		if needsSpace(r) {
-			b.WriteRune(' ')
-			b.WriteRune(r)
-			b.WriteRune(' ')
-		} else {
-			b.WriteRune(r)
-		}
-	}
-
-	return b.String()
-}
-
-func collapseSpaces(s string) string {
-	// Trim leading and trailing spaces.
-	s = strings.TrimSpace(s)
-
-	b := strings.Builder{}
-	wasSpace := false
-
-	// Range over runes.
-	for _, c := range s {
-		if unicode.IsSpace(c) {
-			wasSpace = true
-			continue
-		} else if wasSpace {
-			wasSpace = false
-			b.WriteRune(' ')
-		}
-		b.WriteRune(c)
-	}
-
-	return b.String()
+// BasicSanitizer applies the basic sanitization steps:
+// * lower-case
+// * put spaces around numbers
+// * put spaces around punctuation
+// * collapse multiple spaces
+func BasicSanitizer(s string) string {
+	s = strings.ToLower(s)
+	s = sanitize.SpaceNumbers(s)
+	s = sanitize.SpacePunctuation(s)
+	s = sanitize.CollapseSpaces(s)
+	return s
 }
--- a/sanitize/sanitize.go
+++ b/sanitize/sanitize.go
@ -0,0 +1,91 @@
+package sanitize
+
+import (
+	"strings"
+	"unicode"
+)
+
+func SpaceNumbers(s string) string {
+	if len(s) == 0 {
+		return s
+	}
+
+	isDigit := func(b rune) bool {
+		switch b {
+		case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
+			return true
+		}
+		return false
+	}
+
+	b := strings.Builder{}
+
+	var first rune
+	for _, c := range s {
+		first = c
+		break
+	}
+
+	digit := isDigit(first)
+
+	// Range over runes.
+	for _, c := range s {
+		thisDigit := isDigit(c)
+		if thisDigit != digit {
+			b.WriteByte(' ')
+			digit = thisDigit
+		}
+		b.WriteRune(c)
+	}
+
+	return b.String()
+}
+
+func SpacePunctuation(s string) string {
+	needsSpace := func(r rune) bool {
+		switch r {
+		case '`', '~', '!', '@', '#', '%', '^', '&', '*', '(', ')',
+			'-', '_', '+', '=', '[', '{', ']', '}', '\\', '|',
+			':', ';', '"', '\'', ',', '<', '.', '>', '?', '/':
+			return true
+		}
+		return false
+	}
+
+	b := strings.Builder{}
+
+	// Range over runes.
+	for _, r := range s {
+		if needsSpace(r) {
+			b.WriteRune(' ')
+			b.WriteRune(r)
+			b.WriteRune(' ')
+		} else {
+			b.WriteRune(r)
+		}
+	}
+
+	return b.String()
+}
+
+func CollapseSpaces(s string) string {
+	// Trim leading and trailing spaces.
+	s = strings.TrimSpace(s)
+
+	b := strings.Builder{}
+	wasSpace := false
+
+	// Range over runes.
+	for _, c := range s {
+		if unicode.IsSpace(c) {
+			wasSpace = true
+			continue
+		} else if wasSpace {
+			wasSpace = false
+			b.WriteRune(' ')
+		}
+		b.WriteRune(c)
+	}
+
+	return b.String()
+}
--- a/sanitize_test.go
+++ b/sanitize_test.go
@ -3,7 +3,7 @@ package tagengine
 import "testing"

 func TestSanitize(t *testing.T) {
-	sanitize := newSanitizer()
+	sanitize := BasicSanitizer

 	type Case struct {
 		In  string
@ -17,12 +17,7 @@ func TestSanitize(t *testing.T) {
 		{"abc123xyz", "abc 123 xyz"},
 		{"1f2", "1 f 2"},
 		{" abc", "abc"},
-		{" ; KitKat/m&m's (bÖttle) @ ", "; kitkat / m & ms ( bottle ) @"},
-		{" Pott`s gin   königs beer;SOJU  ", "potts gin konigs beer ; soju"},
-		{"brot & brötchen", "brot & brotchen"},
-		{"Gâteau au fromage blanc, Stück", "gateau au fromage blanc , stuck"},
-		{"Maisels Weisse Weißbier 0,5l", "maisels weisse weissbier 0 , 5 l"},
-		{"Maisels´s Weisse - Hefeweizen 0,5l", "maiselss weisse - hefeweizen 0 , 5 l"},
+		{" ; KitKat/m&m's (bottle) @ ", "; kitkat / m & m ' s ( bottle ) @"},
 		{"€", "€"},
 	}