Cleanup for v1.

2023-10-13 13:06:20 +02:00 · 2023-10-13 13:06:20 +02:00 · f4927aaed4
parent 0a77a882f1
commit f4927aaed4
9 changed files with 152 additions and 186 deletions
--- a/go.mod
+++ b/go.mod
@ -1,5 +1,3 @@
 module git.crumpington.com/public/tagengine
-go 1.17
+go 1.21.1
 require golang.org/x/text v0.3.7
--- a/go.sum
+++ b/go.sum
@ -1,3 +0,0 @@
 golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
 golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
 golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
--- a/node.go
+++ b/node.go
@ -7,7 +7,7 @@ import (
 type node struct {
 	Token    string
-	Matches  []*Rule
+	Matches  []*Rule // If a list of tokens reaches this node, it matches these.
 	Children map[string]*node
 }
--- a/rule.go
+++ b/rule.go
@ -1,15 +1,25 @@
 package tagengine
 type Rule struct {
-	Tag      string
+	// The purpose of a Rule is to attach its Tag to matching text.
 	Tag string
 	// Includes is a list of strings that must be found in the input in order to
 	// match.
 	Includes []string
 	// Excludes is a list of strings that can exclude a match for this rule.
 	Excludes []string
 	Blocks   []string // List of blocked tags.
-	MatchCount int
+	// Blocks: If this rule is matched, then it will block matches of any tags
-	FirstCount int
+	// listed here.
 	Blocks []string
-	score int
+	// The Score encodes the complexity of the Rule. A higher score indicates a
 	// more specific match. A Rule with more includes, or includes with multiple
 	// words, should have a higher Score than a Rule with fewer includes or less
 	// complex includes.
 	Score int
 	excludes map[string]struct{}
 }
@ -21,7 +31,7 @@ func NewRule(tag string) Rule {
 func (r Rule) Inc(l ...string) Rule {
 	return Rule{
 		Tag:      r.Tag,
-		Includes: l,
+		Includes: append(r.Includes, l...),
 		Excludes: r.Excludes,
 		Blocks:   r.Blocks,
 	}
@ -31,7 +41,7 @@ func (r Rule) Exc(l ...string) Rule {
 	return Rule{
 		Tag:      r.Tag,
 		Includes: r.Includes,
-		Excludes: l,
+		Excludes: append(r.Excludes, l...),
 		Blocks:   r.Blocks,
 	}
 }
@ -41,13 +51,11 @@ func (r Rule) Block(l ...string) Rule {
 		Tag:      r.Tag,
 		Includes: r.Includes,
 		Excludes: r.Excludes,
-		Blocks:   l,
+		Blocks:   append(r.Blocks, l...),
 	}
 }
-func (rule *Rule) normalize() {
+func (rule *Rule) normalize(sanitize func(string) string) {
 	sanitize := newSanitizer()
 	for i, token := range rule.Includes {
 		rule.Includes[i] = sanitize(token)
 	}
@ -63,7 +71,7 @@ func (rule *Rule) normalize() {
 		rule.excludes[s] = struct{}{}
 	}
-	rule.score = rule.computeScore()
+	rule.Score = rule.computeScore()
 }
 func (r Rule) maxNGram() int {
@ -108,8 +116,8 @@ func (r Rule) computeScore() (score int) {
 func ruleLess(lhs, rhs *Rule) bool {
 	// If scores differ, sort by score.
-	if lhs.score != rhs.score {
+	if lhs.Score != rhs.Score {
-		return lhs.score < rhs.score
+		return lhs.Score < rhs.Score
 	}
 	// If include depth differs, sort by depth.
--- a/rulegroup.go
+++ b/rulegroup.go
@ -1,7 +1,7 @@
 package tagengine
 // A RuleGroup can be converted into a list of rules. Each rule will point to
-// the same tag, and have the same exclude set.
+// the same tag, and have the same exclude set and blocks.
 type RuleGroup struct {
 	Tag      string
 	Includes [][]string
@ -31,7 +31,7 @@ func (g RuleGroup) Exc(l ...string) RuleGroup {
 	return RuleGroup{
 		Tag:      g.Tag,
 		Includes: g.Includes,
-		Excludes: l,
+		Excludes: append(g.Excludes, l...),
 		Blocks:   g.Blocks,
 	}
 }
@ -41,16 +41,17 @@ func (g RuleGroup) Block(l ...string) RuleGroup {
 		Tag:      g.Tag,
 		Includes: g.Includes,
 		Excludes: g.Excludes,
-		Blocks:   l,
+		Blocks:   append(g.Blocks, l...),
 	}
 }
-func (rg RuleGroup) ToList() (l []Rule) {
+func (g RuleGroup) ToList() (l []Rule) {
-	for _, includes := range rg.Includes {
+	for _, includes := range g.Includes {
 		l = append(l, Rule{
-			Tag:      rg.Tag,
+			Tag:      g.Tag,
-			Excludes: rg.Excludes,
+			Excludes: g.Excludes,
 			Includes: includes,
 			Blocks:   g.Blocks,
 		})
 	}
 	return
--- a/ruleset.go
+++ b/ruleset.go
@ -7,7 +7,7 @@ import (
 type RuleSet struct {
 	root     *node
 	maxNgram int
-	sanitize func(...string) string
+	sanitize func(string) string
 	rules    []*Rule
 }
@ -17,7 +17,7 @@ func NewRuleSet() *RuleSet {
 			Token:    "/",
 			Children: map[string]*node{},
 		},
-		sanitize: newSanitizer(),
+		sanitize: BasicSanitizer,
 		rules:    []*Rule{},
 	}
 }
@ -46,7 +46,7 @@ func (t *RuleSet) AddRule(rules ...Rule) {
 		rule := rule
 		// Make sure rule is well-formed.
-		rule.normalize()
+		rule.normalize(t.sanitize)
 		// Update maxNgram.
 		N := rule.maxNGram()
@ -66,7 +66,7 @@ func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {
 }
 // MatchRules will return a list of all matching rules. The rules are sorted by
-// the match's "score". The best match will be first.
+// the match's Score. The best match will be first.
 func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
 	input = t.sanitize(input)
 	tokens := Tokenize(input, t.maxNgram)
@ -91,23 +91,22 @@ func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
 		return ruleLess(rules[j], rules[i])
 	})
 	// Update rule stats.
 	if len(rules) > 0 {
 		rules[0].FirstCount++
 		for _, r := range rules {
 			r.MatchCount++
 		}
 	}
 	return rules
 }
 type Match struct {
-	Tag        string
+	Tag string
 	// Confidence is used to sort all matches, and is normalized so the sum of
 	// Confidence values for all matches is 1. Confidence is relative to the
 	// number of matches and the size of matches in terms of number of tokens.
 	Confidence float64 // In the range (0,1].
 }
-// Return a list of matches with confidence.
+// Return a list of matches with confidence. This is useful if you'd like to
 // find the best matching rule out of all the matched rules.
 //
 // If you just want to find all matching rules, then use MatchRules.
 func (t *RuleSet) Match(input string) []Match {
 	rules := t.MatchRules(input)
 	if len(rules) == 0 {
@ -151,8 +150,8 @@ func (t *RuleSet) Match(input string) []Match {
 			matches[rule.Tag] = idx
 			out = append(out, Match{Tag: rule.Tag})
 		}
-		out[idx].Confidence += float64(rule.score)
+		out[idx].Confidence += float64(rule.Score)
-		sum += float64(rule.score)
+		sum += float64(rule.Score)
 	}
 	for i := range out {
@ -161,21 +160,3 @@ func (t *RuleSet) Match(input string) []Match {
 	return out
 }
 // ListRules sorts the rule set's rules in place and returns that same slice.
 //
 // Ordering: highest FirstCount first (how often a rule was the best match for
 // an input); ties are broken by highest MatchCount, and remaining ties by Tag
 // in descending order.
 func (t *RuleSet) ListRules() []*Rule {
 	sort.Slice(t.rules, func(i, j int) bool {
 		a, b := t.rules[i], t.rules[j]
 		switch {
 		case a.FirstCount != b.FirstCount:
 			return a.FirstCount > b.FirstCount
 		case a.MatchCount != b.MatchCount:
 			return a.MatchCount > b.MatchCount
 		default:
 			return a.Tag > b.Tag
 		}
 	})
 	return t.rules
 }
--- a/sanitize.go
+++ b/sanitize.go
@ -2,124 +2,19 @@ package tagengine
 import (
 	"strings"
 	"unicode"
-	"golang.org/x/text/runes"
+	"git.crumpington.com/public/tagengine/sanitize"
 	"golang.org/x/text/transform"
 	"golang.org/x/text/unicode/norm"
 )
-func newSanitizer() func(...string) string {
+// The basic sanitizer:
-	diactricsFix := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
+// * lower-case
-
+// * put spaces around numbers
-	return func(l ...string) string {
+// * put spaces around punctuation
-
+// * collapse multiple spaces
-		s := strings.Join(l, " ")
+func BasicSanitizer(s string) string {
-
+	s = strings.ToLower(s)
-		// Lowercase.
+	s = sanitize.SpaceNumbers(s)
-		s = strings.ToLower(s)
+	s = sanitize.SpacePunctuation(s)
-
+	s = sanitize.CollapseSpaces(s)
-		// Remove apostrophes.
+	return s
 		s = strings.ReplaceAll(s, "ß", "ss")
 		s = strings.ReplaceAll(s, "'s", "s")
 		s = strings.ReplaceAll(s, "`s", "s")
 		s = strings.ReplaceAll(s, "´s", "s")
 		// Remove diacritics.
 		if out, _, err := transform.String(diactricsFix, s); err == nil {
 			s = out
 		}
 		// Clean spaces.
 		s = spaceNumbers(s)
 		s = addSpaces(s)
 		s = collapseSpaces(s)
 		return s
 	}
 }
 func spaceNumbers(s string) string {
 	if len(s) == 0 {
 		return s
 	}
 	isDigit := func(b rune) bool {
 		switch b {
 		case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
 			return true
 		}
 		return false
 	}
 	b := strings.Builder{}
 	var first rune
 	for _, c := range s {
 		first = c
 		break
 	}
 	digit := isDigit(first)
 	// Range over runes.
 	for _, c := range s {
 		thisDigit := isDigit(c)
 		if thisDigit != digit {
 			b.WriteByte(' ')
 			digit = thisDigit
 		}
 		b.WriteRune(c)
 	}
 	return b.String()
 }
 // addSpaces surrounds each rune from a fixed set of ASCII punctuation marks
 // with one space on either side; every other rune is copied through as is.
 // Runs of spaces produced here are not merged by this function.
 func addSpaces(s string) string {
 	var sb strings.Builder
 	for _, ch := range s {
 		switch ch {
 		case '`', '~', '!', '@', '#', '%', '^', '&', '*', '(', ')',
 			'-', '_', '+', '=', '[', '{', ']', '}', '\\', '|',
 			':', ';', '"', '\'', ',', '<', '.', '>', '?', '/':
 			sb.WriteString(" ")
 			sb.WriteRune(ch)
 			sb.WriteString(" ")
 		default:
 			sb.WriteRune(ch)
 		}
 	}
 	return sb.String()
 }
 // collapseSpaces strips leading and trailing whitespace and replaces every
 // interior run of whitespace (as classified by unicode.IsSpace) with a single
 // ASCII space.
 func collapseSpaces(s string) string {
 	trimmed := strings.TrimSpace(s)

 	var sb strings.Builder
 	gap := false // true while a whitespace run is waiting to be emitted
 	for _, ch := range trimmed {
 		if unicode.IsSpace(ch) {
 			gap = true
 			continue
 		}
 		if gap {
 			sb.WriteByte(' ')
 			gap = false
 		}
 		sb.WriteRune(ch)
 	}
 	return sb.String()
 }
--- a/sanitize/sanitize.go
+++ b/sanitize/sanitize.go
@ -0,0 +1,91 @@
 package sanitize
 import (
 	"strings"
 	"unicode"
 )
 func SpaceNumbers(s string) string {
 	if len(s) == 0 {
 		return s
 	}
 	isDigit := func(b rune) bool {
 		switch b {
 		case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
 			return true
 		}
 		return false
 	}
 	b := strings.Builder{}
 	var first rune
 	for _, c := range s {
 		first = c
 		break
 	}
 	digit := isDigit(first)
 	// Range over runes.
 	for _, c := range s {
 		thisDigit := isDigit(c)
 		if thisDigit != digit {
 			b.WriteByte(' ')
 			digit = thisDigit
 		}
 		b.WriteRune(c)
 	}
 	return b.String()
 }
 // SpacePunctuation returns s with a space written before and after every
 // rune found in a fixed set of ASCII punctuation characters. All other runes
 // are copied unchanged, and the added spaces are not collapsed here.
 func SpacePunctuation(s string) string {
 	// The exact set of runes that get padded with spaces.
 	const padded = "`~!@#%^&*()-_+=[{]}\\|:;\"',<.>?/"

 	var out strings.Builder
 	for _, ch := range s {
 		if !strings.ContainsRune(padded, ch) {
 			out.WriteRune(ch)
 			continue
 		}
 		out.WriteByte(' ')
 		out.WriteRune(ch)
 		out.WriteByte(' ')
 	}
 	return out.String()
 }
 // CollapseSpaces trims whitespace from both ends of s and reduces each
 // remaining run of whitespace (per unicode.IsSpace) to one ASCII space.
 func CollapseSpaces(s string) string {
 	var out strings.Builder
 	pending := false // a whitespace run was seen and not yet written
 	for _, r := range strings.TrimSpace(s) {
 		switch {
 		case unicode.IsSpace(r):
 			pending = true
 		default:
 			if pending {
 				out.WriteByte(' ')
 				pending = false
 			}
 			out.WriteRune(r)
 		}
 	}
 	return out.String()
 }
--- a/sanitize_test.go
+++ b/sanitize_test.go
@ -3,7 +3,7 @@ package tagengine
 import "testing"
 func TestSanitize(t *testing.T) {
-	sanitize := newSanitizer()
+	sanitize := BasicSanitizer
 	type Case struct {
 		In  string
@ -17,12 +17,7 @@ func TestSanitize(t *testing.T) {
 		{"abc123xyz", "abc 123 xyz"},
 		{"1f2", "1 f 2"},
 		{" abc", "abc"},
-		{" ; KitKat/m&m's (bÖttle) @ ", "; kitkat / m & ms ( bottle ) @"},
+		{" ; KitKat/m&m's (bottle) @ ", "; kitkat / m & m ' s ( bottle ) @"},
 		{" Pott`s gin   königs beer;SOJU  ", "potts gin konigs beer ; soju"},
 		{"brot & brötchen", "brot & brotchen"},
 		{"Gâteau au fromage blanc, Stück", "gateau au fromage blanc , stuck"},
 		{"Maisels Weisse Weißbier 0,5l", "maisels weisse weissbier 0 , 5 l"},
 		{"Maisels´s Weisse - Hefeweizen 0,5l", "maiselss weisse - hefeweizen 0 , 5 l"},
 		{"€", "€"},
 	}