diff --git a/go.mod b/go.mod index f57ac59..ead4272 100644 --- a/go.mod +++ b/go.mod @@ -1,5 +1,3 @@ module git.crumpington.com/public/tagengine -go 1.17 - -require golang.org/x/text v0.3.7 +go 1.21.1 diff --git a/go.sum b/go.sum index 2274b80..e69de29 100644 --- a/go.sum +++ b/go.sum @@ -1,3 +0,0 @@ -golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk= -golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= -golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= diff --git a/node.go b/node.go index 0aec2f8..c48d982 100644 --- a/node.go +++ b/node.go @@ -7,7 +7,7 @@ import ( type node struct { Token string - Matches []*Rule + Matches []*Rule // If a list of tokens reaches this node, it matches these. Children map[string]*node } diff --git a/rule.go b/rule.go index ef23c2a..77e742e 100644 --- a/rule.go +++ b/rule.go @@ -1,15 +1,25 @@ package tagengine type Rule struct { - Tag string + // The purpose of a Rule is to attach its Tag to matching text. + Tag string + + // Includes is a list of strings that must be found in the input in order to + // match. Includes []string + + // Excludes is a list of strings that can exclude a match for this rule. Excludes []string - Blocks []string // List of blocked tags. - MatchCount int - FirstCount int + // Blocks: If this rule is matched, then it will block matches of any tags + // listed here. + Blocks []string - score int + // The Score encodes the complexity of the Rule. A higher score indicates a + // more specific match. A Rule with more includes, or includes with multiple words + // should have a higher Score than a Rule with fewer includes or less + // complex includes. 
+ Score int excludes map[string]struct{} } @@ -21,7 +31,7 @@ func NewRule(tag string) Rule { func (r Rule) Inc(l ...string) Rule { return Rule{ Tag: r.Tag, - Includes: l, + Includes: append(r.Includes, l...), Excludes: r.Excludes, Blocks: r.Blocks, } @@ -31,7 +41,7 @@ func (r Rule) Exc(l ...string) Rule { return Rule{ Tag: r.Tag, Includes: r.Includes, - Excludes: l, + Excludes: append(r.Excludes, l...), Blocks: r.Blocks, } } @@ -41,13 +51,11 @@ func (r Rule) Block(l ...string) Rule { Tag: r.Tag, Includes: r.Includes, Excludes: r.Excludes, - Blocks: l, + Blocks: append(r.Blocks, l...), } } -func (rule *Rule) normalize() { - sanitize := newSanitizer() - +func (rule *Rule) normalize(sanitize func(string) string) { for i, token := range rule.Includes { rule.Includes[i] = sanitize(token) } @@ -63,7 +71,7 @@ func (rule *Rule) normalize() { rule.excludes[s] = struct{}{} } - rule.score = rule.computeScore() + rule.Score = rule.computeScore() } func (r Rule) maxNGram() int { @@ -108,8 +116,8 @@ func (r Rule) computeScore() (score int) { func ruleLess(lhs, rhs *Rule) bool { // If scores differ, sort by score. - if lhs.score != rhs.score { - return lhs.score < rhs.score + if lhs.Score != rhs.Score { + return lhs.Score < rhs.Score } // If include depth differs, sort by depth. diff --git a/rulegroup.go b/rulegroup.go index 1409f8d..3a30657 100644 --- a/rulegroup.go +++ b/rulegroup.go @@ -1,7 +1,7 @@ package tagengine // A RuleGroup can be converted into a list of rules. Each rule will point to -// the same tag, and have the same exclude set. +// the same tag, and have the same exclude set and blocks. 
type RuleGroup struct { Tag string Includes [][]string @@ -31,7 +31,7 @@ func (g RuleGroup) Exc(l ...string) RuleGroup { return RuleGroup{ Tag: g.Tag, Includes: g.Includes, - Excludes: l, + Excludes: append(g.Excludes, l...), Blocks: g.Blocks, } } @@ -41,16 +41,17 @@ func (g RuleGroup) Block(l ...string) RuleGroup { Tag: g.Tag, Includes: g.Includes, Excludes: g.Excludes, - Blocks: l, + Blocks: append(g.Blocks, l...), } } -func (rg RuleGroup) ToList() (l []Rule) { - for _, includes := range rg.Includes { +func (g RuleGroup) ToList() (l []Rule) { + for _, includes := range g.Includes { l = append(l, Rule{ - Tag: rg.Tag, - Excludes: rg.Excludes, + Tag: g.Tag, + Excludes: g.Excludes, Includes: includes, + Blocks: g.Blocks, }) } return diff --git a/ruleset.go b/ruleset.go index c06a16e..1efe341 100644 --- a/ruleset.go +++ b/ruleset.go @@ -7,7 +7,7 @@ import ( type RuleSet struct { root *node maxNgram int - sanitize func(...string) string + sanitize func(string) string rules []*Rule } @@ -17,7 +17,7 @@ func NewRuleSet() *RuleSet { Token: "/", Children: map[string]*node{}, }, - sanitize: newSanitizer(), + sanitize: BasicSanitizer, rules: []*Rule{}, } } @@ -46,7 +46,7 @@ func (t *RuleSet) AddRule(rules ...Rule) { rule := rule // Make sure rule is well-formed. - rule.normalize() + rule.normalize(t.sanitize) // Update maxNgram. N := rule.maxNGram() @@ -66,7 +66,7 @@ func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) { } // MatchRules will return a list of all matching rules. The rules are sorted by -// the match's "score". The best match will be first. +// the match's Score. The best match will be first. func (t *RuleSet) MatchRules(input string) (rules []*Rule) { input = t.sanitize(input) tokens := Tokenize(input, t.maxNgram) @@ -91,23 +91,22 @@ func (t *RuleSet) MatchRules(input string) (rules []*Rule) { return ruleLess(rules[j], rules[i]) }) - // Update rule stats. 
- if len(rules) > 0 { - rules[0].FirstCount++ - for _, r := range rules { - r.MatchCount++ - } - } - return rules } type Match struct { - Tag string + Tag string + + // Confidence is used to sort all matches, and is normalized so the sum of + // Confidence values for all matches is 1. Confidence is relative to the + // number of matches and the size of matches in terms of number of tokens. Confidence float64 // In the range (0,1]. } -// Return a list of matches with confidence. +// Return a list of matches with confidence. This is useful if you'd like to +// find the best matching rule out of all the matched rules. +// +// If you just want to find all matching rules, then use MatchRules. func (t *RuleSet) Match(input string) []Match { rules := t.MatchRules(input) if len(rules) == 0 { @@ -151,8 +150,8 @@ func (t *RuleSet) Match(input string) []Match { matches[rule.Tag] = idx out = append(out, Match{Tag: rule.Tag}) } - out[idx].Confidence += float64(rule.score) - sum += float64(rule.score) + out[idx].Confidence += float64(rule.Score) + sum += float64(rule.Score) } for i := range out { @@ -161,21 +160,3 @@ func (t *RuleSet) Match(input string) []Match { return out } - -// ListRules returns rules used in the ruleset sorted by the rules' -// FirstCount. This is the number of times the given rule was the best match to -// an input. 
-func (t *RuleSet) ListRules() []*Rule { - sort.Slice(t.rules, func(i, j int) bool { - if t.rules[j].FirstCount != t.rules[i].FirstCount { - return t.rules[j].FirstCount < t.rules[i].FirstCount - } - - if t.rules[j].MatchCount != t.rules[i].MatchCount { - return t.rules[j].MatchCount < t.rules[i].MatchCount - } - - return t.rules[j].Tag < t.rules[i].Tag - }) - return t.rules -} diff --git a/sanitize.go b/sanitize.go index d1ed6f3..bcb966d 100644 --- a/sanitize.go +++ b/sanitize.go @@ -2,124 +2,19 @@ package tagengine import ( "strings" - "unicode" - "golang.org/x/text/runes" - "golang.org/x/text/transform" - "golang.org/x/text/unicode/norm" + "git.crumpington.com/public/tagengine/sanitize" ) -func newSanitizer() func(...string) string { - diactricsFix := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC) - - return func(l ...string) string { - - s := strings.Join(l, " ") - - // Lowercase. - s = strings.ToLower(s) - - // Remove apostrophes. - s = strings.ReplaceAll(s, "ß", "ss") - s = strings.ReplaceAll(s, "'s", "s") - s = strings.ReplaceAll(s, "`s", "s") - s = strings.ReplaceAll(s, "´s", "s") - - // Remove diacritics. - if out, _, err := transform.String(diactricsFix, s); err == nil { - s = out - } - - // Clean spaces. - s = spaceNumbers(s) - s = addSpaces(s) - s = collapseSpaces(s) - - return s - } -} - -func spaceNumbers(s string) string { - if len(s) == 0 { - return s - } - - isDigit := func(b rune) bool { - switch b { - case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': - return true - } - return false - } - - b := strings.Builder{} - - var first rune - for _, c := range s { - first = c - break - } - - digit := isDigit(first) - - // Range over runes. 
- for _, c := range s { - thisDigit := isDigit(c) - if thisDigit != digit { - b.WriteByte(' ') - digit = thisDigit - } - b.WriteRune(c) - } - - return b.String() -} - -func addSpaces(s string) string { - needsSpace := func(r rune) bool { - switch r { - case '`', '~', '!', '@', '#', '%', '^', '&', '*', '(', ')', - '-', '_', '+', '=', '[', '{', ']', '}', '\\', '|', - ':', ';', '"', '\'', ',', '<', '.', '>', '?', '/': - return true - } - return false - } - - b := strings.Builder{} - - // Range over runes. - for _, r := range s { - if needsSpace(r) { - b.WriteRune(' ') - b.WriteRune(r) - b.WriteRune(' ') - } else { - b.WriteRune(r) - } - } - - return b.String() -} - -func collapseSpaces(s string) string { - // Trim leading and trailing spaces. - s = strings.TrimSpace(s) - - b := strings.Builder{} - wasSpace := false - - // Range over runes. - for _, c := range s { - if unicode.IsSpace(c) { - wasSpace = true - continue - } else if wasSpace { - wasSpace = false - b.WriteRune(' ') - } - b.WriteRune(c) - } - - return b.String() +// The basic sanitizer: +// * lower-case +// * put spaces around numbers +// * put spaces around punctuation +// * collapse multiple spaces +func BasicSanitizer(s string) string { + s = strings.ToLower(s) + s = sanitize.SpaceNumbers(s) + s = sanitize.SpacePunctuation(s) + s = sanitize.CollapseSpaces(s) + return s } diff --git a/sanitize/sanitize.go b/sanitize/sanitize.go new file mode 100644 index 0000000..b786eb1 --- /dev/null +++ b/sanitize/sanitize.go @@ -0,0 +1,91 @@ +package sanitize + +import ( + "strings" + "unicode" +) + +func SpaceNumbers(s string) string { + if len(s) == 0 { + return s + } + + isDigit := func(b rune) bool { + switch b { + case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9': + return true + } + return false + } + + b := strings.Builder{} + + var first rune + for _, c := range s { + first = c + break + } + + digit := isDigit(first) + + // Range over runes. 
+ for _, c := range s { + thisDigit := isDigit(c) + if thisDigit != digit { + b.WriteByte(' ') + digit = thisDigit + } + b.WriteRune(c) + } + + return b.String() +} + +func SpacePunctuation(s string) string { + needsSpace := func(r rune) bool { + switch r { + case '`', '~', '!', '@', '#', '%', '^', '&', '*', '(', ')', + '-', '_', '+', '=', '[', '{', ']', '}', '\\', '|', + ':', ';', '"', '\'', ',', '<', '.', '>', '?', '/': + return true + } + return false + } + + b := strings.Builder{} + + // Range over runes. + for _, r := range s { + if needsSpace(r) { + b.WriteRune(' ') + b.WriteRune(r) + b.WriteRune(' ') + } else { + b.WriteRune(r) + } + } + + return b.String() +} + +func CollapseSpaces(s string) string { + // Trim leading and trailing spaces. + s = strings.TrimSpace(s) + + b := strings.Builder{} + wasSpace := false + + // Range over runes. + for _, c := range s { + if unicode.IsSpace(c) { + wasSpace = true + continue + } else if wasSpace { + wasSpace = false + b.WriteRune(' ') + } + b.WriteRune(c) + } + + return b.String() +} diff --git a/sanitize_test.go b/sanitize_test.go index e61114b..82aca01 100644 --- a/sanitize_test.go +++ b/sanitize_test.go @@ -3,7 +3,7 @@ package tagengine import "testing" func TestSanitize(t *testing.T) { - sanitize := newSanitizer() + sanitize := BasicSanitizer type Case struct { In string @@ -17,12 +17,7 @@ func TestSanitize(t *testing.T) { {"abc123xyz", "abc 123 xyz"}, {"1f2", "1 f 2"}, {" abc", "abc"}, - {" ; KitKat/m&m's (bÖttle) @ ", "; kitkat / m & ms ( bottle ) @"}, - {" Pott`s gin königs beer;SOJU ", "potts gin konigs beer ; soju"}, - {"brot & brötchen", "brot & brotchen"}, - {"Gâteau au fromage blanc, Stück", "gateau au fromage blanc , stuck"}, - {"Maisels Weisse Weißbier 0,5l", "maisels weisse weissbier 0 , 5 l"}, - {"Maisels´s Weisse - Hefeweizen 0,5l", "maiselss weisse - hefeweizen 0 , 5 l"}, + {" ; KitKat/m&m's (bottle) @ ", "; kitkat / m & m ' s ( bottle ) @"}, {"€", "€"}, }