tagengine/ruleset.go

163 lines
3.1 KiB
Go
Raw Permalink Normal View History

2021-09-09 10:25:53 +00:00
package tagengine
import (
"sort"
)
type RuleSet struct {
root *node
maxNgram int
2023-10-13 11:06:20 +00:00
sanitize func(string) string
2021-09-09 10:25:53 +00:00
rules []*Rule
}
func NewRuleSet() *RuleSet {
return &RuleSet{
root: &node{
Token: "/",
Children: map[string]*node{},
},
2023-10-13 11:06:20 +00:00
sanitize: BasicSanitizer,
2021-09-09 10:25:53 +00:00
rules: []*Rule{},
}
}
// NewRuleSetFromList builds a fresh RuleSet with NewRuleSet and registers
// every rule in rules with it, in order, before returning it.
func NewRuleSetFromList(rules []Rule) *RuleSet {
	set := NewRuleSet()
	set.AddRule(rules...)
	return set
}
// Add registers each argument with the set. Every argument must be a Rule or
// a RuleGroup value: a Rule is forwarded to AddRule and a RuleGroup to
// AddRuleGroup, in argument order.
//
// Add panics on the first argument of any other type (pointers to Rule or
// RuleGroup included). Arguments preceding the offending one have already
// been added by then.
func (t *RuleSet) Add(ruleOrGroup ...interface{}) {
	for _, item := range ruleOrGroup {
		if rule, ok := item.(Rule); ok {
			t.AddRule(rule)
			continue
		}
		if group, ok := item.(RuleGroup); ok {
			t.AddRuleGroup(group)
			continue
		}
		panic("Add expects either Rule or RuleGroup objects.")
	}
}
func (t *RuleSet) AddRule(rules ...Rule) {
for _, rule := range rules {
rule := rule
// Make sure rule is well-formed.
2023-10-13 11:06:20 +00:00
rule.normalize(t.sanitize)
2021-09-09 10:25:53 +00:00
// Update maxNgram.
N := rule.maxNGram()
if N > t.maxNgram {
t.maxNgram = N
}
t.rules = append(t.rules, &rule)
t.root.AddRule(&rule)
}
}
// AddRuleGroup registers every rule contained in the given groups. Each group
// is expanded with ToList and the resulting rules are handed to AddRule, one
// group at a time and in argument order.
func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {
	for _, group := range ruleGroups {
		expanded := group.ToList()
		t.AddRule(expanded...)
	}
}
// MatchRules will return a list of all matching rules. The rules are sorted by
2023-10-13 11:06:20 +00:00
// the match's Score. The best match will be first.
2021-09-09 10:25:53 +00:00
func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
input = t.sanitize(input)
tokens := Tokenize(input, t.maxNgram)
rules = t.root.Match(tokens)
if len(rules) == 0 {
return rules
}
// Check excludes.
l := rules[:0]
for _, r := range rules {
if !r.isExcluded(tokens) {
l = append(l, r)
}
}
rules = l
// Sort rules descending.
sort.Slice(rules, func(i, j int) bool {
return ruleLess(rules[j], rules[i])
})
return rules
}
// Match is one entry of the result of RuleSet.Match: a tag together with its
// weight relative to the other returned tags.
type Match struct {
	// Tag is the tag taken from the matching rule(s).
	Tag string

	// Confidence is used to sort all matches, and is normalized so the sum of
	// Confidence values for all matches is 1. Confidence is relative to the
	// number of matches and the size of matches in terms of number of tokens.
	Confidence float64 // In the range (0,1].
}
2023-10-13 11:06:20 +00:00
// Return a list of matches with confidence. This is useful if you'd like to
// find the best matching rule out of all the matched rules.
//
// If you just want to find all matching rules, then use MatchRules.
2021-09-09 10:25:53 +00:00
func (t *RuleSet) Match(input string) []Match {
rules := t.MatchRules(input)
if len(rules) == 0 {
return []Match{}
}
if len(rules) == 1 {
return []Match{{
Tag: rules[0].Tag,
Confidence: 1,
}}
}
// Create list of blocked tags.
blocks := map[string]struct{}{}
for _, rule := range rules {
for _, tag := range rule.Blocks {
blocks[tag] = struct{}{}
}
}
// Remove rules for blocked tags.
iOut := 0
for _, rule := range rules {
if _, ok := blocks[rule.Tag]; ok {
continue
}
rules[iOut] = rule
iOut++
}
rules = rules[:iOut]
// Matches by index.
matches := map[string]int{}
out := []Match{}
sum := float64(0)
for _, rule := range rules {
idx, ok := matches[rule.Tag]
if !ok {
idx = len(matches)
matches[rule.Tag] = idx
out = append(out, Match{Tag: rule.Tag})
}
2023-10-13 11:06:20 +00:00
out[idx].Confidence += float64(rule.Score)
sum += float64(rule.Score)
2021-09-09 10:25:53 +00:00
}
for i := range out {
out[i].Confidence /= sum
}
return out
}