163 lines
3.1 KiB
Go
163 lines
3.1 KiB
Go
package tagengine
|
|
|
|
import (
|
|
"sort"
|
|
)
|
|
|
|
type RuleSet struct {
|
|
root *node
|
|
maxNgram int
|
|
sanitize func(string) string
|
|
rules []*Rule
|
|
}
|
|
|
|
func NewRuleSet() *RuleSet {
|
|
return &RuleSet{
|
|
root: &node{
|
|
Token: "/",
|
|
Children: map[string]*node{},
|
|
},
|
|
sanitize: BasicSanitizer,
|
|
rules: []*Rule{},
|
|
}
|
|
}
|
|
|
|
func NewRuleSetFromList(rules []Rule) *RuleSet {
|
|
rs := NewRuleSet()
|
|
rs.AddRule(rules...)
|
|
return rs
|
|
}
|
|
|
|
func (t *RuleSet) Add(ruleOrGroup ...interface{}) {
|
|
for _, ix := range ruleOrGroup {
|
|
switch x := ix.(type) {
|
|
case Rule:
|
|
t.AddRule(x)
|
|
case RuleGroup:
|
|
t.AddRuleGroup(x)
|
|
default:
|
|
panic("Add expects either Rule or RuleGroup objects.")
|
|
}
|
|
}
|
|
}
|
|
|
|
func (t *RuleSet) AddRule(rules ...Rule) {
|
|
for _, rule := range rules {
|
|
rule := rule
|
|
|
|
// Make sure rule is well-formed.
|
|
rule.normalize(t.sanitize)
|
|
|
|
// Update maxNgram.
|
|
N := rule.maxNGram()
|
|
if N > t.maxNgram {
|
|
t.maxNgram = N
|
|
}
|
|
|
|
t.rules = append(t.rules, &rule)
|
|
t.root.AddRule(&rule)
|
|
}
|
|
}
|
|
|
|
func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {
|
|
for _, rg := range ruleGroups {
|
|
t.AddRule(rg.ToList()...)
|
|
}
|
|
}
|
|
|
|
// MatchRules will return a list of all matching rules. The rules are sorted by
|
|
// the match's Score. The best match will be first.
|
|
func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
|
|
input = t.sanitize(input)
|
|
tokens := Tokenize(input, t.maxNgram)
|
|
|
|
rules = t.root.Match(tokens)
|
|
if len(rules) == 0 {
|
|
return rules
|
|
}
|
|
|
|
// Check excludes.
|
|
l := rules[:0]
|
|
for _, r := range rules {
|
|
if !r.isExcluded(tokens) {
|
|
l = append(l, r)
|
|
}
|
|
}
|
|
|
|
rules = l
|
|
|
|
// Sort rules descending.
|
|
sort.Slice(rules, func(i, j int) bool {
|
|
return ruleLess(rules[j], rules[i])
|
|
})
|
|
|
|
return rules
|
|
}
|
|
|
|
type Match struct {
|
|
Tag string
|
|
|
|
// Confidence is used to sort all matches, and is normalized so the sum of
|
|
// Confidence values for all matches is 1. Confidence is relative to the
|
|
// number of matches and the size of matches in terms of number of tokens.
|
|
Confidence float64 // In the range (0,1].
|
|
}
|
|
|
|
// Return a list of matches with confidence. This is useful if you'd like to
|
|
// find the best matching rule out of all the matched rules.
|
|
//
|
|
// If you just want to find all matching rules, then use MatchRules.
|
|
func (t *RuleSet) Match(input string) []Match {
|
|
rules := t.MatchRules(input)
|
|
if len(rules) == 0 {
|
|
return []Match{}
|
|
}
|
|
if len(rules) == 1 {
|
|
return []Match{{
|
|
Tag: rules[0].Tag,
|
|
Confidence: 1,
|
|
}}
|
|
}
|
|
|
|
// Create list of blocked tags.
|
|
blocks := map[string]struct{}{}
|
|
for _, rule := range rules {
|
|
for _, tag := range rule.Blocks {
|
|
blocks[tag] = struct{}{}
|
|
}
|
|
}
|
|
|
|
// Remove rules for blocked tags.
|
|
iOut := 0
|
|
for _, rule := range rules {
|
|
if _, ok := blocks[rule.Tag]; ok {
|
|
continue
|
|
}
|
|
rules[iOut] = rule
|
|
iOut++
|
|
}
|
|
rules = rules[:iOut]
|
|
|
|
// Matches by index.
|
|
matches := map[string]int{}
|
|
out := []Match{}
|
|
sum := float64(0)
|
|
|
|
for _, rule := range rules {
|
|
idx, ok := matches[rule.Tag]
|
|
if !ok {
|
|
idx = len(matches)
|
|
matches[rule.Tag] = idx
|
|
out = append(out, Match{Tag: rule.Tag})
|
|
}
|
|
out[idx].Confidence += float64(rule.Score)
|
|
sum += float64(rule.Score)
|
|
}
|
|
|
|
for i := range out {
|
|
out[i].Confidence /= sum
|
|
}
|
|
|
|
return out
|
|
}
|