package tagengine import ( "sort" ) type RuleSet struct { root *node maxNgram int sanitize func(...string) string rules []*Rule } func NewRuleSet() *RuleSet { return &RuleSet{ root: &node{ Token: "/", Children: map[string]*node{}, }, sanitize: newSanitizer(), rules: []*Rule{}, } } func NewRuleSetFromList(rules []Rule) *RuleSet { rs := NewRuleSet() rs.AddRule(rules...) return rs } func (t *RuleSet) Add(ruleOrGroup ...interface{}) { for _, ix := range ruleOrGroup { switch x := ix.(type) { case Rule: t.AddRule(x) case RuleGroup: t.AddRuleGroup(x) default: panic("Add expects either Rule or RuleGroup objects.") } } } func (t *RuleSet) AddRule(rules ...Rule) { for _, rule := range rules { rule := rule // Make sure rule is well-formed. rule.normalize() // Update maxNgram. N := rule.maxNGram() if N > t.maxNgram { t.maxNgram = N } t.rules = append(t.rules, &rule) t.root.AddRule(&rule) } } func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) { for _, rg := range ruleGroups { t.AddRule(rg.ToList()...) } } // MatchRules will return a list of all matching rules. The rules are sorted by // the match's "score". The best match will be first. func (t *RuleSet) MatchRules(input string) (rules []*Rule) { input = t.sanitize(input) tokens := Tokenize(input, t.maxNgram) rules = t.root.Match(tokens) if len(rules) == 0 { return rules } // Check excludes. l := rules[:0] for _, r := range rules { if !r.isExcluded(tokens) { l = append(l, r) } } rules = l // Sort rules descending. sort.Slice(rules, func(i, j int) bool { return ruleLess(rules[j], rules[i]) }) // Update rule stats. if len(rules) > 0 { rules[0].FirstCount++ for _, r := range rules { r.MatchCount++ } } return rules } type Match struct { Tag string Confidence float64 // In the range (0,1]. } // Return a list of matches with confidence. func (t *RuleSet) Match(input string) []Match { rules := t.MatchRules(input) if len(rules) == 0 { return []Match{} } if len(rules) == 1 { return []Match{{ Tag: rules[0].Tag, Confidence: 1, }} } // Create list of blocked tags. blocks := map[string]struct{}{} for _, rule := range rules { for _, tag := range rule.Blocks { blocks[tag] = struct{}{} } } // Remove rules for blocked tags. iOut := 0 for _, rule := range rules { if _, ok := blocks[rule.Tag]; ok { continue } rules[iOut] = rule iOut++ } rules = rules[:iOut] // Matches by index. matches := map[string]int{} out := []Match{} sum := float64(0) for _, rule := range rules { idx, ok := matches[rule.Tag] if !ok { idx = len(matches) matches[rule.Tag] = idx out = append(out, Match{Tag: rule.Tag}) } out[idx].Confidence += float64(rule.score) sum += float64(rule.score) } for i := range out { out[i].Confidence /= sum } return out } // ListRules returns rules used in the ruleset sorted by the rules' // FirstCount. This is the number of times the given rule was the best match to // an input. func (t *RuleSet) ListRules() []*Rule { sort.Slice(t.rules, func(i, j int) bool { if t.rules[j].FirstCount != t.rules[i].FirstCount { return t.rules[j].FirstCount < t.rules[i].FirstCount } if t.rules[j].MatchCount != t.rules[i].MatchCount { return t.rules[j].MatchCount < t.rules[i].MatchCount } return t.rules[j].Tag < t.rules[i].Tag }) return t.rules }