package tagengine import ( "sort" ) type RuleSet struct { root *node maxNgram int sanitize func(string) string rules []*Rule } func NewRuleSet() *RuleSet { return &RuleSet{ root: &node{ Token: "/", Children: map[string]*node{}, }, sanitize: BasicSanitizer, rules: []*Rule{}, } } func NewRuleSetFromList(rules []Rule) *RuleSet { rs := NewRuleSet() rs.AddRule(rules...) return rs } func (t *RuleSet) Add(ruleOrGroup ...interface{}) { for _, ix := range ruleOrGroup { switch x := ix.(type) { case Rule: t.AddRule(x) case RuleGroup: t.AddRuleGroup(x) default: panic("Add expects either Rule or RuleGroup objects.") } } } func (t *RuleSet) AddRule(rules ...Rule) { for _, rule := range rules { rule := rule // Make sure rule is well-formed. rule.normalize(t.sanitize) // Update maxNgram. N := rule.maxNGram() if N > t.maxNgram { t.maxNgram = N } t.rules = append(t.rules, &rule) t.root.AddRule(&rule) } } func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) { for _, rg := range ruleGroups { t.AddRule(rg.ToList()...) } } // MatchRules will return a list of all matching rules. The rules are sorted by // the match's Score. The best match will be first. func (t *RuleSet) MatchRules(input string) (rules []*Rule) { input = t.sanitize(input) tokens := Tokenize(input, t.maxNgram) rules = t.root.Match(tokens) if len(rules) == 0 { return rules } // Check excludes. l := rules[:0] for _, r := range rules { if !r.isExcluded(tokens) { l = append(l, r) } } rules = l // Sort rules descending. sort.Slice(rules, func(i, j int) bool { return ruleLess(rules[j], rules[i]) }) return rules } type Match struct { Tag string // Confidence is used to sort all matches, and is normalized so the sum of // Confidence values for all matches is 1. Confidence is relative to the // number of matches and the size of matches in terms of number of tokens. Confidence float64 // In the range (0,1]. } // Return a list of matches with confidence. This is useful if you'd like to // find the best matching rule out of all the matched rules. // // If you just want to find all matching rules, then use MatchRules. func (t *RuleSet) Match(input string) []Match { rules := t.MatchRules(input) if len(rules) == 0 { return []Match{} } if len(rules) == 1 { return []Match{{ Tag: rules[0].Tag, Confidence: 1, }} } // Create list of blocked tags. blocks := map[string]struct{}{} for _, rule := range rules { for _, tag := range rule.Blocks { blocks[tag] = struct{}{} } } // Remove rules for blocked tags. iOut := 0 for _, rule := range rules { if _, ok := blocks[rule.Tag]; ok { continue } rules[iOut] = rule iOut++ } rules = rules[:iOut] // Matches by index. matches := map[string]int{} out := []Match{} sum := float64(0) for _, rule := range rules { idx, ok := matches[rule.Tag] if !ok { idx = len(matches) matches[rule.Tag] = idx out = append(out, Match{Tag: rule.Tag}) } out[idx].Confidence += float64(rule.Score) sum += float64(rule.Score) } for i := range out { out[i].Confidence /= sum } return out }