go/tagengine/ruleset.go

package tagengine

import (
	"sort"
)

// RuleSet holds a collection of rules and the node tree they are indexed in,
// and matches sanitized, tokenized input against them.
type RuleSet struct {
	// root is the top of the node tree; every added rule is registered on it
	// via root.AddRule, and matching starts from it via root.Match.
	root *node
	// maxNgram is the largest value returned by maxNGram() over all rules
	// added so far. It is passed to Tokenize when matching input.
	maxNgram int
	// sanitize is applied to input before tokenizing and is handed to each
	// rule's normalize step when the rule is added.
	sanitize func(string) string
	// rules holds a pointer to every rule added, in insertion order.
	rules []*Rule
}

// NewRuleSet returns an empty RuleSet. Its root node carries the token "/"
// and has no children, its sanitize function is BasicSanitizer, and its rule
// list is empty (non-nil).
func NewRuleSet() *RuleSet {
	rs := new(RuleSet)
	rs.root = &node{Token: "/", Children: make(map[string]*node)}
	rs.sanitize = BasicSanitizer
	rs.rules = make([]*Rule, 0)
	return rs
}

// NewRuleSetFromList creates a RuleSet with NewRuleSet and adds every rule in
// rules to it with AddRule.
func NewRuleSetFromList(rules []Rule) *RuleSet {
	set := NewRuleSet()
	set.AddRule(rules...)
	return set
}

// Add adds each argument to the set. A Rule value is forwarded to AddRule and
// a RuleGroup value to AddRuleGroup. An argument that is neither causes a
// panic.
func (t *RuleSet) Add(ruleOrGroup ...interface{}) {
	for _, item := range ruleOrGroup {
		// Rule is tested before RuleGroup, in that order.
		if rule, isRule := item.(Rule); isRule {
			t.AddRule(rule)
			continue
		}
		if group, isGroup := item.(RuleGroup); isGroup {
			t.AddRuleGroup(group)
			continue
		}
		panic("Add expects either Rule or RuleGroup objects.")
	}
}

// AddRule normalizes each rule with the set's sanitize function, raises
// maxNgram if the rule needs a larger n-gram, and registers the rule both in
// the set's rule list and on the root node.
func (t *RuleSet) AddRule(rules ...Rule) {
	for i := range rules {
		// Take a per-iteration copy: normalize runs on the copy, and the
		// pointer stored below must be distinct for every rule.
		entry := rules[i]
		stored := &entry

		stored.normalize(t.sanitize)

		if n := stored.maxNGram(); t.maxNgram < n {
			t.maxNgram = n
		}

		t.rules = append(t.rules, stored)
		t.root.AddRule(stored)
	}
}

// AddRuleGroup expands each group into a list of rules with ToList and adds
// those rules with AddRule.
func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {
	for i := range ruleGroups {
		// ToList is called on a copy of the group, not on the caller's
		// slice element.
		group := ruleGroups[i]
		expanded := group.ToList()
		t.AddRule(expanded...)
	}
}

// MatchRules sanitizes and tokenizes input, collects the rules the root node
// matches for those tokens, drops every rule that reports itself excluded by
// the tokens, and returns the remainder ordered best-first according to
// ruleLess.
func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
	tokens := Tokenize(t.sanitize(input), t.maxNgram)

	rules = t.root.Match(tokens)
	if len(rules) == 0 {
		return rules
	}

	// Filter out excluded rules in place, compacting survivors to the front
	// of the same backing array.
	kept := 0
	for _, candidate := range rules {
		if candidate.isExcluded(tokens) {
			continue
		}
		rules[kept] = candidate
		kept++
	}
	rules = rules[:kept]

	// Descending order: a sorts before b when b is "less" than a.
	sort.Slice(rules, func(a, b int) bool {
		return ruleLess(rules[b], rules[a])
	})

	return rules
}

// Match is one entry of the result of RuleSet.Match: a tag together with its
// share of the total score of all matched rules.
type Match struct {
	// Tag is the Tag of the rule (or rules) this match was built from.
	Tag string

	// Confidence is used to sort all matches, and is normalized so the sum of
	// Confidence values for all matches is 1. Confidence is relative to the
	// number of matches and the size of matches in terms of number of tokens.
	Confidence float64 // In the range (0,1].
}

// Match returns one Match per distinct tag among the rules matched by input,
// ordered by Confidence with the highest first. This is useful if you'd like
// to find the best matching tag out of all the matched rules.
//
// Rules whose tag appears in the Blocks list of any matched rule are dropped.
// The scores of the remaining rules are summed per tag and divided by the
// total, so the Confidence values add up to 1. If nothing matches, or every
// matched rule is blocked, an empty (non-nil) slice is returned.
//
// If you just want to find all matching rules, then use MatchRules.
func (t *RuleSet) Match(input string) []Match {
	rules := t.MatchRules(input)
	if len(rules) == 0 {
		return []Match{}
	}
	if len(rules) == 1 {
		// A single rule takes the whole confidence; blocking is not applied.
		return []Match{{
			Tag:        rules[0].Tag,
			Confidence: 1,
		}}
	}

	// Collect every tag blocked by any matched rule.
	blocks := map[string]struct{}{}
	for _, rule := range rules {
		for _, tag := range rule.Blocks {
			blocks[tag] = struct{}{}
		}
	}

	// Remove rules for blocked tags, compacting survivors in place.
	iOut := 0
	for _, rule := range rules {
		if _, ok := blocks[rule.Tag]; ok {
			continue
		}
		rules[iOut] = rule
		iOut++
	}
	rules = rules[:iOut]

	// matches maps a tag to its index in out.
	matches := make(map[string]int, len(rules))
	out := make([]Match, 0, len(rules))
	sum := float64(0)

	for _, rule := range rules {
		idx, ok := matches[rule.Tag]
		if !ok {
			idx = len(out)
			matches[rule.Tag] = idx
			out = append(out, Match{Tag: rule.Tag})
		}
		score := float64(rule.Score)
		out[idx].Confidence += score
		sum += score
	}

	if sum != 0 {
		for i := range out {
			out[i].Confidence /= sum
		}
	} else {
		// Dividing by a zero total would produce NaN; share the confidence
		// equally instead so the values still add up to 1.
		for i := range out {
			out[i].Confidence = 1 / float64(len(out))
		}
	}

	// Summing per tag can lift a tag above one that appeared earlier, so
	// order by the aggregated confidence. The stable sort keeps ties in
	// their first-appearance order.
	sort.SliceStable(out, func(i, j int) bool {
		return out[i].Confidence > out[j].Confidence
	})

	return out
}
wip 2024-11-11 05:36:55 +00:00			`package tagengine`

			`import (`
			`"sort"`
			`)`

			`type RuleSet struct {`
			`root *node`
			`maxNgram int`
			`sanitize func(string) string`
			`rules []*Rule`
			`}`

			`func NewRuleSet() *RuleSet {`
			`return &RuleSet{`
			`root: &node{`
			`Token: "/",`
			`Children: map[string]*node{},`
			`},`
			`sanitize: BasicSanitizer,`
			`rules: []*Rule{},`
			`}`
			`}`

			`func NewRuleSetFromList(rules []Rule) *RuleSet {`
			`rs := NewRuleSet()`
			`rs.AddRule(rules...)`
			`return rs`
			`}`

			`func (t *RuleSet) Add(ruleOrGroup ...interface{}) {`
			`for _, ix := range ruleOrGroup {`
			`switch x := ix.(type) {`
			`case Rule:`
			`t.AddRule(x)`
			`case RuleGroup:`
			`t.AddRuleGroup(x)`
			`default:`
			`panic("Add expects either Rule or RuleGroup objects.")`
			`}`
			`}`
			`}`

			`func (t *RuleSet) AddRule(rules ...Rule) {`
			`for _, rule := range rules {`
			`rule := rule`

			`// Make sure rule is well-formed.`
			`rule.normalize(t.sanitize)`

			`// Update maxNgram.`
			`N := rule.maxNGram()`
			`if N > t.maxNgram {`
			`t.maxNgram = N`
			`}`

			`t.rules = append(t.rules, &rule)`
			`t.root.AddRule(&rule)`
			`}`
			`}`

			`func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {`
			`for _, rg := range ruleGroups {`
			`t.AddRule(rg.ToList()...)`
			`}`
			`}`

			`// MatchRules will return a list of all matching rules. The rules are sorted by`
			`// the match's Score. The best match will be first.`
			`func (t RuleSet) MatchRules(input string) (rules []Rule) {`
			`input = t.sanitize(input)`
			`tokens := Tokenize(input, t.maxNgram)`

			`rules = t.root.Match(tokens)`
			`if len(rules) == 0 {`
			`return rules`
			`}`

			`// Check excludes.`
			`l := rules[:0]`
			`for _, r := range rules {`
			`if !r.isExcluded(tokens) {`
			`l = append(l, r)`
			`}`
			`}`

			`rules = l`

			`// Sort rules descending.`
			`sort.Slice(rules, func(i, j int) bool {`
			`return ruleLess(rules[j], rules[i])`
			`})`

			`return rules`
			`}`

			`type Match struct {`
			`Tag string`

			`// Confidence is used to sort all matches, and is normalized so the sum of`
			`// Confidence values for all matches is 1. Confidence is relative to the`
			`// number of matches and the size of matches in terms of number of tokens.`
			`Confidence float64 // In the range (0,1].`
			`}`

			`// Return a list of matches with confidence. This is useful if you'd like to`
			`// find the best matching rule out of all the matched rules.`
			`//`
			`// If you just want to find all matching rules, then use MatchRules.`
			`func (t *RuleSet) Match(input string) []Match {`
			`rules := t.MatchRules(input)`
			`if len(rules) == 0 {`
			`return []Match{}`
			`}`
			`if len(rules) == 1 {`
			`return []Match{{`
			`Tag: rules[0].Tag,`
			`Confidence: 1,`
			`}}`
			`}`

			`// Create list of blocked tags.`
			`blocks := map[string]struct{}{}`
			`for _, rule := range rules {`
			`for _, tag := range rule.Blocks {`
			`blocks[tag] = struct{}{}`
			`}`
			`}`

			`// Remove rules for blocked tags.`
			`iOut := 0`
			`for _, rule := range rules {`
			`if _, ok := blocks[rule.Tag]; ok {`
			`continue`
			`}`
			`rules[iOut] = rule`
			`iOut++`
			`}`
			`rules = rules[:iOut]`

			`// Matches by index.`
			`matches := map[string]int{}`
			`out := []Match{}`
			`sum := float64(0)`

			`for _, rule := range rules {`
			`idx, ok := matches[rule.Tag]`
			`if !ok {`
			`idx = len(matches)`
			`matches[rule.Tag] = idx`
			`out = append(out, Match{Tag: rule.Tag})`
			`}`
			`out[idx].Confidence += float64(rule.Score)`
			`sum += float64(rule.Score)`
			`}`

			`for i := range out {`
			`out[i].Confidence /= sum`
			`}`

			`return out`
			`}`