// tagengine/rule.go

package tagengine

// Rule attaches its Tag to text that satisfies the rule's include and exclude
// lists.
type Rule struct {
	// Tag is the label this Rule attaches to matching text.
	Tag string

	// Includes is a list of strings that must be found in the input in order to
	// match.
	Includes []string

	// Excludes is a list of strings that can exclude a match for this rule.
	Excludes []string

	// Blocks: If this rule is matched, then it will block matches of any tags
	// listed here.
	Blocks []string

	// Score encodes the complexity of the Rule. A higher score indicates a
	// more specific match. A Rule with more includes, or with includes made
	// up of multiple words, should have a higher Score than a Rule with fewer
	// or less complex includes. normalize overwrites this field with the
	// result of computeScore.
	Score int

	// excludes is a set built from Excludes by normalize; isExcluded consults
	// it. It stays nil until normalize has run.
	excludes map[string]struct{}
}

// NewRule returns a Rule that carries the given tag and has every other field
// left at its zero value.
func NewRule(tag string) Rule {
	var r Rule
	r.Tag = tag
	return r
}

// Inc returns a new Rule with l appended to the receiver's Includes. Tag,
// Excludes and Blocks are carried over unchanged; Score and the internal
// exclude set are not copied (normalize recomputes both).
//
// The receiver's Includes are never written to: the slice is capped at its
// current length before appending, so the append must allocate instead of
// reusing spare capacity. Without the cap, two rules derived from the same
// base could silently overwrite each other's includes.
func (r Rule) Inc(l ...string) Rule {
	n := len(r.Includes)
	return Rule{
		Tag:      r.Tag,
		Includes: append(r.Includes[:n:n], l...),
		Excludes: r.Excludes,
		Blocks:   r.Blocks,
	}
}

// Exc returns a new Rule with l appended to the receiver's Excludes. Tag,
// Includes and Blocks are carried over unchanged; Score and the internal
// exclude set are not copied (normalize recomputes both).
//
// The receiver's Excludes are never written to: the slice is capped at its
// current length before appending, so the append must allocate instead of
// reusing spare capacity. Without the cap, two rules derived from the same
// base could silently overwrite each other's excludes.
func (r Rule) Exc(l ...string) Rule {
	n := len(r.Excludes)
	return Rule{
		Tag:      r.Tag,
		Includes: r.Includes,
		Excludes: append(r.Excludes[:n:n], l...),
		Blocks:   r.Blocks,
	}
}

// Block returns a new Rule with l appended to the receiver's Blocks. Tag,
// Includes and Excludes are carried over unchanged; Score and the internal
// exclude set are not copied (normalize recomputes both).
//
// The receiver's Blocks are never written to: the slice is capped at its
// current length before appending, so the append must allocate instead of
// reusing spare capacity. Without the cap, two rules derived from the same
// base could silently overwrite each other's blocked tags.
func (r Rule) Block(l ...string) Rule {
	n := len(r.Blocks)
	return Rule{
		Tag:      r.Tag,
		Includes: r.Includes,
		Excludes: r.Excludes,
		Blocks:   append(r.Blocks[:n:n], l...),
	}
}

// normalize rewrites the rule in place so it is ready for matching:
//
//  1. every entry of Includes and Excludes is replaced by sanitize(entry);
//  2. both lists are passed to sortTokens;
//  3. the exclude lookup set is rebuilt from the sanitized Excludes;
//  4. Score is recomputed via computeScore.
//
// The Includes and Excludes slices are modified in place, so any other holder
// of the same backing arrays observes the sanitized, sorted contents too.
func (r *Rule) normalize(sanitize func(string) string) {
	for i, token := range r.Includes {
		r.Includes[i] = sanitize(token)
	}
	for i, token := range r.Excludes {
		r.Excludes[i] = sanitize(token)
	}

	sortTokens(r.Includes)
	sortTokens(r.Excludes)

	// Always start from a fresh set so entries from an earlier normalize call
	// cannot linger; the final size is bounded by len(r.Excludes).
	r.excludes = make(map[string]struct{}, len(r.Excludes))
	for _, s := range r.Excludes {
		r.excludes[s] = struct{}{}
	}

	r.Score = r.computeScore()
}

// maxNGram returns the largest ngramLength found among the rule's Includes
// and Excludes, or 0 when both lists are empty.
func (r Rule) maxNGram() int {
	longest := 0
	// Includes are visited first, then Excludes.
	for _, tokens := range [][]string{r.Includes, r.Excludes} {
		for _, token := range tokens {
			if n := ngramLength(token); n > longest {
				longest = n
			}
		}
	}
	return longest
}

// isExcluded reports whether any of the given tokens is present in the rule's
// exclude set. The set is the unexported excludes map, not the Excludes
// slice, so a rule whose set has not been built never excludes anything.
func (r Rule) isExcluded(tokens []string) bool {
	blocked := r.excludes

	// Fast path: most rules carry no excludes at all.
	if len(blocked) == 0 {
		return false
	}

	for i := range tokens {
		if _, hit := blocked[tokens[i]]; hit {
			return true
		}
	}
	return false
}

// computeScore sums, over every include, the triangular number of that
// include's ngramLength: n*(n+1)/2. Excludes and Blocks do not contribute.
func (r Rule) computeScore() (score int) {
	for i := range r.Includes {
		words := ngramLength(r.Includes[i])
		score += words * (words + 1) / 2
	}
	return score
}

// ruleLess reports whether lhs orders strictly before rhs. The comparison
// keys, in priority order, are:
//
//  1. Score (lower first)
//  2. number of Includes (fewer first)
//  3. number of Excludes (fewer first)
//  4. Includes, compared element by element as strings
//  5. Excludes, compared element by element as strings
//  6. Tag
//
// Rules that agree on every key are not less than one another.
func ruleLess(lhs, rhs *Rule) bool {
	switch {
	case lhs.Score != rhs.Score:
		return lhs.Score < rhs.Score
	case len(lhs.Includes) != len(rhs.Includes):
		return len(lhs.Includes) < len(rhs.Includes)
	case len(lhs.Excludes) != len(rhs.Excludes):
		return len(lhs.Excludes) < len(rhs.Excludes)
	}

	// From here on both sides have equally long lists, so indexing rhs with
	// an index taken from lhs is in range.
	for i, inc := range lhs.Includes {
		if other := rhs.Includes[i]; inc != other {
			return inc < other
		}
	}
	for i, exc := range lhs.Excludes {
		if other := rhs.Excludes[i]; exc != other {
			return exc < other
		}
	}

	// Last resort: the tag. Equal tags yield false.
	return lhs.Tag < rhs.Tag
}
Forked from subrubia 2021-09-09 10:25:53 +00:00			`package tagengine`

			`type Rule struct {`
Cleanup for v1. 2023-10-13 11:06:20 +00:00			`// The purpose of a Rule is to attach it's Tag to matching text.`
			`Tag string`

			`// Includes is a list of strings that must be found in the input in order to`
			`// match.`
Forked from subrubia 2021-09-09 10:25:53 +00:00			`Includes []string`
Cleanup for v1. 2023-10-13 11:06:20 +00:00
			`// Excludes is a list of strings that can exclude a match for this rule.`
Forked from subrubia 2021-09-09 10:25:53 +00:00			`Excludes []string`

Cleanup for v1. 2023-10-13 11:06:20 +00:00			`// Blocks: If this rule is matched, then it will block matches of any tags`
			`// listed here.`
			`Blocks []string`
Forked from subrubia 2021-09-09 10:25:53 +00:00
Cleanup for v1. 2023-10-13 11:06:20 +00:00			`// The Score encodes the complexity of the Rule. A higher score indicates a`
			`// more specific match. A Rule more includes, or includes with multiple words`
			`// should havee a higher Score than a Rule with fewer includes or less`
			`// complex includes.`
			`Score int`
Forked from subrubia 2021-09-09 10:25:53 +00:00
			`excludes map[string]struct{}`
			`}`

			`func NewRule(tag string) Rule {`
			`return Rule{Tag: tag}`
			`}`

			`func (r Rule) Inc(l ...string) Rule {`
			`return Rule{`
			`Tag: r.Tag,`
Cleanup for v1. 2023-10-13 11:06:20 +00:00			`Includes: append(r.Includes, l...),`
Forked from subrubia 2021-09-09 10:25:53 +00:00			`Excludes: r.Excludes,`
			`Blocks: r.Blocks,`
			`}`
			`}`

			`func (r Rule) Exc(l ...string) Rule {`
			`return Rule{`
			`Tag: r.Tag,`
			`Includes: r.Includes,`
Cleanup for v1. 2023-10-13 11:06:20 +00:00			`Excludes: append(r.Excludes, l...),`
Forked from subrubia 2021-09-09 10:25:53 +00:00			`Blocks: r.Blocks,`
			`}`
			`}`

			`func (r Rule) Block(l ...string) Rule {`
			`return Rule{`
			`Tag: r.Tag,`
			`Includes: r.Includes,`
			`Excludes: r.Excludes,`
Cleanup for v1. 2023-10-13 11:06:20 +00:00			`Blocks: append(r.Blocks, l...),`
Forked from subrubia 2021-09-09 10:25:53 +00:00			`}`
			`}`

Cleanup for v1. 2023-10-13 11:06:20 +00:00			`func (rule *Rule) normalize(sanitize func(string) string) {`
Forked from subrubia 2021-09-09 10:25:53 +00:00			`for i, token := range rule.Includes {`
			`rule.Includes[i] = sanitize(token)`
			`}`
			`for i, token := range rule.Excludes {`
			`rule.Excludes[i] = sanitize(token)`
			`}`

			`sortTokens(rule.Includes)`
			`sortTokens(rule.Excludes)`

			`rule.excludes = map[string]struct{}{}`
			`for _, s := range rule.Excludes {`
			`rule.excludes[s] = struct{}{}`
			`}`

Cleanup for v1. 2023-10-13 11:06:20 +00:00			`rule.Score = rule.computeScore()`
Forked from subrubia 2021-09-09 10:25:53 +00:00			`}`

			`func (r Rule) maxNGram() int {`
			`max := 0`
			`for _, s := range r.Includes {`
			`n := ngramLength(s)`
			`if n > max {`
			`max = n`
			`}`
			`}`
			`for _, s := range r.Excludes {`
			`n := ngramLength(s)`
			`if n > max {`
			`max = n`
			`}`
			`}`

			`return max`
			`}`

			`func (r Rule) isExcluded(tokens []string) bool {`
			`// This is most often the case.`
			`if len(r.excludes) == 0 {`
			`return false`
			`}`

			`for _, s := range tokens {`
			`if _, ok := r.excludes[s]; ok {`
			`return true`
			`}`
			`}`
			`return false`
			`}`

			`func (r Rule) computeScore() (score int) {`
			`for _, token := range r.Includes {`
			`n := ngramLength(token)`
			`score += n * (n + 1) / 2`
			`}`
			`return score`
			`}`

			`func ruleLess(lhs, rhs *Rule) bool {`
			`// If scores differ, sort by score.`
Cleanup for v1. 2023-10-13 11:06:20 +00:00			`if lhs.Score != rhs.Score {`
			`return lhs.Score < rhs.Score`
Forked from subrubia 2021-09-09 10:25:53 +00:00			`}`

			`// If include depth differs, sort by depth.`
			`lDepth := len(lhs.Includes)`
			`rDepth := len(rhs.Includes)`

			`if lDepth != rDepth {`
			`return lDepth < rDepth`
			`}`

			`// If exclude depth differs, sort by depth.`
			`lDepth = len(lhs.Excludes)`
			`rDepth = len(rhs.Excludes)`

			`if lDepth != rDepth {`
			`return lDepth < rDepth`
			`}`

			`// Sort alphabetically by includes.`
			`for i := range lhs.Includes {`
			`if lhs.Includes[i] != rhs.Includes[i] {`
			`return lhs.Includes[i] < rhs.Includes[i]`
			`}`
			`}`

			`// Sort by alphabetically by excludes.`
			`for i := range lhs.Excludes {`
			`if lhs.Excludes[i] != rhs.Excludes[i] {`
			`return lhs.Excludes[i] < rhs.Excludes[i]`
			`}`
			`}`

			`// Sort by tag.`
			`if lhs.Tag != rhs.Tag {`
			`return lhs.Tag < rhs.Tag`
			`}`

			`return false`
			`}`