160 lines
3.1 KiB
Go
160 lines
3.1 KiB
Go
|
package tagengine
|
||
|
|
||
|
// Rule describes when a Tag should be attached to a piece of text: which
// strings must be present (Includes), which strings can veto the match
// (Excludes), and which other tags are suppressed when it matches (Blocks).
type Rule struct {
	// The purpose of a Rule is to attach its Tag to matching text.
	Tag string

	// Includes is a list of strings that must be found in the input in order to
	// match.
	Includes []string

	// Excludes is a list of strings that can exclude a match for this rule.
	Excludes []string

	// Blocks: If this rule is matched, then it will block matches of any tags
	// listed here.
	Blocks []string

	// The Score encodes the complexity of the Rule. A higher score indicates a
	// more specific match. A Rule with more includes, or includes with multiple
	// words, should have a higher Score than a Rule with fewer includes or less
	// complex includes. normalize overwrites this field with the result of
	// computeScore.
	Score int

	// excludes is the set form of Excludes. It is rebuilt by normalize and
	// consulted by isExcluded.
	excludes map[string]struct{}
}
|
||
|
|
||
|
func NewRule(tag string) Rule {
|
||
|
return Rule{Tag: tag}
|
||
|
}
|
||
|
|
||
|
func (r Rule) Inc(l ...string) Rule {
|
||
|
return Rule{
|
||
|
Tag: r.Tag,
|
||
|
Includes: append(r.Includes, l...),
|
||
|
Excludes: r.Excludes,
|
||
|
Blocks: r.Blocks,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (r Rule) Exc(l ...string) Rule {
|
||
|
return Rule{
|
||
|
Tag: r.Tag,
|
||
|
Includes: r.Includes,
|
||
|
Excludes: append(r.Excludes, l...),
|
||
|
Blocks: r.Blocks,
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (r Rule) Block(l ...string) Rule {
|
||
|
return Rule{
|
||
|
Tag: r.Tag,
|
||
|
Includes: r.Includes,
|
||
|
Excludes: r.Excludes,
|
||
|
Blocks: append(r.Blocks, l...),
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (rule *Rule) normalize(sanitize func(string) string) {
|
||
|
for i, token := range rule.Includes {
|
||
|
rule.Includes[i] = sanitize(token)
|
||
|
}
|
||
|
for i, token := range rule.Excludes {
|
||
|
rule.Excludes[i] = sanitize(token)
|
||
|
}
|
||
|
|
||
|
sortTokens(rule.Includes)
|
||
|
sortTokens(rule.Excludes)
|
||
|
|
||
|
rule.excludes = map[string]struct{}{}
|
||
|
for _, s := range rule.Excludes {
|
||
|
rule.excludes[s] = struct{}{}
|
||
|
}
|
||
|
|
||
|
rule.Score = rule.computeScore()
|
||
|
}
|
||
|
|
||
|
func (r Rule) maxNGram() int {
|
||
|
max := 0
|
||
|
for _, s := range r.Includes {
|
||
|
n := ngramLength(s)
|
||
|
if n > max {
|
||
|
max = n
|
||
|
}
|
||
|
}
|
||
|
for _, s := range r.Excludes {
|
||
|
n := ngramLength(s)
|
||
|
if n > max {
|
||
|
max = n
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return max
|
||
|
}
|
||
|
|
||
|
func (r Rule) isExcluded(tokens []string) bool {
|
||
|
// This is most often the case.
|
||
|
if len(r.excludes) == 0 {
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
for _, s := range tokens {
|
||
|
if _, ok := r.excludes[s]; ok {
|
||
|
return true
|
||
|
}
|
||
|
}
|
||
|
return false
|
||
|
}
|
||
|
|
||
|
func (r Rule) computeScore() (score int) {
|
||
|
for _, token := range r.Includes {
|
||
|
n := ngramLength(token)
|
||
|
score += n * (n + 1) / 2
|
||
|
}
|
||
|
return score
|
||
|
}
|
||
|
|
||
|
func ruleLess(lhs, rhs *Rule) bool {
|
||
|
// If scores differ, sort by score.
|
||
|
if lhs.Score != rhs.Score {
|
||
|
return lhs.Score < rhs.Score
|
||
|
}
|
||
|
|
||
|
// If include depth differs, sort by depth.
|
||
|
lDepth := len(lhs.Includes)
|
||
|
rDepth := len(rhs.Includes)
|
||
|
|
||
|
if lDepth != rDepth {
|
||
|
return lDepth < rDepth
|
||
|
}
|
||
|
|
||
|
// If exclude depth differs, sort by depth.
|
||
|
lDepth = len(lhs.Excludes)
|
||
|
rDepth = len(rhs.Excludes)
|
||
|
|
||
|
if lDepth != rDepth {
|
||
|
return lDepth < rDepth
|
||
|
}
|
||
|
|
||
|
// Sort alphabetically by includes.
|
||
|
for i := range lhs.Includes {
|
||
|
if lhs.Includes[i] != rhs.Includes[i] {
|
||
|
return lhs.Includes[i] < rhs.Includes[i]
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Sort by alphabetically by excludes.
|
||
|
for i := range lhs.Excludes {
|
||
|
if lhs.Excludes[i] != rhs.Excludes[i] {
|
||
|
return lhs.Excludes[i] < rhs.Excludes[i]
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Sort by tag.
|
||
|
if lhs.Tag != rhs.Tag {
|
||
|
return lhs.Tag < rhs.Tag
|
||
|
}
|
||
|
|
||
|
return false
|
||
|
}
|