package tagengine type Rule struct { // The purpose of a Rule is to attach it's Tag to matching text. Tag string // Includes is a list of strings that must be found in the input in order to // match. Includes []string // Excludes is a list of strings that can exclude a match for this rule. Excludes []string // Blocks: If this rule is matched, then it will block matches of any tags // listed here. Blocks []string // The Score encodes the complexity of the Rule. A higher score indicates a // more specific match. A Rule more includes, or includes with multiple words // should havee a higher Score than a Rule with fewer includes or less // complex includes. Score int excludes map[string]struct{} } func NewRule(tag string) Rule { return Rule{Tag: tag} } func (r Rule) Inc(l ...string) Rule { return Rule{ Tag: r.Tag, Includes: append(r.Includes, l...), Excludes: r.Excludes, Blocks: r.Blocks, } } func (r Rule) Exc(l ...string) Rule { return Rule{ Tag: r.Tag, Includes: r.Includes, Excludes: append(r.Excludes, l...), Blocks: r.Blocks, } } func (r Rule) Block(l ...string) Rule { return Rule{ Tag: r.Tag, Includes: r.Includes, Excludes: r.Excludes, Blocks: append(r.Blocks, l...), } } func (rule *Rule) normalize(sanitize func(string) string) { for i, token := range rule.Includes { rule.Includes[i] = sanitize(token) } for i, token := range rule.Excludes { rule.Excludes[i] = sanitize(token) } sortTokens(rule.Includes) sortTokens(rule.Excludes) rule.excludes = map[string]struct{}{} for _, s := range rule.Excludes { rule.excludes[s] = struct{}{} } rule.Score = rule.computeScore() } func (r Rule) maxNGram() int { max := 0 for _, s := range r.Includes { n := ngramLength(s) if n > max { max = n } } for _, s := range r.Excludes { n := ngramLength(s) if n > max { max = n } } return max } func (r Rule) isExcluded(tokens []string) bool { // This is most often the case. if len(r.excludes) == 0 { return false } for _, s := range tokens { if _, ok := r.excludes[s]; ok { return true } } return false } func (r Rule) computeScore() (score int) { for _, token := range r.Includes { n := ngramLength(token) score += n * (n + 1) / 2 } return score } func ruleLess(lhs, rhs *Rule) bool { // If scores differ, sort by score. if lhs.Score != rhs.Score { return lhs.Score < rhs.Score } // If include depth differs, sort by depth. lDepth := len(lhs.Includes) rDepth := len(rhs.Includes) if lDepth != rDepth { return lDepth < rDepth } // If exclude depth differs, sort by depth. lDepth = len(lhs.Excludes) rDepth = len(rhs.Excludes) if lDepth != rDepth { return lDepth < rDepth } // Sort alphabetically by includes. for i := range lhs.Includes { if lhs.Includes[i] != rhs.Includes[i] { return lhs.Includes[i] < rhs.Includes[i] } } // Sort by alphabetically by excludes. for i := range lhs.Excludes { if lhs.Excludes[i] != rhs.Excludes[i] { return lhs.Excludes[i] < rhs.Excludes[i] } } // Sort by tag. if lhs.Tag != rhs.Tag { return lhs.Tag < rhs.Tag } return false }