Cleanup for v1.

master v1.0.0
jdl 2023-10-13 13:06:20 +02:00
parent 0a77a882f1
commit f4927aaed4
9 changed files with 152 additions and 186 deletions

4
go.mod
View File

@ -1,5 +1,3 @@
module git.crumpington.com/public/tagengine
go 1.17
require golang.org/x/text v0.3.7
go 1.21.1

3
go.sum
View File

@ -1,3 +0,0 @@
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

View File

@ -7,7 +7,7 @@ import (
type node struct {
Token string
Matches []*Rule
Matches []*Rule // If a list of tokens reaches this node, it matches these.
Children map[string]*node
}

36
rule.go
View File

@ -1,15 +1,25 @@
package tagengine
type Rule struct {
Tag string
// The purpose of a Rule is to attach its Tag to matching text.
Tag string
// Includes is a list of strings that must be found in the input in order to
// match.
Includes []string
// Excludes is a list of strings that can exclude a match for this rule.
Excludes []string
Blocks []string // List of blocked tags.
MatchCount int
FirstCount int
// Blocks: If this rule is matched, then it will block matches of any tags
// listed here.
Blocks []string
score int
// The Score encodes the complexity of the Rule. A higher score indicates a
// more specific match. A Rule with more includes, or with includes made of
// multiple words, should have a higher Score than a Rule with fewer or less
// complex includes.
Score int
excludes map[string]struct{}
}
@ -21,7 +31,7 @@ func NewRule(tag string) Rule {
func (r Rule) Inc(l ...string) Rule {
return Rule{
Tag: r.Tag,
Includes: l,
Includes: append(r.Includes, l...),
Excludes: r.Excludes,
Blocks: r.Blocks,
}
@ -31,7 +41,7 @@ func (r Rule) Exc(l ...string) Rule {
return Rule{
Tag: r.Tag,
Includes: r.Includes,
Excludes: l,
Excludes: append(r.Excludes, l...),
Blocks: r.Blocks,
}
}
@ -41,13 +51,11 @@ func (r Rule) Block(l ...string) Rule {
Tag: r.Tag,
Includes: r.Includes,
Excludes: r.Excludes,
Blocks: l,
Blocks: append(r.Blocks, l...),
}
}
func (rule *Rule) normalize() {
sanitize := newSanitizer()
func (rule *Rule) normalize(sanitize func(string) string) {
for i, token := range rule.Includes {
rule.Includes[i] = sanitize(token)
}
@ -63,7 +71,7 @@ func (rule *Rule) normalize() {
rule.excludes[s] = struct{}{}
}
rule.score = rule.computeScore()
rule.Score = rule.computeScore()
}
func (r Rule) maxNGram() int {
@ -108,8 +116,8 @@ func (r Rule) computeScore() (score int) {
func ruleLess(lhs, rhs *Rule) bool {
// If scores differ, sort by score.
if lhs.score != rhs.score {
return lhs.score < rhs.score
if lhs.Score != rhs.Score {
return lhs.Score < rhs.Score
}
// If include depth differs, sort by depth.

View File

@ -1,7 +1,7 @@
package tagengine
// A RuleGroup can be converted into a list of rules. Each rule will point to
// the same tag, and have the same exclude set.
// the same tag, and have the same exclude set and blocks.
type RuleGroup struct {
Tag string
Includes [][]string
@ -31,7 +31,7 @@ func (g RuleGroup) Exc(l ...string) RuleGroup {
return RuleGroup{
Tag: g.Tag,
Includes: g.Includes,
Excludes: l,
Excludes: append(g.Excludes, l...),
Blocks: g.Blocks,
}
}
@ -41,16 +41,17 @@ func (g RuleGroup) Block(l ...string) RuleGroup {
Tag: g.Tag,
Includes: g.Includes,
Excludes: g.Excludes,
Blocks: l,
Blocks: append(g.Blocks, l...),
}
}
func (rg RuleGroup) ToList() (l []Rule) {
for _, includes := range rg.Includes {
func (g RuleGroup) ToList() (l []Rule) {
for _, includes := range g.Includes {
l = append(l, Rule{
Tag: rg.Tag,
Excludes: rg.Excludes,
Tag: g.Tag,
Excludes: g.Excludes,
Includes: includes,
Blocks: g.Blocks,
})
}
return

View File

@ -7,7 +7,7 @@ import (
type RuleSet struct {
root *node
maxNgram int
sanitize func(...string) string
sanitize func(string) string
rules []*Rule
}
@ -17,7 +17,7 @@ func NewRuleSet() *RuleSet {
Token: "/",
Children: map[string]*node{},
},
sanitize: newSanitizer(),
sanitize: BasicSanitizer,
rules: []*Rule{},
}
}
@ -46,7 +46,7 @@ func (t *RuleSet) AddRule(rules ...Rule) {
rule := rule
// Make sure rule is well-formed.
rule.normalize()
rule.normalize(t.sanitize)
// Update maxNgram.
N := rule.maxNGram()
@ -66,7 +66,7 @@ func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {
}
// MatchRules will return a list of all matching rules. The rules are sorted by
// the match's "score". The best match will be first.
// the match's Score. The best match will be first.
func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
input = t.sanitize(input)
tokens := Tokenize(input, t.maxNgram)
@ -91,23 +91,22 @@ func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
return ruleLess(rules[j], rules[i])
})
// Update rule stats.
if len(rules) > 0 {
rules[0].FirstCount++
for _, r := range rules {
r.MatchCount++
}
}
return rules
}
type Match struct {
Tag string
Tag string
// Confidence is used to sort all matches, and is normalized so the sum of
// Confidence values for all matches is 1. Confidence is relative to the
// number of matches and the size of matches in terms of number of tokens.
Confidence float64 // In the range (0,1].
}
// Return a list of matches with confidence.
// Return a list of matches with confidence. This is useful if you'd like to
// find the best matching rule out of all the matched rules.
//
// If you just want to find all matching rules, then use MatchRules.
func (t *RuleSet) Match(input string) []Match {
rules := t.MatchRules(input)
if len(rules) == 0 {
@ -151,8 +150,8 @@ func (t *RuleSet) Match(input string) []Match {
matches[rule.Tag] = idx
out = append(out, Match{Tag: rule.Tag})
}
out[idx].Confidence += float64(rule.score)
sum += float64(rule.score)
out[idx].Confidence += float64(rule.Score)
sum += float64(rule.Score)
}
for i := range out {
@ -161,21 +160,3 @@ func (t *RuleSet) Match(input string) []Match {
return out
}
// ListRules sorts the rule set's own rule slice in place and returns it.
//
// Rules are ordered by FirstCount (the number of times a rule was the best
// match to an input), highest first. Ties are broken by MatchCount, highest
// first, and then by Tag in descending order.
func (t *RuleSet) ListRules() []*Rule {
	sort.Slice(t.rules, func(i, j int) bool {
		a, b := t.rules[i], t.rules[j]
		switch {
		case a.FirstCount != b.FirstCount:
			return a.FirstCount > b.FirstCount
		case a.MatchCount != b.MatchCount:
			return a.MatchCount > b.MatchCount
		default:
			return a.Tag > b.Tag
		}
	})
	return t.rules
}

View File

@ -2,124 +2,19 @@ package tagengine
import (
"strings"
"unicode"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
"golang.org/x/text/unicode/norm"
"git.crumpington.com/public/tagengine/sanitize"
)
// newSanitizer returns a function that joins its arguments with single spaces
// and normalizes the result: lower-casing, "ß" and apostrophe-"s" rewrites,
// diacritic removal, and finally the spaceNumbers / addSpaces /
// collapseSpaces clean-up passes.
//
// NOTE(review): the chained transformer built here is shared by every call of
// the returned function; confirm whether concurrent use is safe.
func newSanitizer() func(...string) string {
// Decompose (NFD), drop runes in the unicode.Mn category, then recompose (NFC).
diactricsFix := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
return func(l ...string) string {
s := strings.Join(l, " ")
// Lowercase.
s = strings.ToLower(s)
// Expand "ß" to "ss", then replace "'s", "`s" and "´s" with a plain "s".
// These run after lower-casing, so they only ever see lower-case text.
s = strings.ReplaceAll(s, "ß", "ss")
s = strings.ReplaceAll(s, "'s", "s")
s = strings.ReplaceAll(s, "`s", "s")
s = strings.ReplaceAll(s, "´s", "s")
// Remove diacritics. If the transform reports an error, s is left as it was.
if out, _, err := transform.String(diactricsFix, s); err == nil {
s = out
}
// Clean spaces: separate digit runs, pad punctuation, then collapse
// whitespace and trim both ends.
s = spaceNumbers(s)
s = addSpaces(s)
s = collapseSpaces(s)
return s
}
}
// spaceNumbers inserts a single space at every boundary between an ASCII
// digit ('0'-'9') and any other rune, so "abc123xyz" becomes "abc 123 xyz".
// No space is added before the first rune or after the last one.
func spaceNumbers(s string) string {
	var out strings.Builder
	prevIsDigit := false
	for i, r := range s {
		curIsDigit := r >= '0' && r <= '9'
		// i is 0 only for the first rune, which never gets a leading space.
		if i > 0 && curIsDigit != prevIsDigit {
			out.WriteByte(' ')
		}
		prevIsDigit = curIsDigit
		out.WriteRune(r)
	}
	return out.String()
}
// addSpaces surrounds every ASCII punctuation rune from the fixed set below
// with one space on each side. All other runes are copied through unchanged.
// Runs of spaces produced here are not collapsed.
func addSpaces(s string) string {
	// The punctuation set; note that '$' is not part of it.
	const punctuation = "`~!@#%^&*()-_+=[{]}\\|:;\"',<.>?/"

	var out strings.Builder
	for _, r := range s {
		if !strings.ContainsRune(punctuation, r) {
			out.WriteRune(r)
			continue
		}
		out.WriteByte(' ')
		out.WriteRune(r)
		out.WriteByte(' ')
	}
	return out.String()
}
func collapseSpaces(s string) string {
// Trim leading and trailing spaces.
s = strings.TrimSpace(s)
b := strings.Builder{}
wasSpace := false
// Range over runes.
for _, c := range s {
if unicode.IsSpace(c) {
wasSpace = true
continue
} else if wasSpace {
wasSpace = false
b.WriteRune(' ')
}
b.WriteRune(c)
}
return b.String()
// BasicSanitizer normalizes s by applying, in order:
//   - lower-casing,
//   - spaces around numbers,
//   - spaces around punctuation,
//   - collapsing of multiple spaces.
func BasicSanitizer(s string) string {
	lowered := strings.ToLower(s)
	spaced := sanitize.SpacePunctuation(sanitize.SpaceNumbers(lowered))
	return sanitize.CollapseSpaces(spaced)
}

91
sanitize/sanitize.go Normal file
View File

@ -0,0 +1,91 @@
package sanitize
import (
"strings"
"unicode"
)
// SpaceNumbers inserts a single space wherever the input switches between an
// ASCII digit ('0'-'9') and any other rune, e.g. "1f2" becomes "1 f 2".
// The first rune never gets a leading space.
func SpaceNumbers(s string) string {
	var out strings.Builder
	wasDigit, started := false, false
	for _, r := range s {
		nowDigit := '0' <= r && r <= '9'
		// Only a change of class after the first rune produces a separator.
		if started && nowDigit != wasDigit {
			out.WriteByte(' ')
		}
		wasDigit, started = nowDigit, true
		out.WriteRune(r)
	}
	return out.String()
}
// SpacePunctuation puts one space before and one space after every rune in a
// fixed set of ASCII punctuation characters. Every other rune is copied
// through unchanged. The padding is not collapsed here, so adjacent
// punctuation yields consecutive spaces.
func SpacePunctuation(s string) string {
	var out strings.Builder
	for _, r := range s {
		switch r {
		case '`', '~', '!', '@', '#', '%', '^', '&', '*', '(', ')',
			'-', '_', '+', '=', '[', '{', ']', '}', '\\', '|',
			':', ';', '"', '\'', ',', '<', '.', '>', '?', '/':
			out.WriteString(" ")
			out.WriteRune(r)
			out.WriteString(" ")
		default:
			out.WriteRune(r)
		}
	}
	return out.String()
}
// CollapseSpaces trims leading and trailing whitespace from s and replaces
// every internal run of whitespace (as reported by unicode.IsSpace) with a
// single ASCII space.
func CollapseSpaces(s string) string {
	var out strings.Builder
	pending := false // a whitespace run was seen and not yet emitted
	for _, r := range strings.TrimSpace(s) {
		switch {
		case unicode.IsSpace(r):
			pending = true
		default:
			if pending {
				out.WriteByte(' ')
				pending = false
			}
			out.WriteRune(r)
		}
	}
	return out.String()
}

View File

@ -3,7 +3,7 @@ package tagengine
import "testing"
func TestSanitize(t *testing.T) {
sanitize := newSanitizer()
sanitize := BasicSanitizer
type Case struct {
In string
@ -17,12 +17,7 @@ func TestSanitize(t *testing.T) {
{"abc123xyz", "abc 123 xyz"},
{"1f2", "1 f 2"},
{" abc", "abc"},
{" ; KitKat/m&m's (bÖttle) @ ", "; kitkat / m & ms ( bottle ) @"},
{" Pott`s gin königs beer;SOJU ", "potts gin konigs beer ; soju"},
{"brot & brötchen", "brot & brotchen"},
{"Gâteau au fromage blanc, Stück", "gateau au fromage blanc , stuck"},
{"Maisels Weisse Weißbier 0,5l", "maisels weisse weissbier 0 , 5 l"},
{"Maisels´s Weisse - Hefeweizen 0,5l", "maiselss weisse - hefeweizen 0 , 5 l"},
{" ; KitKat/m&m's (bottle) @ ", "; kitkat / m & m ' s ( bottle ) @"},
{"€", "€"},
}