parent
0a77a882f1
commit
f4927aaed4
4
go.mod
4
go.mod
|
@ -1,5 +1,3 @@
|
|||
module git.crumpington.com/public/tagengine
|
||||
|
||||
go 1.17
|
||||
|
||||
require golang.org/x/text v0.3.7
|
||||
go 1.21.1
|
||||
|
|
3
go.sum
3
go.sum
|
@ -1,3 +0,0 @@
|
|||
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
|
||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
2
node.go
2
node.go
|
@ -7,7 +7,7 @@ import (
|
|||
|
||||
type node struct {
|
||||
Token string
|
||||
Matches []*Rule
|
||||
Matches []*Rule // If a list of tokens reaches this node, it matches these.
|
||||
Children map[string]*node
|
||||
}
|
||||
|
||||
|
|
34
rule.go
34
rule.go
|
@ -1,15 +1,25 @@
|
|||
package tagengine
|
||||
|
||||
type Rule struct {
|
||||
// The purpose of a Rule is to attach its Tag to matching text.
|
||||
Tag string
|
||||
|
||||
// Includes is a list of strings that must be found in the input in order to
|
||||
// match.
|
||||
Includes []string
|
||||
|
||||
// Excludes is a list of strings that can exclude a match for this rule.
|
||||
Excludes []string
|
||||
Blocks []string // List of blocked tags.
|
||||
|
||||
MatchCount int
|
||||
FirstCount int
|
||||
// Blocks: If this rule is matched, then it will block matches of any tags
|
||||
// listed here.
|
||||
Blocks []string
|
||||
|
||||
score int
|
||||
// The Score encodes the complexity of the Rule. A higher score indicates a
|
||||
// more specific match. A Rule with more includes, or includes with multiple words
|
||||
// should have a higher Score than a Rule with fewer includes or less
|
||||
// complex includes.
|
||||
Score int
|
||||
|
||||
excludes map[string]struct{}
|
||||
}
|
||||
|
@ -21,7 +31,7 @@ func NewRule(tag string) Rule {
|
|||
func (r Rule) Inc(l ...string) Rule {
|
||||
return Rule{
|
||||
Tag: r.Tag,
|
||||
Includes: l,
|
||||
Includes: append(r.Includes, l...),
|
||||
Excludes: r.Excludes,
|
||||
Blocks: r.Blocks,
|
||||
}
|
||||
|
@ -31,7 +41,7 @@ func (r Rule) Exc(l ...string) Rule {
|
|||
return Rule{
|
||||
Tag: r.Tag,
|
||||
Includes: r.Includes,
|
||||
Excludes: l,
|
||||
Excludes: append(r.Excludes, l...),
|
||||
Blocks: r.Blocks,
|
||||
}
|
||||
}
|
||||
|
@ -41,13 +51,11 @@ func (r Rule) Block(l ...string) Rule {
|
|||
Tag: r.Tag,
|
||||
Includes: r.Includes,
|
||||
Excludes: r.Excludes,
|
||||
Blocks: l,
|
||||
Blocks: append(r.Blocks, l...),
|
||||
}
|
||||
}
|
||||
|
||||
func (rule *Rule) normalize() {
|
||||
sanitize := newSanitizer()
|
||||
|
||||
func (rule *Rule) normalize(sanitize func(string) string) {
|
||||
for i, token := range rule.Includes {
|
||||
rule.Includes[i] = sanitize(token)
|
||||
}
|
||||
|
@ -63,7 +71,7 @@ func (rule *Rule) normalize() {
|
|||
rule.excludes[s] = struct{}{}
|
||||
}
|
||||
|
||||
rule.score = rule.computeScore()
|
||||
rule.Score = rule.computeScore()
|
||||
}
|
||||
|
||||
func (r Rule) maxNGram() int {
|
||||
|
@ -108,8 +116,8 @@ func (r Rule) computeScore() (score int) {
|
|||
|
||||
func ruleLess(lhs, rhs *Rule) bool {
|
||||
// If scores differ, sort by score.
|
||||
if lhs.score != rhs.score {
|
||||
return lhs.score < rhs.score
|
||||
if lhs.Score != rhs.Score {
|
||||
return lhs.Score < rhs.Score
|
||||
}
|
||||
|
||||
// If include depth differs, sort by depth.
|
||||
|
|
15
rulegroup.go
15
rulegroup.go
|
@ -1,7 +1,7 @@
|
|||
package tagengine
|
||||
|
||||
// A RuleGroup can be converted into a list of rules. Each rule will point to
|
||||
// the same tag, and have the same exclude set.
|
||||
// the same tag, and have the same exclude set and blocks.
|
||||
type RuleGroup struct {
|
||||
Tag string
|
||||
Includes [][]string
|
||||
|
@ -31,7 +31,7 @@ func (g RuleGroup) Exc(l ...string) RuleGroup {
|
|||
return RuleGroup{
|
||||
Tag: g.Tag,
|
||||
Includes: g.Includes,
|
||||
Excludes: l,
|
||||
Excludes: append(g.Excludes, l...),
|
||||
Blocks: g.Blocks,
|
||||
}
|
||||
}
|
||||
|
@ -41,16 +41,17 @@ func (g RuleGroup) Block(l ...string) RuleGroup {
|
|||
Tag: g.Tag,
|
||||
Includes: g.Includes,
|
||||
Excludes: g.Excludes,
|
||||
Blocks: l,
|
||||
Blocks: append(g.Blocks, l...),
|
||||
}
|
||||
}
|
||||
|
||||
func (rg RuleGroup) ToList() (l []Rule) {
|
||||
for _, includes := range rg.Includes {
|
||||
func (g RuleGroup) ToList() (l []Rule) {
|
||||
for _, includes := range g.Includes {
|
||||
l = append(l, Rule{
|
||||
Tag: rg.Tag,
|
||||
Excludes: rg.Excludes,
|
||||
Tag: g.Tag,
|
||||
Excludes: g.Excludes,
|
||||
Includes: includes,
|
||||
Blocks: g.Blocks,
|
||||
})
|
||||
}
|
||||
return
|
||||
|
|
47
ruleset.go
47
ruleset.go
|
@ -7,7 +7,7 @@ import (
|
|||
type RuleSet struct {
|
||||
root *node
|
||||
maxNgram int
|
||||
sanitize func(...string) string
|
||||
sanitize func(string) string
|
||||
rules []*Rule
|
||||
}
|
||||
|
||||
|
@ -17,7 +17,7 @@ func NewRuleSet() *RuleSet {
|
|||
Token: "/",
|
||||
Children: map[string]*node{},
|
||||
},
|
||||
sanitize: newSanitizer(),
|
||||
sanitize: BasicSanitizer,
|
||||
rules: []*Rule{},
|
||||
}
|
||||
}
|
||||
|
@ -46,7 +46,7 @@ func (t *RuleSet) AddRule(rules ...Rule) {
|
|||
rule := rule
|
||||
|
||||
// Make sure rule is well-formed.
|
||||
rule.normalize()
|
||||
rule.normalize(t.sanitize)
|
||||
|
||||
// Update maxNgram.
|
||||
N := rule.maxNGram()
|
||||
|
@ -66,7 +66,7 @@ func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {
|
|||
}
|
||||
|
||||
// MatchRules will return a list of all matching rules. The rules are sorted by
|
||||
// the match's "score". The best match will be first.
|
||||
// the match's Score. The best match will be first.
|
||||
func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
|
||||
input = t.sanitize(input)
|
||||
tokens := Tokenize(input, t.maxNgram)
|
||||
|
@ -91,23 +91,22 @@ func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
|
|||
return ruleLess(rules[j], rules[i])
|
||||
})
|
||||
|
||||
// Update rule stats.
|
||||
if len(rules) > 0 {
|
||||
rules[0].FirstCount++
|
||||
for _, r := range rules {
|
||||
r.MatchCount++
|
||||
}
|
||||
}
|
||||
|
||||
return rules
|
||||
}
|
||||
|
||||
type Match struct {
|
||||
Tag string
|
||||
|
||||
// Confidence is used to sort all matches, and is normalized so the sum of
|
||||
// Confidence values for all matches is 1. Confidence is relative to the
|
||||
// number of matches and the size of matches in terms of number of tokens.
|
||||
Confidence float64 // In the range (0,1].
|
||||
}
|
||||
|
||||
// Return a list of matches with confidence.
|
||||
// Return a list of matches with confidence. This is useful if you'd like to
|
||||
// find the best matching rule out of all the matched rules.
|
||||
//
|
||||
// If you just want to find all matching rules, then use MatchRules.
|
||||
func (t *RuleSet) Match(input string) []Match {
|
||||
rules := t.MatchRules(input)
|
||||
if len(rules) == 0 {
|
||||
|
@ -151,8 +150,8 @@ func (t *RuleSet) Match(input string) []Match {
|
|||
matches[rule.Tag] = idx
|
||||
out = append(out, Match{Tag: rule.Tag})
|
||||
}
|
||||
out[idx].Confidence += float64(rule.score)
|
||||
sum += float64(rule.score)
|
||||
out[idx].Confidence += float64(rule.Score)
|
||||
sum += float64(rule.Score)
|
||||
}
|
||||
|
||||
for i := range out {
|
||||
|
@ -161,21 +160,3 @@ func (t *RuleSet) Match(input string) []Match {
|
|||
|
||||
return out
|
||||
}
|
||||
|
||||
// ListRules returns rules used in the ruleset sorted by the rules'
|
||||
// FirstCount. This is the number of times the given rule was the best match to
|
||||
// an input.
|
||||
func (t *RuleSet) ListRules() []*Rule {
|
||||
sort.Slice(t.rules, func(i, j int) bool {
|
||||
if t.rules[j].FirstCount != t.rules[i].FirstCount {
|
||||
return t.rules[j].FirstCount < t.rules[i].FirstCount
|
||||
}
|
||||
|
||||
if t.rules[j].MatchCount != t.rules[i].MatchCount {
|
||||
return t.rules[j].MatchCount < t.rules[i].MatchCount
|
||||
}
|
||||
|
||||
return t.rules[j].Tag < t.rules[i].Tag
|
||||
})
|
||||
return t.rules
|
||||
}
|
||||
|
|
125
sanitize.go
125
sanitize.go
|
@ -2,124 +2,19 @@ package tagengine
|
|||
|
||||
import (
|
||||
"strings"
|
||||
"unicode"
|
||||
|
||||
"golang.org/x/text/runes"
|
||||
"golang.org/x/text/transform"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
"git.crumpington.com/public/tagengine/sanitize"
|
||||
)
|
||||
|
||||
func newSanitizer() func(...string) string {
|
||||
diactricsFix := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
|
||||
|
||||
return func(l ...string) string {
|
||||
|
||||
s := strings.Join(l, " ")
|
||||
|
||||
// Lowercase.
|
||||
// The basic sanitizer:
|
||||
// * lower-case
|
||||
// * put spaces around numbers
|
||||
// * put spaces around punctuation
|
||||
// * collapse multiple spaces
|
||||
func BasicSanitizer(s string) string {
|
||||
s = strings.ToLower(s)
|
||||
|
||||
// Remove apostrophes.
|
||||
s = strings.ReplaceAll(s, "ß", "ss")
|
||||
s = strings.ReplaceAll(s, "'s", "s")
|
||||
s = strings.ReplaceAll(s, "`s", "s")
|
||||
s = strings.ReplaceAll(s, "´s", "s")
|
||||
|
||||
// Remove diacritics.
|
||||
if out, _, err := transform.String(diactricsFix, s); err == nil {
|
||||
s = out
|
||||
}
|
||||
|
||||
// Clean spaces.
|
||||
s = spaceNumbers(s)
|
||||
s = addSpaces(s)
|
||||
s = collapseSpaces(s)
|
||||
|
||||
s = sanitize.SpaceNumbers(s)
|
||||
s = sanitize.SpacePunctuation(s)
|
||||
s = sanitize.CollapseSpaces(s)
|
||||
return s
|
||||
}
|
||||
}
|
||||
|
||||
func spaceNumbers(s string) string {
|
||||
if len(s) == 0 {
|
||||
return s
|
||||
}
|
||||
|
||||
isDigit := func(b rune) bool {
|
||||
switch b {
|
||||
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
b := strings.Builder{}
|
||||
|
||||
var first rune
|
||||
for _, c := range s {
|
||||
first = c
|
||||
break
|
||||
}
|
||||
|
||||
digit := isDigit(first)
|
||||
|
||||
// Range over runes.
|
||||
for _, c := range s {
|
||||
thisDigit := isDigit(c)
|
||||
if thisDigit != digit {
|
||||
b.WriteByte(' ')
|
||||
digit = thisDigit
|
||||
}
|
||||
b.WriteRune(c)
|
||||
}
|
||||
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func addSpaces(s string) string {
|
||||
needsSpace := func(r rune) bool {
|
||||
switch r {
|
||||
case '`', '~', '!', '@', '#', '%', '^', '&', '*', '(', ')',
|
||||
'-', '_', '+', '=', '[', '{', ']', '}', '\\', '|',
|
||||
':', ';', '"', '\'', ',', '<', '.', '>', '?', '/':
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
b := strings.Builder{}
|
||||
|
||||
// Range over runes.
|
||||
for _, r := range s {
|
||||
if needsSpace(r) {
|
||||
b.WriteRune(' ')
|
||||
b.WriteRune(r)
|
||||
b.WriteRune(' ')
|
||||
} else {
|
||||
b.WriteRune(r)
|
||||
}
|
||||
}
|
||||
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func collapseSpaces(s string) string {
|
||||
// Trim leading and trailing spaces.
|
||||
s = strings.TrimSpace(s)
|
||||
|
||||
b := strings.Builder{}
|
||||
wasSpace := false
|
||||
|
||||
// Range over runes.
|
||||
for _, c := range s {
|
||||
if unicode.IsSpace(c) {
|
||||
wasSpace = true
|
||||
continue
|
||||
} else if wasSpace {
|
||||
wasSpace = false
|
||||
b.WriteRune(' ')
|
||||
}
|
||||
b.WriteRune(c)
|
||||
}
|
||||
|
||||
return b.String()
|
||||
}
|
||||
|
|
|
@ -0,0 +1,91 @@
|
|||
package sanitize
|
||||
|
||||
import (
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
func SpaceNumbers(s string) string {
|
||||
if len(s) == 0 {
|
||||
return s
|
||||
}
|
||||
|
||||
isDigit := func(b rune) bool {
|
||||
switch b {
|
||||
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
b := strings.Builder{}
|
||||
|
||||
var first rune
|
||||
for _, c := range s {
|
||||
first = c
|
||||
break
|
||||
}
|
||||
|
||||
digit := isDigit(first)
|
||||
|
||||
// Range over runes.
|
||||
for _, c := range s {
|
||||
thisDigit := isDigit(c)
|
||||
if thisDigit != digit {
|
||||
b.WriteByte(' ')
|
||||
digit = thisDigit
|
||||
}
|
||||
b.WriteRune(c)
|
||||
}
|
||||
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func SpacePunctuation(s string) string {
|
||||
needsSpace := func(r rune) bool {
|
||||
switch r {
|
||||
case '`', '~', '!', '@', '#', '%', '^', '&', '*', '(', ')',
|
||||
'-', '_', '+', '=', '[', '{', ']', '}', '\\', '|',
|
||||
':', ';', '"', '\'', ',', '<', '.', '>', '?', '/':
|
||||
return true
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
b := strings.Builder{}
|
||||
|
||||
// Range over runes.
|
||||
for _, r := range s {
|
||||
if needsSpace(r) {
|
||||
b.WriteRune(' ')
|
||||
b.WriteRune(r)
|
||||
b.WriteRune(' ')
|
||||
} else {
|
||||
b.WriteRune(r)
|
||||
}
|
||||
}
|
||||
|
||||
return b.String()
|
||||
}
|
||||
|
||||
func CollapseSpaces(s string) string {
|
||||
// Trim leading and trailing spaces.
|
||||
s = strings.TrimSpace(s)
|
||||
|
||||
b := strings.Builder{}
|
||||
wasSpace := false
|
||||
|
||||
// Range over runes.
|
||||
for _, c := range s {
|
||||
if unicode.IsSpace(c) {
|
||||
wasSpace = true
|
||||
continue
|
||||
} else if wasSpace {
|
||||
wasSpace = false
|
||||
b.WriteRune(' ')
|
||||
}
|
||||
b.WriteRune(c)
|
||||
}
|
||||
|
||||
return b.String()
|
||||
}
|
|
@ -3,7 +3,7 @@ package tagengine
|
|||
import "testing"
|
||||
|
||||
func TestSanitize(t *testing.T) {
|
||||
sanitize := newSanitizer()
|
||||
sanitize := BasicSanitizer
|
||||
|
||||
type Case struct {
|
||||
In string
|
||||
|
@ -17,12 +17,7 @@ func TestSanitize(t *testing.T) {
|
|||
{"abc123xyz", "abc 123 xyz"},
|
||||
{"1f2", "1 f 2"},
|
||||
{" abc", "abc"},
|
||||
{" ; KitKat/m&m's (bÖttle) @ ", "; kitkat / m & ms ( bottle ) @"},
|
||||
{" Pott`s gin königs beer;SOJU ", "potts gin konigs beer ; soju"},
|
||||
{"brot & brötchen", "brot & brotchen"},
|
||||
{"Gâteau au fromage blanc, Stück", "gateau au fromage blanc , stuck"},
|
||||
{"Maisels Weisse Weißbier 0,5l", "maisels weisse weissbier 0 , 5 l"},
|
||||
{"Maisels´s Weisse - Hefeweizen 0,5l", "maiselss weisse - hefeweizen 0 , 5 l"},
|
||||
{" ; KitKat/m&m's (bottle) @ ", "; kitkat / m & m ' s ( bottle ) @"},
|
||||
{"€", "€"},
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue