parent
0a77a882f1
commit
f4927aaed4
4
go.mod
4
go.mod
|
@ -1,5 +1,3 @@
|
||||||
module git.crumpington.com/public/tagengine
|
module git.crumpington.com/public/tagengine
|
||||||
|
|
||||||
go 1.17
|
go 1.21.1
|
||||||
|
|
||||||
require golang.org/x/text v0.3.7
|
|
||||||
|
|
3
go.sum
3
go.sum
|
@ -1,3 +0,0 @@
|
||||||
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
|
|
||||||
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
|
|
||||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
|
2
node.go
2
node.go
|
@ -7,7 +7,7 @@ import (
|
||||||
|
|
||||||
type node struct {
|
type node struct {
|
||||||
Token string
|
Token string
|
||||||
Matches []*Rule
|
Matches []*Rule // If a list of tokens reaches this node, it matches these.
|
||||||
Children map[string]*node
|
Children map[string]*node
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
36
rule.go
36
rule.go
|
@ -1,15 +1,25 @@
|
||||||
package tagengine
|
package tagengine
|
||||||
|
|
||||||
type Rule struct {
|
type Rule struct {
|
||||||
Tag string
|
// The purpose of a Rule is to attach its Tag to matching text.
|
||||||
|
Tag string
|
||||||
|
|
||||||
|
// Includes is a list of strings that must be found in the input in order to
|
||||||
|
// match.
|
||||||
Includes []string
|
Includes []string
|
||||||
|
|
||||||
|
// Excludes is a list of strings that can exclude a match for this rule.
|
||||||
Excludes []string
|
Excludes []string
|
||||||
Blocks []string // List of blocked tags.
|
|
||||||
|
|
||||||
MatchCount int
|
// Blocks: If this rule is matched, then it will block matches of any tags
|
||||||
FirstCount int
|
// listed here.
|
||||||
|
Blocks []string
|
||||||
|
|
||||||
score int
|
// The Score encodes the complexity of the Rule. A higher score indicates a
|
||||||
|
// more specific match. A Rule with more includes, or includes with multiple words
|
||||||
|
// should have a higher Score than a Rule with fewer includes or less
|
||||||
|
// complex includes.
|
||||||
|
Score int
|
||||||
|
|
||||||
excludes map[string]struct{}
|
excludes map[string]struct{}
|
||||||
}
|
}
|
||||||
|
@ -21,7 +31,7 @@ func NewRule(tag string) Rule {
|
||||||
func (r Rule) Inc(l ...string) Rule {
|
func (r Rule) Inc(l ...string) Rule {
|
||||||
return Rule{
|
return Rule{
|
||||||
Tag: r.Tag,
|
Tag: r.Tag,
|
||||||
Includes: l,
|
Includes: append(r.Includes, l...),
|
||||||
Excludes: r.Excludes,
|
Excludes: r.Excludes,
|
||||||
Blocks: r.Blocks,
|
Blocks: r.Blocks,
|
||||||
}
|
}
|
||||||
|
@ -31,7 +41,7 @@ func (r Rule) Exc(l ...string) Rule {
|
||||||
return Rule{
|
return Rule{
|
||||||
Tag: r.Tag,
|
Tag: r.Tag,
|
||||||
Includes: r.Includes,
|
Includes: r.Includes,
|
||||||
Excludes: l,
|
Excludes: append(r.Excludes, l...),
|
||||||
Blocks: r.Blocks,
|
Blocks: r.Blocks,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -41,13 +51,11 @@ func (r Rule) Block(l ...string) Rule {
|
||||||
Tag: r.Tag,
|
Tag: r.Tag,
|
||||||
Includes: r.Includes,
|
Includes: r.Includes,
|
||||||
Excludes: r.Excludes,
|
Excludes: r.Excludes,
|
||||||
Blocks: l,
|
Blocks: append(r.Blocks, l...),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (rule *Rule) normalize() {
|
func (rule *Rule) normalize(sanitize func(string) string) {
|
||||||
sanitize := newSanitizer()
|
|
||||||
|
|
||||||
for i, token := range rule.Includes {
|
for i, token := range rule.Includes {
|
||||||
rule.Includes[i] = sanitize(token)
|
rule.Includes[i] = sanitize(token)
|
||||||
}
|
}
|
||||||
|
@ -63,7 +71,7 @@ func (rule *Rule) normalize() {
|
||||||
rule.excludes[s] = struct{}{}
|
rule.excludes[s] = struct{}{}
|
||||||
}
|
}
|
||||||
|
|
||||||
rule.score = rule.computeScore()
|
rule.Score = rule.computeScore()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r Rule) maxNGram() int {
|
func (r Rule) maxNGram() int {
|
||||||
|
@ -108,8 +116,8 @@ func (r Rule) computeScore() (score int) {
|
||||||
|
|
||||||
func ruleLess(lhs, rhs *Rule) bool {
|
func ruleLess(lhs, rhs *Rule) bool {
|
||||||
// If scores differ, sort by score.
|
// If scores differ, sort by score.
|
||||||
if lhs.score != rhs.score {
|
if lhs.Score != rhs.Score {
|
||||||
return lhs.score < rhs.score
|
return lhs.Score < rhs.Score
|
||||||
}
|
}
|
||||||
|
|
||||||
// If include depth differs, sort by depth.
|
// If include depth differs, sort by depth.
|
||||||
|
|
15
rulegroup.go
15
rulegroup.go
|
@ -1,7 +1,7 @@
|
||||||
package tagengine
|
package tagengine
|
||||||
|
|
||||||
// A RuleGroup can be converted into a list of rules. Each rule will point to
|
// A RuleGroup can be converted into a list of rules. Each rule will point to
|
||||||
// the same tag, and have the same exclude set.
|
// the same tag, and have the same exclude set and blocks.
|
||||||
type RuleGroup struct {
|
type RuleGroup struct {
|
||||||
Tag string
|
Tag string
|
||||||
Includes [][]string
|
Includes [][]string
|
||||||
|
@ -31,7 +31,7 @@ func (g RuleGroup) Exc(l ...string) RuleGroup {
|
||||||
return RuleGroup{
|
return RuleGroup{
|
||||||
Tag: g.Tag,
|
Tag: g.Tag,
|
||||||
Includes: g.Includes,
|
Includes: g.Includes,
|
||||||
Excludes: l,
|
Excludes: append(g.Excludes, l...),
|
||||||
Blocks: g.Blocks,
|
Blocks: g.Blocks,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -41,16 +41,17 @@ func (g RuleGroup) Block(l ...string) RuleGroup {
|
||||||
Tag: g.Tag,
|
Tag: g.Tag,
|
||||||
Includes: g.Includes,
|
Includes: g.Includes,
|
||||||
Excludes: g.Excludes,
|
Excludes: g.Excludes,
|
||||||
Blocks: l,
|
Blocks: append(g.Blocks, l...),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (rg RuleGroup) ToList() (l []Rule) {
|
func (g RuleGroup) ToList() (l []Rule) {
|
||||||
for _, includes := range rg.Includes {
|
for _, includes := range g.Includes {
|
||||||
l = append(l, Rule{
|
l = append(l, Rule{
|
||||||
Tag: rg.Tag,
|
Tag: g.Tag,
|
||||||
Excludes: rg.Excludes,
|
Excludes: g.Excludes,
|
||||||
Includes: includes,
|
Includes: includes,
|
||||||
|
Blocks: g.Blocks,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
return
|
return
|
||||||
|
|
49
ruleset.go
49
ruleset.go
|
@ -7,7 +7,7 @@ import (
|
||||||
type RuleSet struct {
|
type RuleSet struct {
|
||||||
root *node
|
root *node
|
||||||
maxNgram int
|
maxNgram int
|
||||||
sanitize func(...string) string
|
sanitize func(string) string
|
||||||
rules []*Rule
|
rules []*Rule
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -17,7 +17,7 @@ func NewRuleSet() *RuleSet {
|
||||||
Token: "/",
|
Token: "/",
|
||||||
Children: map[string]*node{},
|
Children: map[string]*node{},
|
||||||
},
|
},
|
||||||
sanitize: newSanitizer(),
|
sanitize: BasicSanitizer,
|
||||||
rules: []*Rule{},
|
rules: []*Rule{},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -46,7 +46,7 @@ func (t *RuleSet) AddRule(rules ...Rule) {
|
||||||
rule := rule
|
rule := rule
|
||||||
|
|
||||||
// Make sure rule is well-formed.
|
// Make sure rule is well-formed.
|
||||||
rule.normalize()
|
rule.normalize(t.sanitize)
|
||||||
|
|
||||||
// Update maxNgram.
|
// Update maxNgram.
|
||||||
N := rule.maxNGram()
|
N := rule.maxNGram()
|
||||||
|
@ -66,7 +66,7 @@ func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// MatchRules will return a list of all matching rules. The rules are sorted by
|
// MatchRules will return a list of all matching rules. The rules are sorted by
|
||||||
// the match's "score". The best match will be first.
|
// the match's Score. The best match will be first.
|
||||||
func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
|
func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
|
||||||
input = t.sanitize(input)
|
input = t.sanitize(input)
|
||||||
tokens := Tokenize(input, t.maxNgram)
|
tokens := Tokenize(input, t.maxNgram)
|
||||||
|
@ -91,23 +91,22 @@ func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
|
||||||
return ruleLess(rules[j], rules[i])
|
return ruleLess(rules[j], rules[i])
|
||||||
})
|
})
|
||||||
|
|
||||||
// Update rule stats.
|
|
||||||
if len(rules) > 0 {
|
|
||||||
rules[0].FirstCount++
|
|
||||||
for _, r := range rules {
|
|
||||||
r.MatchCount++
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return rules
|
return rules
|
||||||
}
|
}
|
||||||
|
|
||||||
type Match struct {
|
type Match struct {
|
||||||
Tag string
|
Tag string
|
||||||
|
|
||||||
|
// Confidence is used to sort all matches, and is normalized so the sum of
|
||||||
|
// Confidence values for all matches is 1. Confidence is relative to the
|
||||||
|
// number of matches and the size of matches in terms of number of tokens.
|
||||||
Confidence float64 // In the range (0,1].
|
Confidence float64 // In the range (0,1].
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return a list of matches with confidence.
|
// Return a list of matches with confidence. This is useful if you'd like to
|
||||||
|
// find the best matching rule out of all the matched rules.
|
||||||
|
//
|
||||||
|
// If you just want to find all matching rules, then use MatchRules.
|
||||||
func (t *RuleSet) Match(input string) []Match {
|
func (t *RuleSet) Match(input string) []Match {
|
||||||
rules := t.MatchRules(input)
|
rules := t.MatchRules(input)
|
||||||
if len(rules) == 0 {
|
if len(rules) == 0 {
|
||||||
|
@ -151,8 +150,8 @@ func (t *RuleSet) Match(input string) []Match {
|
||||||
matches[rule.Tag] = idx
|
matches[rule.Tag] = idx
|
||||||
out = append(out, Match{Tag: rule.Tag})
|
out = append(out, Match{Tag: rule.Tag})
|
||||||
}
|
}
|
||||||
out[idx].Confidence += float64(rule.score)
|
out[idx].Confidence += float64(rule.Score)
|
||||||
sum += float64(rule.score)
|
sum += float64(rule.Score)
|
||||||
}
|
}
|
||||||
|
|
||||||
for i := range out {
|
for i := range out {
|
||||||
|
@ -161,21 +160,3 @@ func (t *RuleSet) Match(input string) []Match {
|
||||||
|
|
||||||
return out
|
return out
|
||||||
}
|
}
|
||||||
|
|
||||||
// ListRules returns rules used in the ruleset sorted by the rules'
|
|
||||||
// FirstCount. This is the number of times the given rule was the best match to
|
|
||||||
// an input.
|
|
||||||
func (t *RuleSet) ListRules() []*Rule {
|
|
||||||
sort.Slice(t.rules, func(i, j int) bool {
|
|
||||||
if t.rules[j].FirstCount != t.rules[i].FirstCount {
|
|
||||||
return t.rules[j].FirstCount < t.rules[i].FirstCount
|
|
||||||
}
|
|
||||||
|
|
||||||
if t.rules[j].MatchCount != t.rules[i].MatchCount {
|
|
||||||
return t.rules[j].MatchCount < t.rules[i].MatchCount
|
|
||||||
}
|
|
||||||
|
|
||||||
return t.rules[j].Tag < t.rules[i].Tag
|
|
||||||
})
|
|
||||||
return t.rules
|
|
||||||
}
|
|
||||||
|
|
129
sanitize.go
129
sanitize.go
|
@ -2,124 +2,19 @@ package tagengine
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"strings"
|
"strings"
|
||||||
"unicode"
|
|
||||||
|
|
||||||
"golang.org/x/text/runes"
|
"git.crumpington.com/public/tagengine/sanitize"
|
||||||
"golang.org/x/text/transform"
|
|
||||||
"golang.org/x/text/unicode/norm"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
func newSanitizer() func(...string) string {
|
// The basic sanitizer:
|
||||||
diactricsFix := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
|
// * lower-case
|
||||||
|
// * put spaces around numbers
|
||||||
return func(l ...string) string {
|
// * put spaces around punctuation
|
||||||
|
// * collapse multiple spaces
|
||||||
s := strings.Join(l, " ")
|
func BasicSanitizer(s string) string {
|
||||||
|
s = strings.ToLower(s)
|
||||||
// Lowercase.
|
s = sanitize.SpaceNumbers(s)
|
||||||
s = strings.ToLower(s)
|
s = sanitize.SpacePunctuation(s)
|
||||||
|
s = sanitize.CollapseSpaces(s)
|
||||||
// Remove apostrophes.
|
return s
|
||||||
s = strings.ReplaceAll(s, "ß", "ss")
|
|
||||||
s = strings.ReplaceAll(s, "'s", "s")
|
|
||||||
s = strings.ReplaceAll(s, "`s", "s")
|
|
||||||
s = strings.ReplaceAll(s, "´s", "s")
|
|
||||||
|
|
||||||
// Remove diacritics.
|
|
||||||
if out, _, err := transform.String(diactricsFix, s); err == nil {
|
|
||||||
s = out
|
|
||||||
}
|
|
||||||
|
|
||||||
// Clean spaces.
|
|
||||||
s = spaceNumbers(s)
|
|
||||||
s = addSpaces(s)
|
|
||||||
s = collapseSpaces(s)
|
|
||||||
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
func spaceNumbers(s string) string {
|
|
||||||
if len(s) == 0 {
|
|
||||||
return s
|
|
||||||
}
|
|
||||||
|
|
||||||
isDigit := func(b rune) bool {
|
|
||||||
switch b {
|
|
||||||
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
b := strings.Builder{}
|
|
||||||
|
|
||||||
var first rune
|
|
||||||
for _, c := range s {
|
|
||||||
first = c
|
|
||||||
break
|
|
||||||
}
|
|
||||||
|
|
||||||
digit := isDigit(first)
|
|
||||||
|
|
||||||
// Range over runes.
|
|
||||||
for _, c := range s {
|
|
||||||
thisDigit := isDigit(c)
|
|
||||||
if thisDigit != digit {
|
|
||||||
b.WriteByte(' ')
|
|
||||||
digit = thisDigit
|
|
||||||
}
|
|
||||||
b.WriteRune(c)
|
|
||||||
}
|
|
||||||
|
|
||||||
return b.String()
|
|
||||||
}
|
|
||||||
|
|
||||||
func addSpaces(s string) string {
|
|
||||||
needsSpace := func(r rune) bool {
|
|
||||||
switch r {
|
|
||||||
case '`', '~', '!', '@', '#', '%', '^', '&', '*', '(', ')',
|
|
||||||
'-', '_', '+', '=', '[', '{', ']', '}', '\\', '|',
|
|
||||||
':', ';', '"', '\'', ',', '<', '.', '>', '?', '/':
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
|
|
||||||
b := strings.Builder{}
|
|
||||||
|
|
||||||
// Range over runes.
|
|
||||||
for _, r := range s {
|
|
||||||
if needsSpace(r) {
|
|
||||||
b.WriteRune(' ')
|
|
||||||
b.WriteRune(r)
|
|
||||||
b.WriteRune(' ')
|
|
||||||
} else {
|
|
||||||
b.WriteRune(r)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return b.String()
|
|
||||||
}
|
|
||||||
|
|
||||||
func collapseSpaces(s string) string {
|
|
||||||
// Trim leading and trailing spaces.
|
|
||||||
s = strings.TrimSpace(s)
|
|
||||||
|
|
||||||
b := strings.Builder{}
|
|
||||||
wasSpace := false
|
|
||||||
|
|
||||||
// Range over runes.
|
|
||||||
for _, c := range s {
|
|
||||||
if unicode.IsSpace(c) {
|
|
||||||
wasSpace = true
|
|
||||||
continue
|
|
||||||
} else if wasSpace {
|
|
||||||
wasSpace = false
|
|
||||||
b.WriteRune(' ')
|
|
||||||
}
|
|
||||||
b.WriteRune(c)
|
|
||||||
}
|
|
||||||
|
|
||||||
return b.String()
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,91 @@
|
||||||
|
package sanitize
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
"unicode"
|
||||||
|
)
|
||||||
|
|
||||||
|
func SpaceNumbers(s string) string {
|
||||||
|
if len(s) == 0 {
|
||||||
|
return s
|
||||||
|
}
|
||||||
|
|
||||||
|
isDigit := func(b rune) bool {
|
||||||
|
switch b {
|
||||||
|
case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
b := strings.Builder{}
|
||||||
|
|
||||||
|
var first rune
|
||||||
|
for _, c := range s {
|
||||||
|
first = c
|
||||||
|
break
|
||||||
|
}
|
||||||
|
|
||||||
|
digit := isDigit(first)
|
||||||
|
|
||||||
|
// Range over runes.
|
||||||
|
for _, c := range s {
|
||||||
|
thisDigit := isDigit(c)
|
||||||
|
if thisDigit != digit {
|
||||||
|
b.WriteByte(' ')
|
||||||
|
digit = thisDigit
|
||||||
|
}
|
||||||
|
b.WriteRune(c)
|
||||||
|
}
|
||||||
|
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func SpacePunctuation(s string) string {
|
||||||
|
needsSpace := func(r rune) bool {
|
||||||
|
switch r {
|
||||||
|
case '`', '~', '!', '@', '#', '%', '^', '&', '*', '(', ')',
|
||||||
|
'-', '_', '+', '=', '[', '{', ']', '}', '\\', '|',
|
||||||
|
':', ';', '"', '\'', ',', '<', '.', '>', '?', '/':
|
||||||
|
return true
|
||||||
|
}
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
|
||||||
|
b := strings.Builder{}
|
||||||
|
|
||||||
|
// Range over runes.
|
||||||
|
for _, r := range s {
|
||||||
|
if needsSpace(r) {
|
||||||
|
b.WriteRune(' ')
|
||||||
|
b.WriteRune(r)
|
||||||
|
b.WriteRune(' ')
|
||||||
|
} else {
|
||||||
|
b.WriteRune(r)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return b.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func CollapseSpaces(s string) string {
|
||||||
|
// Trim leading and trailing spaces.
|
||||||
|
s = strings.TrimSpace(s)
|
||||||
|
|
||||||
|
b := strings.Builder{}
|
||||||
|
wasSpace := false
|
||||||
|
|
||||||
|
// Range over runes.
|
||||||
|
for _, c := range s {
|
||||||
|
if unicode.IsSpace(c) {
|
||||||
|
wasSpace = true
|
||||||
|
continue
|
||||||
|
} else if wasSpace {
|
||||||
|
wasSpace = false
|
||||||
|
b.WriteRune(' ')
|
||||||
|
}
|
||||||
|
b.WriteRune(c)
|
||||||
|
}
|
||||||
|
|
||||||
|
return b.String()
|
||||||
|
}
|
|
@ -3,7 +3,7 @@ package tagengine
|
||||||
import "testing"
|
import "testing"
|
||||||
|
|
||||||
func TestSanitize(t *testing.T) {
|
func TestSanitize(t *testing.T) {
|
||||||
sanitize := newSanitizer()
|
sanitize := BasicSanitizer
|
||||||
|
|
||||||
type Case struct {
|
type Case struct {
|
||||||
In string
|
In string
|
||||||
|
@ -17,12 +17,7 @@ func TestSanitize(t *testing.T) {
|
||||||
{"abc123xyz", "abc 123 xyz"},
|
{"abc123xyz", "abc 123 xyz"},
|
||||||
{"1f2", "1 f 2"},
|
{"1f2", "1 f 2"},
|
||||||
{" abc", "abc"},
|
{" abc", "abc"},
|
||||||
{" ; KitKat/m&m's (bÖttle) @ ", "; kitkat / m & ms ( bottle ) @"},
|
{" ; KitKat/m&m's (bottle) @ ", "; kitkat / m & m ' s ( bottle ) @"},
|
||||||
{" Pott`s gin königs beer;SOJU ", "potts gin konigs beer ; soju"},
|
|
||||||
{"brot & brötchen", "brot & brotchen"},
|
|
||||||
{"Gâteau au fromage blanc, Stück", "gateau au fromage blanc , stuck"},
|
|
||||||
{"Maisels Weisse Weißbier 0,5l", "maisels weisse weissbier 0 , 5 l"},
|
|
||||||
{"Maisels´s Weisse - Hefeweizen 0,5l", "maiselss weisse - hefeweizen 0 , 5 l"},
|
|
||||||
{"€", "€"},
|
{"€", "€"},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue