Forked from subrubia

master
jdl 2021-09-09 12:25:53 +02:00
parent d04606923b
commit 0a77a882f1
14 changed files with 920 additions and 0 deletions

LICENSE
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 Suburbia

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

go.mod
@@ -0,0 +1,5 @@
module git.crumpington.com/public/tagengine

go 1.17

require golang.org/x/text v0.3.7

go.sum
@@ -0,0 +1,3 @@
golang.org/x/text v0.3.7 h1:olpwvP2KacW1ZWvsR7uQhoyTYvKAupfQrRGBFM352Gk=
golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=

ngram.go
@@ -0,0 +1,30 @@
package tagengine

import "unicode"

// ngramLength returns the number of whitespace-separated tokens in s. It
// scans byte-by-byte, so it assumes ASCII whitespace.
func ngramLength(s string) int {
    N := len(s)
    i := 0
    count := 0
    for {
        // Eat spaces.
        for i < N && unicode.IsSpace(rune(s[i])) {
            i++
        }
        // Done?
        if i == N {
            break
        }
        // Found the start of a token.
        count++
        // Eat non-spaces.
        for i < N && !unicode.IsSpace(rune(s[i])) {
            i++
        }
    }
    return count
}

ngram_test.go
@@ -0,0 +1,31 @@
package tagengine

import "testing"

func TestNGramLength(t *testing.T) {
    type Case struct {
        Input  string
        Length int
    }
    cases := []Case{
        {"a b c", 3},
        {" xyz\nlkj dflaj a", 4},
        {"a", 1},
        {" a", 1},
        {"a", 1},
        {" a\n", 1},
        {" a ", 1},
        {"\tx\ny\nz q ", 4},
    }
    for _, tc := range cases {
        length := ngramLength(tc.Input)
        if length != tc.Length {
            t.Fatalf("%s: %d != %d", tc.Input, length, tc.Length)
        }
    }
}

node.go
@@ -0,0 +1,79 @@
package tagengine

import (
    "fmt"
    "strings"
)

// node is one level of the rule-matching trie. Each rule is stored at the
// end of the path formed by its sorted Includes list, so rules that share a
// prefix of include tokens share nodes.
type node struct {
    Token    string
    Matches  []*Rule
    Children map[string]*node
}

func (n *node) AddRule(r *Rule) {
    n.addRule(r, 0)
}

func (n *node) addRule(r *Rule, idx int) {
    if len(r.Includes) == idx {
        n.Matches = append(n.Matches, r)
        return
    }
    token := r.Includes[idx]
    child, ok := n.Children[token]
    if !ok {
        child = &node{
            Token:    token,
            Children: map[string]*node{},
        }
        n.Children[token] = child
    }
    child.addRule(r, idx+1)
}

// Match returns all rules whose include sets are contained in the given
// tokens. The tokens must be sorted; this is the case for tokens produced by
// the Tokenize function.
func (n *node) Match(tokens []string) (rules []*Rule) {
    return n.match(tokens, rules)
}

func (n *node) match(tokens []string, rules []*Rule) []*Rule {
    // Check for a match.
    if n.Matches != nil {
        rules = append(rules, n.Matches...)
    }
    if len(tokens) == 0 {
        return rules
    }
    // Attempt to match children.
    for i := 0; i < len(tokens); i++ {
        token := tokens[i]
        if child, ok := n.Children[token]; ok {
            rules = child.match(tokens[i+1:], rules)
        }
    }
    return rules
}

func (n *node) Dump() {
    n.dump(0)
}

func (n *node) dump(depth int) {
    indent := strings.Repeat(" ", 2*depth)
    tag := ""
    for _, m := range n.Matches {
        tag += " " + m.Tag
    }
    fmt.Printf("%s%s%s\n", indent, n.Token, tag)
    for _, child := range n.Children {
        child.dump(depth + 1)
    }
}
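
The trie shape is easiest to see in a small in-package sketch (the tags and
tokens here are hypothetical). Two rules that share their first sorted
include token share a path from the root:

    root := &node{Token: "/", Children: map[string]*node{}}
    root.AddRule(&Rule{Tag: "cc", Includes: []string{"coca", "cola"}})
    root.AddRule(&Rule{Tag: "cz", Includes: []string{"coca", "zero"}})
    root.Dump()
    // /
    //   coca
    //     cola cc
    //     zero cz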

rule.go
@@ -0,0 +1,151 @@
package tagengine

// A Rule matches an input if all of its Includes are present as tokens and
// none of its Excludes are.
type Rule struct {
    Tag      string
    Includes []string
    Excludes []string
    Blocks   []string // List of blocked tags.

    MatchCount int // Number of inputs matched by this rule.
    FirstCount int // Number of inputs for which this rule was the best match.

    score    int
    excludes map[string]struct{}
}

// NewRule creates a rule for the given tag. Includes, excludes and blocked
// tags are attached with the Inc, Exc and Block builder methods.
func NewRule(tag string) Rule {
    return Rule{Tag: tag}
}

// Inc returns a copy of the rule with its include list set to l.
func (r Rule) Inc(l ...string) Rule {
    return Rule{
        Tag:      r.Tag,
        Includes: l,
        Excludes: r.Excludes,
        Blocks:   r.Blocks,
    }
}

// Exc returns a copy of the rule with its exclude list set to l.
func (r Rule) Exc(l ...string) Rule {
    return Rule{
        Tag:      r.Tag,
        Includes: r.Includes,
        Excludes: l,
        Blocks:   r.Blocks,
    }
}

// Block returns a copy of the rule with its blocked-tag list set to l.
func (r Rule) Block(l ...string) Rule {
    return Rule{
        Tag:      r.Tag,
        Includes: r.Includes,
        Excludes: r.Excludes,
        Blocks:   l,
    }
}

// normalize sanitizes and sorts the rule's includes and excludes, and
// precomputes its exclude set and score.
func (rule *Rule) normalize() {
    sanitize := newSanitizer()
    for i, token := range rule.Includes {
        rule.Includes[i] = sanitize(token)
    }
    for i, token := range rule.Excludes {
        rule.Excludes[i] = sanitize(token)
    }
    sortTokens(rule.Includes)
    sortTokens(rule.Excludes)
    rule.excludes = map[string]struct{}{}
    for _, s := range rule.Excludes {
        rule.excludes[s] = struct{}{}
    }
    rule.score = rule.computeScore()
}

// maxNGram returns the token count of the rule's longest include or exclude
// ngram.
func (r Rule) maxNGram() int {
    max := 0
    for _, s := range r.Includes {
        n := ngramLength(s)
        if n > max {
            max = n
        }
    }
    for _, s := range r.Excludes {
        n := ngramLength(s)
        if n > max {
            max = n
        }
    }
    return max
}

// isExcluded reports whether any of the given tokens is in the rule's
// exclude set.
func (r Rule) isExcluded(tokens []string) bool {
    // This is most often the case.
    if len(r.excludes) == 0 {
        return false
    }
    for _, s := range tokens {
        if _, ok := r.excludes[s]; ok {
            return true
        }
    }
    return false
}

// computeScore scores the rule by its includes: an include of n tokens
// contributes n*(n+1)/2, so longer ngrams count for more.
func (r Rule) computeScore() (score int) {
    for _, token := range r.Includes {
        n := ngramLength(token)
        score += n * (n + 1) / 2
    }
    return score
}

// ruleLess defines a strict ordering on rules: by score, then include count,
// then exclude count, then alphabetically by includes, excludes and tag.
func ruleLess(lhs, rhs *Rule) bool {
    // If scores differ, sort by score.
    if lhs.score != rhs.score {
        return lhs.score < rhs.score
    }
    // If include depth differs, sort by depth.
    lDepth := len(lhs.Includes)
    rDepth := len(rhs.Includes)
    if lDepth != rDepth {
        return lDepth < rDepth
    }
    // If exclude depth differs, sort by depth.
    lDepth = len(lhs.Excludes)
    rDepth = len(rhs.Excludes)
    if lDepth != rDepth {
        return lDepth < rDepth
    }
    // Sort alphabetically by includes.
    for i := range lhs.Includes {
        if lhs.Includes[i] != rhs.Includes[i] {
            return lhs.Includes[i] < rhs.Includes[i]
        }
    }
    // Sort alphabetically by excludes.
    for i := range lhs.Excludes {
        if lhs.Excludes[i] != rhs.Excludes[i] {
            return lhs.Excludes[i] < rhs.Excludes[i]
        }
    }
    // Sort by tag.
    if lhs.Tag != rhs.Tag {
        return lhs.Tag < rhs.Tag
    }
    return false
}
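
As a sketch of the builder methods (the tag name is hypothetical): each call
returns a copy with the corresponding list replaced, so calls chain
naturally.

    // Matches inputs containing both "coca" and "zero", unless "pepsi" also
    // appears. After normalize(), the score is 1 + 1 = 2 (two 1-token
    // includes).
    r := NewRule("coke/zero").Inc("coca", "zero").Exc("pepsi")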

rulegroup.go
@@ -0,0 +1,57 @@
package tagengine

// A RuleGroup can be converted into a list of rules. Each rule will have the
// same tag, exclude list and blocked-tag list; each include set yields one
// rule.
type RuleGroup struct {
    Tag      string
    Includes [][]string
    Excludes []string
    Blocks   []string
}

func NewRuleGroup(tag string) RuleGroup {
    return RuleGroup{
        Tag:      tag,
        Includes: [][]string{},
        Excludes: []string{},
        Blocks:   []string{},
    }
}

// Inc adds one include set to the group.
func (g RuleGroup) Inc(l ...string) RuleGroup {
    return RuleGroup{
        Tag:      g.Tag,
        Includes: append(g.Includes, l),
        Excludes: g.Excludes,
        Blocks:   g.Blocks,
    }
}

// Exc sets the group's exclude list.
func (g RuleGroup) Exc(l ...string) RuleGroup {
    return RuleGroup{
        Tag:      g.Tag,
        Includes: g.Includes,
        Excludes: l,
        Blocks:   g.Blocks,
    }
}

// Block sets the group's blocked-tag list.
func (g RuleGroup) Block(l ...string) RuleGroup {
    return RuleGroup{
        Tag:      g.Tag,
        Includes: g.Includes,
        Excludes: g.Excludes,
        Blocks:   l,
    }
}

// ToList expands the group into one Rule per include set.
func (rg RuleGroup) ToList() (l []Rule) {
    for _, includes := range rg.Includes {
        l = append(l, Rule{
            Tag:      rg.Tag,
            Includes: includes,
            Excludes: rg.Excludes,
            Blocks:   rg.Blocks,
        })
    }
    return
}
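
A sketch of group expansion (hypothetical tag): unlike Rule.Inc, RuleGroup.Inc
appends an include set, so one group can cover several spellings of the same
tag.

    g := NewRuleGroup("cola").
        Inc("coca cola").
        Inc("coca", "cola").
        Exc("pepsi")
    rules := g.ToList() // Two rules tagged "cola", each excluding "pepsi".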

ruleset.go
@@ -0,0 +1,181 @@
package tagengine

import (
    "sort"
)

// A RuleSet stores rules in a token trie and matches them against input
// strings.
type RuleSet struct {
    root     *node
    maxNgram int
    sanitize func(...string) string
    rules    []*Rule
}

func NewRuleSet() *RuleSet {
    return &RuleSet{
        root: &node{
            Token:    "/",
            Children: map[string]*node{},
        },
        sanitize: newSanitizer(),
        rules:    []*Rule{},
    }
}

func NewRuleSetFromList(rules []Rule) *RuleSet {
    rs := NewRuleSet()
    rs.AddRule(rules...)
    return rs
}

// Add accepts any mix of Rule and RuleGroup values; it panics on any other
// type.
func (t *RuleSet) Add(ruleOrGroup ...interface{}) {
    for _, ix := range ruleOrGroup {
        switch x := ix.(type) {
        case Rule:
            t.AddRule(x)
        case RuleGroup:
            t.AddRuleGroup(x)
        default:
            panic("Add expects either Rule or RuleGroup objects.")
        }
    }
}

func (t *RuleSet) AddRule(rules ...Rule) {
    for _, rule := range rules {
        rule := rule // Copy: the rule is stored by pointer below.
        // Make sure rule is well-formed.
        rule.normalize()
        // Update maxNgram.
        N := rule.maxNGram()
        if N > t.maxNgram {
            t.maxNgram = N
        }
        t.rules = append(t.rules, &rule)
        t.root.AddRule(&rule)
    }
}

func (t *RuleSet) AddRuleGroup(ruleGroups ...RuleGroup) {
    for _, rg := range ruleGroups {
        t.AddRule(rg.ToList()...)
    }
}

// MatchRules returns a list of all matching rules, sorted by the match's
// score. The best match comes first.
func (t *RuleSet) MatchRules(input string) (rules []*Rule) {
    input = t.sanitize(input)
    tokens := Tokenize(input, t.maxNgram)
    rules = t.root.Match(tokens)
    if len(rules) == 0 {
        return rules
    }
    // Check excludes.
    l := rules[:0]
    for _, r := range rules {
        if !r.isExcluded(tokens) {
            l = append(l, r)
        }
    }
    rules = l
    // Sort rules descending.
    sort.Slice(rules, func(i, j int) bool {
        return ruleLess(rules[j], rules[i])
    })
    // Update rule stats.
    if len(rules) > 0 {
        rules[0].FirstCount++
        for _, r := range rules {
            r.MatchCount++
        }
    }
    return rules
}

type Match struct {
    Tag        string
    Confidence float64 // In the range (0,1].
}

// Match returns the matching tags with confidences. Each matched rule
// contributes its score to its tag, and the confidences are normalized to
// sum to 1.
func (t *RuleSet) Match(input string) []Match {
    rules := t.MatchRules(input)
    if len(rules) == 0 {
        return []Match{}
    }
    if len(rules) == 1 {
        return []Match{{
            Tag:        rules[0].Tag,
            Confidence: 1,
        }}
    }
    // Create list of blocked tags.
    blocks := map[string]struct{}{}
    for _, rule := range rules {
        for _, tag := range rule.Blocks {
            blocks[tag] = struct{}{}
        }
    }
    // Remove rules for blocked tags.
    iOut := 0
    for _, rule := range rules {
        if _, ok := blocks[rule.Tag]; ok {
            continue
        }
        rules[iOut] = rule
        iOut++
    }
    rules = rules[:iOut]
    // Matches by index.
    matches := map[string]int{}
    out := []Match{}
    sum := float64(0)
    for _, rule := range rules {
        idx, ok := matches[rule.Tag]
        if !ok {
            idx = len(matches)
            matches[rule.Tag] = idx
            out = append(out, Match{Tag: rule.Tag})
        }
        out[idx].Confidence += float64(rule.score)
        sum += float64(rule.score)
    }
    for i := range out {
        out[i].Confidence /= sum
    }
    return out
}

// ListRules returns the rules in the set sorted by FirstCount (the number of
// times a rule was the best match to an input), then by MatchCount, then by
// tag.
func (t *RuleSet) ListRules() []*Rule {
    sort.Slice(t.rules, func(i, j int) bool {
        if t.rules[j].FirstCount != t.rules[i].FirstCount {
            return t.rules[j].FirstCount < t.rules[i].FirstCount
        }
        if t.rules[j].MatchCount != t.rules[i].MatchCount {
            return t.rules[j].MatchCount < t.rules[i].MatchCount
        }
        return t.rules[j].Tag < t.rules[i].Tag
    })
    return t.rules
}
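
To make the confidence arithmetic concrete, here is a minimal sketch using
two rules in the style of the test below. The 2-token include scores
2*3/2 = 3, the two 1-token includes score 1 + 1 = 2, and each tag's total is
divided by the sum:

    rs := NewRuleSetFromList([]Rule{
        {Tag: "cc/0", Includes: []string{"coca cola"}},
        {Tag: "cc/2", Includes: []string{"cola", "coca"}},
    })
    matches := rs.Match("Coca Cola")
    // => [{cc/0 0.6} {cc/2 0.4}]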

ruleset_test.go
@@ -0,0 +1,84 @@
package tagengine

import (
    "reflect"
    "testing"
)

func TestRuleSet(t *testing.T) {
    rs := NewRuleSet()
    rs.AddRule(Rule{
        Tag:      "cc/2",
        Includes: []string{"cola", "coca"},
    })
    rs.AddRule(Rule{
        Tag:      "cc/0",
        Includes: []string{"coca cola"},
    })
    rs.AddRule(Rule{
        Tag:      "cz/2",
        Includes: []string{"coca", "zero"},
    })
    rs.AddRule(Rule{
        Tag:      "cc0/3",
        Includes: []string{"zero", "coca", "cola"},
    })
    rs.AddRule(Rule{
        Tag:      "cc0/3.1",
        Includes: []string{"coca", "cola", "zero"},
        Excludes: []string{"pepsi"},
    })
    rs.AddRule(Rule{
        Tag:      "spa",
        Includes: []string{"spa"},
        Blocks:   []string{"cc/0", "cc0/3", "cc0/3.1"},
    })
    type TestCase struct {
        Input   string
        Matches []Match
    }
    cases := []TestCase{
        {
            Input: "coca-cola zero",
            Matches: []Match{
                {"cc0/3.1", 0.3},
                {"cc0/3", 0.3},
                {"cz/2", 0.2},
                {"cc/2", 0.2},
            },
        }, {
            Input: "coca cola",
            Matches: []Match{
                {"cc/0", 0.6},
                {"cc/2", 0.4},
            },
        }, {
            Input: "coca cola zero pepsi",
            Matches: []Match{
                {"cc0/3", 0.3},
                {"cc/0", 0.3},
                {"cz/2", 0.2},
                {"cc/2", 0.2},
            },
        }, {
            Input:   "fanta orange",
            Matches: []Match{},
        }, {
            Input: "coca-cola zero / fanta / spa",
            Matches: []Match{
                {"cz/2", 0.4},
                {"cc/2", 0.4},
                {"spa", 0.2},
            },
        },
    }
    for _, tc := range cases {
        matches := rs.Match(tc.Input)
        if !reflect.DeepEqual(matches, tc.Matches) {
            t.Fatalf("%v != %v", matches, tc.Matches)
        }
    }
}

sanitize.go
@@ -0,0 +1,125 @@
package tagengine

import (
    "strings"
    "unicode"

    "golang.org/x/text/runes"
    "golang.org/x/text/transform"
    "golang.org/x/text/unicode/norm"
)

// newSanitizer returns a function that joins its arguments, lowercases the
// result, normalizes "ß" and possessive apostrophes, strips diacritics, and
// normalizes spacing around numbers and punctuation.
func newSanitizer() func(...string) string {
    diacriticsFix := transform.Chain(norm.NFD, runes.Remove(runes.In(unicode.Mn)), norm.NFC)
    return func(l ...string) string {
        s := strings.Join(l, " ")
        // Lowercase.
        s = strings.ToLower(s)
        // Replace "ß" and drop apostrophes in possessives.
        s = strings.ReplaceAll(s, "ß", "ss")
        s = strings.ReplaceAll(s, "'s", "s")
        s = strings.ReplaceAll(s, "`s", "s")
        s = strings.ReplaceAll(s, "´s", "s")
        // Remove diacritics.
        if out, _, err := transform.String(diacriticsFix, s); err == nil {
            s = out
        }
        // Clean spaces.
        s = spaceNumbers(s)
        s = addSpaces(s)
        s = collapseSpaces(s)
        return s
    }
}

// spaceNumbers inserts a space at each boundary between a digit run and a
// non-digit run.
func spaceNumbers(s string) string {
    if len(s) == 0 {
        return s
    }
    isDigit := func(b rune) bool {
        switch b {
        case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
            return true
        }
        return false
    }
    b := strings.Builder{}
    var first rune
    for _, c := range s {
        first = c
        break
    }
    digit := isDigit(first)
    // Range over runes.
    for _, c := range s {
        thisDigit := isDigit(c)
        if thisDigit != digit {
            b.WriteByte(' ')
            digit = thisDigit
        }
        b.WriteRune(c)
    }
    return b.String()
}

// addSpaces puts spaces around punctuation so that it tokenizes separately.
func addSpaces(s string) string {
    needsSpace := func(r rune) bool {
        switch r {
        case '`', '~', '!', '@', '#', '%', '^', '&', '*', '(', ')',
            '-', '_', '+', '=', '[', '{', ']', '}', '\\', '|',
            ':', ';', '"', '\'', ',', '<', '.', '>', '?', '/':
            return true
        }
        return false
    }
    b := strings.Builder{}
    // Range over runes.
    for _, r := range s {
        if needsSpace(r) {
            b.WriteRune(' ')
            b.WriteRune(r)
            b.WriteRune(' ')
        } else {
            b.WriteRune(r)
        }
    }
    return b.String()
}

// collapseSpaces trims the string and collapses runs of whitespace into
// single spaces.
func collapseSpaces(s string) string {
    // Trim leading and trailing spaces.
    s = strings.TrimSpace(s)
    b := strings.Builder{}
    wasSpace := false
    // Range over runes.
    for _, c := range s {
        if unicode.IsSpace(c) {
            wasSpace = true
            continue
        } else if wasSpace {
            wasSpace = false
            b.WriteRune(' ')
        }
        b.WriteRune(c)
    }
    return b.String()
}

sanitize_test.go
@@ -0,0 +1,35 @@
package tagengine

import "testing"

func TestSanitize(t *testing.T) {
    sanitize := newSanitizer()
    type Case struct {
        In  string
        Out string
    }
    cases := []Case{
        {"", ""},
        {"123abc", "123 abc"},
        {"abc123", "abc 123"},
        {"abc123xyz", "abc 123 xyz"},
        {"1f2", "1 f 2"},
        {" abc", "abc"},
        {" ; KitKat/m&m's (bÖttle) @ ", "; kitkat / m & ms ( bottle ) @"},
        {" Pott`s gin königs beer;SOJU ", "potts gin konigs beer ; soju"},
        {"brot & brötchen", "brot & brotchen"},
        {"Gâteau au fromage blanc, Stück", "gateau au fromage blanc , stuck"},
        {"Maisels Weisse Weißbier 0,5l", "maisels weisse weissbier 0 , 5 l"},
        {"Maisels´s Weisse - Hefeweizen 0,5l", "maiselss weisse - hefeweizen 0 , 5 l"},
        {"€", "€"},
    }
    for _, tc := range cases {
        out := sanitize(tc.In)
        if out != tc.Out {
            t.Fatalf("%v != %v", out, tc.Out)
        }
    }
}

tokenize.go
@@ -0,0 +1,63 @@
package tagengine

import (
    "sort"
    "strings"
)

// Tokens that are ignored when they stand alone.
var ignoreTokens = map[string]struct{}{}

func init() {
    tokens := []string{
        "`", `~`, `!`, `@`, `#`, `%`, `^`, `&`, `*`, `(`, `)`,
        `-`, `_`, `+`, `=`, `[`, `{`, `]`, `}`, `\`, `|`,
        `:`, `;`, `"`, `'`, `,`, `<`, `.`, `>`, `?`, `/`,
    }
    for _, s := range tokens {
        ignoreTokens[s] = struct{}{}
    }
}

// Tokenize splits the input into fields and returns all distinct ngrams of
// up to maxNgram fields, sorted by length and then alphabetically.
func Tokenize(
    input string,
    maxNgram int,
) (
    tokens []string,
) {
    // Track ngrams already emitted to avoid duplicates.
    seen := map[string]bool{}
    fields := strings.Fields(input)
    if len(fields) < maxNgram {
        maxNgram = len(fields)
    }
    for i := 1; i < maxNgram+1; i++ {
        jMax := len(fields) - i + 1
        for j := 0; j < jMax; j++ {
            ngram := strings.Join(fields[j:i+j], " ")
            if _, ok := ignoreTokens[ngram]; !ok {
                if !seen[ngram] {
                    tokens = append(tokens, ngram)
                    seen[ngram] = true
                }
            }
        }
    }
    sortTokens(tokens)
    return tokens
}

// sortTokens sorts tokens by length, then alphabetically.
func sortTokens(tokens []string) {
    sort.Slice(tokens, func(i, j int) bool {
        if len(tokens[i]) != len(tokens[j]) {
            return len(tokens[i]) < len(tokens[j])
        }
        return tokens[i] < tokens[j]
    })
}
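
For example, a quick sketch of the expansion and ordering:

    tokens := Tokenize("coca cola zero", 2)
    // => ["coca", "cola", "zero", "coca cola", "cola zero"]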

tokenize_test.go
@@ -0,0 +1,55 @@
package tagengine

import (
    "reflect"
    "testing"
)

func TestTokenize(t *testing.T) {
    type Case struct {
        Input    string
        MaxNgram int
        Output   []string
    }
    cases := []Case{
        {
            Input:    "a bb c d",
            MaxNgram: 3,
            Output: []string{
                "a", "c", "d", "bb",
                "c d", "a bb", "bb c",
                "a bb c", "bb c d",
            },
        }, {
            Input:    "a b",
            MaxNgram: 3,
            Output: []string{
                "a", "b", "a b",
            },
        }, {
            Input:    "- b c d",
            MaxNgram: 3,
            Output: []string{
                "b", "c", "d",
                "- b", "b c", "c d",
                "- b c", "b c d",
            },
        }, {
            Input:    "a a b c d c d",
            MaxNgram: 3,
            Output: []string{
                "a", "b", "c", "d",
                "a a", "a b", "b c", "c d", "d c",
                "a a b", "a b c", "b c d", "c d c", "d c d",
            },
        },
    }
    for _, tc := range cases {
        output := Tokenize(tc.Input, tc.MaxNgram)
        if !reflect.DeepEqual(output, tc.Output) {
            t.Fatalf("%s: %#v", tc.Input, output)
        }
    }
}