From e7fa07cfae7d66f92da933f7c4666a2d8f30fd4e Mon Sep 17 00:00:00 2001 From: Vivek Patel Date: Thu, 25 Aug 2022 13:59:02 +0530 Subject: [PATCH] Extends DeDup capabilities Support value modifiers for value search and allow selecting the matching criteria for multi-value fields - Removed name from rule for now - Supported value modifiers are ignore-case, case-sensitive, fuzzy-match, sounds-like - Multi-value matching criteria are one-of, equal - Migrate the RecordDeDup module config by adding an upgrade fix that converts module.config.recordDeDup to the latest DeDupRule struct. --- server/compose/service/record.go | 60 ++++--- server/compose/types/module.go | 5 +- .../compose/types/record_detect_duplicates.go | 160 ++++++++++++------ .../types/record_detect_duplicates_test.go | 126 ++++++++++++++ server/compose/types/validated.go | 21 +++ server/pkg/str/levenshtein.go | 52 ++++++ server/pkg/str/levenshtein_test.go | 39 +++++ server/pkg/str/soundex.go | 104 ++++++++++++ server/pkg/str/soundex_test.go | 64 +++++++ server/pkg/str/str.go | 31 ++++ server/store/adapters/rdbms/upgrade_fixes.go | 130 +++++++++++++- 11 files changed, 716 insertions(+), 76 deletions(-) create mode 100644 server/compose/types/record_detect_duplicates_test.go create mode 100644 server/pkg/str/levenshtein.go create mode 100644 server/pkg/str/levenshtein_test.go create mode 100644 server/pkg/str/soundex.go create mode 100644 server/pkg/str/soundex_test.go create mode 100644 server/pkg/str/str.go diff --git a/server/compose/service/record.go b/server/compose/service/record.go index 3a6d54669..75af91220 100644 --- a/server/compose/service/record.go +++ b/server/compose/service/record.go @@ -571,6 +571,12 @@ func (svc record) Bulk(ctx context.Context, oo ...*types.RecordBulkOperation) (r // before we start storing any changes rves = &types.RecordValueErrorSet{} + // duplication errors + ddes = &types.RecordValueErrorSet{} + + // merge of record value errors and duplication errors + ee = 
&types.RecordValueErrorSet{} + action func(props ...*recordActionProps) *recordAction r *types.Record @@ -599,11 +605,11 @@ func (svc record) Bulk(ctx context.Context, oo ...*types.RecordBulkOperation) (r switch p.Operation { case types.OperationTypeCreate: action = RecordActionCreate - r, dd, err = svc.create(ctx, r) + r, ddes, err = svc.create(ctx, r) case types.OperationTypeUpdate: action = RecordActionUpdate - r, dd, err = svc.update(ctx, r) + r, ddes, err = svc.update(ctx, r) case types.OperationTypeDelete: action = RecordActionDelete @@ -613,8 +619,13 @@ func (svc record) Bulk(ctx context.Context, oo ...*types.RecordBulkOperation) (r aProp.setChanged(r) // Attach meta ID to each value error for FE identification - if !dd.HasStrictErrors() && r != nil { - dd.SetMetaID(r.ID) + if !ddes.HasStrictErrors() && r != nil { + ddes.SetMetaID(r.ID) + } + if !ddes.IsValid() && dd == nil { + dd = ddes + } else { + dd.Merge(ddes) } if rve := types.IsRecordValueErrorSet(err); rve != nil { @@ -644,9 +655,14 @@ func (svc record) Bulk(ctx context.Context, oo ...*types.RecordBulkOperation) (r } } - if !rves.IsValid() { + // merge record value errors and strict duplication errors + if dd.HasStrictErrors() { + ee.Merge(rves, dd) + } + + if !ee.IsValid() { // Any errors gathered? 
- return RecordErrValueInput().Wrap(rves) + return RecordErrValueInput().Wrap(ee) } return nil @@ -701,10 +717,19 @@ func (svc record) create(ctx context.Context, new *types.Record) (rec *types.Rec new.SetModule(m) { + // handle deDup error/warnings + dd, err = svc.DupDetection(ctx, m, new) + + // handle input payload errors if rve = svc.procCreate(ctx, invokerID, m, new); !rve.IsValid() { return nil, dd, RecordErrValueInput().Wrap(rve) } + // record value errors from dup detection + if err != nil { + return + } + if err = svc.eventbus.WaitFor(ctx, event.RecordBeforeCreate(new, nil, m, ns, rve, nil)); err != nil { return } else if !rve.IsValid() { @@ -714,11 +739,6 @@ func (svc record) create(ctx context.Context, new *types.Record) (rec *types.Rec new.Values = RecordValueDefaults(m, new.Values) - dd, err = svc.DupDetection(ctx, m, new) - if err != nil { - return - } - // Handle payload from automation scripts if rve = svc.procCreate(ctx, invokerID, m, new); !rve.IsValid() { return nil, dd, RecordErrValueInput().Wrap(rve) @@ -996,17 +1016,20 @@ func (svc record) update(ctx context.Context, upd *types.Record) (rec *types.Rec upd.SetModule(m) old.SetModule(m) - dd, err = svc.DupDetection(ctx, m, upd) - if err != nil { - return - } - { - // Handle input payload + // handle deDup error/warnings + dd, err = svc.DupDetection(ctx, m, upd) + + // handle input payload errors if rve = svc.procUpdate(ctx, invokerID, m, upd, old); !rve.IsValid() { return nil, dd, RecordErrValueInput().Wrap(rve) } + // record value errors from dup detection + if err != nil { + return + } + // Scripts can (besides simple error value) return complex record value error set // that is passed back to the UI or any other API consumer // @@ -1833,7 +1856,6 @@ func (svc record) DupDetection(ctx context.Context, m *types.Module, rec *types. 
return } - // @todo: improve error string with details rProps.setValueErrors(out) // Error out if duplicate record exist @@ -2036,7 +2058,7 @@ fields: val.Value = pickRandomID(recRefs[refModID]) case "select": - //val.Value = src.Select(f.Options) + // val.Value = src.Select(f.Options) continue fields case "url": diff --git a/server/compose/types/module.go b/server/compose/types/module.go index 2b1c726d3..5e8a68bb3 100644 --- a/server/compose/types/module.go +++ b/server/compose/types/module.go @@ -93,12 +93,9 @@ type ( } ModuleConfigRecordDeDup struct { - // enable or disable duplicate detection - Enabled bool `json:"enabled"` - // strictly restrict record saving // otherwise show a warning with list of duplicated records - Strict bool `json:"strict"` + Strict bool `json:"-"` // list of duplicate detection rules applied to module's fields Rules DeDupRuleSet `json:"rules,omitempty"` diff --git a/server/compose/types/record_detect_duplicates.go b/server/compose/types/record_detect_duplicates.go index b597bd9ee..2b093b9af 100644 --- a/server/compose/types/record_detect_duplicates.go +++ b/server/compose/types/record_detect_duplicates.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "github.com/cortezaproject/corteza/server/pkg/locale" + "github.com/cortezaproject/corteza/server/pkg/str" "github.com/spf13/cast" "strings" ) @@ -18,23 +19,44 @@ type ( } DeDupRule struct { - Name DeDupRuleName `json:"name"` - Strict bool `json:"strict"` - Attributes []string `json:"attributes"` + Name DeDupRuleName `json:"name"` + Strict bool `json:"strict"` + ErrorMessage string `json:"errorMessage"` + ConstraintSet DeDupRuleConstraintSet `json:"constraints"` } + DeDupRuleConstraint struct { + Attribute string `json:"attribute"` + Modifier DeDupValueModifier `json:"modifier"` + MultiValue DeDupMultiValueConstraint `json:"multiValue"` + } + + DeDupRuleConstraintSet []*DeDupRuleConstraint + // DeDupRuleName represent the identifier for duplicate detection rule DeDupRuleName string + // 
DeDupValueModifier represent the algorithm used to check value string + DeDupValueModifier string + + // DeDupMultiValueConstraint for matching multi values accordingly + DeDupMultiValueConstraint string + // DeDupIssueKind based on strict mode rule or duplication config DeDupIssueKind string ) const ( - caseSensitive DeDupRuleName = "case-sensitive" + ignoreCase DeDupValueModifier = "ignore-case" + caseSensitive DeDupValueModifier = "case-sensitive" + fuzzyMatch DeDupValueModifier = "fuzzy-match" + soundsLike DeDupValueModifier = "sounds-like" - dupWarning DeDupIssueKind = "duplication_warning" - dupError DeDupIssueKind = "duplication_error" + oneOf DeDupMultiValueConstraint = "one-of" + equal DeDupMultiValueConstraint = "equal" + + deDupWarning DeDupIssueKind = "duplication_warning" + deDupError DeDupIssueKind = "duplication_error" ) func DeDup() *deDup { @@ -47,7 +69,7 @@ func (d deDup) CheckDuplication(ctx context.Context, rules DeDupRuleSet, rec Rec out = &RecordValueErrorSet{} err = rules.Walk(func(rule *DeDupRule) error { if rule.HasAttributes() { - values := rr.GetValuesByName(distinct(rule.Attributes)...) + values := rr.GetValuesByName(distinct(rule.Attributes())...) 
set := rule.validateValue(ctx, d.ls, rec, values) @@ -72,7 +94,14 @@ func (rule DeDupIssueKind) String() string { } func (rule DeDupRule) HasAttributes() bool { - return len(rule.Attributes) > 0 + return len(rule.ConstraintSet) > 0 && len(rule.Attributes()) > 0 +} + +func (rule DeDupRule) Attributes() (out []string) { + for _, c := range rule.ConstraintSet { + out = append(out, c.Attribute) + } + return } func (rule DeDupRule) IsStrict() bool { @@ -80,9 +109,9 @@ func (rule DeDupRule) IsStrict() bool { } func (rule DeDupRule) IssueKind() string { - out := dupWarning + out := deDupWarning if rule.Strict { - out = dupError + out = deDupError } return out.String() @@ -93,43 +122,56 @@ func (rule DeDupRule) IssueMessage() (out string) { } func (rule DeDupRule) String() string { - return fmt.Sprintf("%s duplicate detection on `%s` field", rule.Name, strings.Join(rule.Attributes, ", ")) + return fmt.Sprintf("%s duplicate detection on `%s` field", rule.Name, strings.Join(rule.Attributes(), ", ")) } // validateValue will check duplicate detection based on rules name func (rule DeDupRule) validateValue(ctx context.Context, ls localeService, rec Record, vv RecordValueSet) (out *RecordValueErrorSet) { - switch rule.Name { - case caseSensitive: - return rule.checkCaseSensitiveDuplication(ctx, ls, rec, vv) - default: - return rule.checkCaseSensitiveDuplication(ctx, ls, rec, vv) - } + return rule.checkCaseSensitiveDuplication(ctx, ls, rec, vv) } func (rule DeDupRule) checkCaseSensitiveDuplication(ctx context.Context, ls localeService, rec Record, vv RecordValueSet) (out *RecordValueErrorSet) { - out = &RecordValueErrorSet{} - recVal := rec.Values + var ( + recVal = rec.Values + ) - for _, a := range rule.Attributes { - rv := recVal.Get(a, 0) - if rv == nil { + for _, c := range rule.ConstraintSet { + rvv := recVal.FilterByName(c.Attribute) + if rvv.Len() == 0 { continue } + var ( + valErr = &RecordValueErrorSet{} + ) + _ = vv.Walk(func(v *RecordValue) error { if v.RecordID != 
rec.ID { - if toLower(v.Value) == toLower(rv.Value) { - out.Push(RecordValueError{ - Kind: rule.IssueKind(), - Message: ls.T(ctx, "compose", rule.IssueMessage()), - Meta: map[string]interface{}{ - "field": v.Name, - "value": v.Value, - "dupValueField": rv.Name, - "recordID": cast.ToString(v.RecordID), - "rule": rule.String(), - }, - }) + _ = rvv.Walk(func(rv *RecordValue) error { + if len(rv.Value) > 0 && matchValue(c.Modifier, rv.Value, v.Value) { + valErr.Push(RecordValueError{ + Kind: rule.IssueKind(), + Message: ls.T(ctx, "compose", rule.IssueMessage()), + Meta: map[string]interface{}{ + "field": v.Name, + "value": v.Value, + "dupValueField": rv.Name, + "recordID": cast.ToString(v.RecordID), + "rule": rule.String(), + }, + }) + } + return nil + }) + + // 1. multiValue is empty, then all value needs to be a match then return error/warning + // 2. multiValue is oneOf, then one or more value needs to be a match then return error/warning + // 3. multiValue is equal, then all value needs to be a match then return error/warning + if (!valErr.IsValid() && (!c.HasMultiValue() || c.IsAllEqual()) && valErr.Len() == rvv.Len()) || (c.IsOneOf() && valErr.Len() > 0) { + if out == nil { + out = &RecordValueErrorSet{} + } + out.Push(valErr.Set...) 
} } return nil @@ -139,6 +181,23 @@ func (rule DeDupRule) checkCaseSensitiveDuplication(ctx context.Context, ls loca return } +func (c DeDupRuleConstraint) HasMultiValue() bool { + switch c.MultiValue { + case oneOf, equal: + return true + default: + return false + } +} + +func (c DeDupRuleConstraint) IsAllEqual() bool { + return c.MultiValue == equal +} + +func (c DeDupRuleConstraint) IsOneOf() bool { + return c.MultiValue == oneOf +} + func (v *RecordValueErrorSet) SetMetaID(id uint64) { if v.IsValid() { return @@ -154,21 +213,7 @@ func (v *RecordValueErrorSet) SetMetaID(id uint64) { } func (v *RecordValueErrorSet) HasStrictErrors() bool { - return v.HasKind(dupError.String()) -} - -// CaseSensitiveDuplicationRule prepares the case-sensitive duplicate detection rule -func CaseSensitiveDuplicationRule(strict bool, identifiers ...string) DeDupRule { - return makeDuplicationRule(caseSensitive, strict, identifiers...) -} - -// makeDuplicationRule prepares duplication detection rules -func makeDuplicationRule(name DeDupRuleName, strict bool, attributes ...string) DeDupRule { - return DeDupRule{ - Name: name, - Strict: strict, - Attributes: attributes, - } + return v.HasKind(deDupError.String()) } // distinct only list the different (distinct) values @@ -183,6 +228,19 @@ func distinct(input []string) (out []string) { return } -func toLower(s string) string { - return strings.ToLower(s) +// matchValue will check if the input matches with target string as per the modifier +func matchValue(modifier DeDupValueModifier, input string, target string) bool { + switch modifier { + case ignoreCase: + return str.Match(input, target, str.CaseInSensitiveMatch) + case caseSensitive: + return str.Match(input, target, str.CaseSensitiveMatch) + case fuzzyMatch: + return str.Match(input, target, str.LevenshteinDistance) + case soundsLike: + return str.Match(input, target, str.Soundex) + default: + // ignoreCase as default, if not specified + return str.Match(input, target, 
str.CaseInSensitiveMatch) + } } diff --git a/server/compose/types/record_detect_duplicates_test.go b/server/compose/types/record_detect_duplicates_test.go new file mode 100644 index 000000000..f50f8b0a8 --- /dev/null +++ b/server/compose/types/record_detect_duplicates_test.go @@ -0,0 +1,126 @@ +package types + +import ( + "context" + "github.com/cortezaproject/corteza/server/pkg/locale" + "github.com/spf13/cast" + "github.com/stretchr/testify/require" + "testing" +) + +func TestDeDupRule_checkCaseSensitiveDuplication(t *testing.T) { + var ( + req = require.New(t) + ctx = context.Background() + ls = locale.Global() + + rule1 = DeDupRule{ + Name: "", + Strict: true, + ConstraintSet: []*DeDupRuleConstraint{ + { + Attribute: "name", + Modifier: ignoreCase, + }, + }, + } + + tests = []struct { + name string + rule DeDupRule + rec Record + vv RecordValueSet + wantOut *RecordValueErrorSet + }{ + { + name: "no duplication", + rule: rule1, + rec: Record{ + ID: 1, + Values: RecordValueSet{ + &RecordValue{ + RecordID: 1, + Name: "name", + Value: "test", + }, + }, + }, + vv: RecordValueSet{ + &RecordValue{ + RecordID: 2, + Name: "name", + Value: "test", + }, + }, + wantOut: &RecordValueErrorSet{ + Set: []RecordValueError{ + { + Kind: deDupError.String(), + Message: rule1.IssueMessage(), + Meta: map[string]interface{}{ + "field": "name", + "value": "test", + "dupValueField": "name", + "recordID": cast.ToString(2), + "rule": rule1.String(), + }, + }, + }, + }, + }, + } + ) + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotOut := tt.rule.checkCaseSensitiveDuplication(ctx, ls, tt.rec, tt.vv) + req.Equal(tt.wantOut, gotOut, "checkCaseSensitiveDuplication() = %v, want %v", gotOut, tt.wantOut) + }) + } +} + +func Test_matchValue(t *testing.T) { + tests := []struct { + name string + input string + target string + modifier DeDupValueModifier + want bool + }{ + { + name: "ignoreCase match value", + input: "test", + target: "tEst", + modifier: ignoreCase, + want: 
true, + }, + { + name: "caseSensitive match value", + input: "tEst", + target: "tEst", + modifier: caseSensitive, + want: true, + }, + { + name: "fuzzyMatch match value", + input: "kitten", + target: "sitting", + modifier: fuzzyMatch, + want: true, + }, + { + name: "soundsLike match value", + input: "Robert", + target: "Rupert", + modifier: soundsLike, + want: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := matchValue(tt.modifier, tt.input, tt.target); got != tt.want { + t.Errorf("matchValue() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/server/compose/types/validated.go b/server/compose/types/validated.go index 3855404d8..5db2816b5 100644 --- a/server/compose/types/validated.go +++ b/server/compose/types/validated.go @@ -29,6 +29,13 @@ func (v *RecordValueErrorSet) IsValid() bool { return v == nil || len(v.Set) == 0 } +func (v *RecordValueErrorSet) Len() int { + if v == nil { + return 0 + } + return len(v.Set) +} + func (v *RecordValueErrorSet) Error() string { var no = 0 if v != nil { @@ -62,6 +69,20 @@ func (v *RecordValueErrorSet) HasKind(kind string) bool { return false } +func (v *RecordValueErrorSet) Merge(errs ...*RecordValueErrorSet) { + if v == nil { + return + } + + for _, e := range errs { + if e == nil || e.IsValid() { + continue + } + + v.Push(e.Set...) + } +} + // IsRecordValueErrorSet tests if given error is RecordValueErrorSet (or it wraps it) and it has errors // If not is not (or !IsValid), it return nil! 
func IsRecordValueErrorSet(err error) *RecordValueErrorSet { diff --git a/server/pkg/str/levenshtein.go b/server/pkg/str/levenshtein.go new file mode 100644 index 000000000..b8406e763 --- /dev/null +++ b/server/pkg/str/levenshtein.go @@ -0,0 +1,52 @@ +package str + +// write Levenshtein Distance search algorithm for strings +// https://en.wikipedia.org/wiki/Levenshtein_distance +func ToLevenshteinDistance(a, b string) int { + var ( + // length of a + la = len(a) + // length of b + lb = len(b) + // distance matrix + d = make([][]int, la+1) + ) + + // initialize distance matrix + for i := 0; i <= la; i++ { + d[i] = make([]int, lb+1) + d[i][0] = i + } + + for j := 0; j <= lb; j++ { + d[0][j] = j + } + + // calculate distance matrix + for i := 1; i <= la; i++ { + for j := 1; j <= lb; j++ { + if a[i-1] == b[j-1] { + d[i][j] = d[i-1][j-1] + } else { + // fix this min function + d[i][j] = min(d[i-1][j]+1, d[i][j-1]+1, d[i-1][j-1]+1) + } + } + } + + return d[la][lb] +} + +func min(a, b, c int) int { + if a < b { + if a < c { + return a + } + } + + if b < c { + return b + } + + return c +} diff --git a/server/pkg/str/levenshtein_test.go b/server/pkg/str/levenshtein_test.go new file mode 100644 index 000000000..1178d68c8 --- /dev/null +++ b/server/pkg/str/levenshtein_test.go @@ -0,0 +1,39 @@ +package str + +import ( + "testing" +) + +func TestLevenshteinDistance(t *testing.T) { + tests := []struct { + a string + b string + want int + }{ + {"", "hello", 5}, + {"hello", "", 5}, + {"hello", "hello", 0}, + {"ab", "aa", 1}, + {"ab", "ba", 2}, + {"ab", "aaa", 2}, + {"bbb", "a", 3}, + {"kitten", "sitting", 3}, + {"distance", "difference", 5}, + {"levenshtein", "frankenstein", 6}, + {"resume and cafe", "resumes and cafes", 2}, + {"a very long string that is meant to exceed", "another very long string that is meant to exceed", 6}, + // Testing acutes and umlauts + {"resumé and café", "resumés and cafés", 2}, + {"resume and cafe", "resumé and café", 4}, + {"Hafþór Júlíus Björnsson", 
"Hafþor Julius Bjornsson", 8}, + // Only 2 characters are less in the 2nd string + {"།་གམ་འས་པ་་མ།", "།་གམའས་པ་་མ", 6}, + } + for _, tt := range tests { + t.Run(tt.a, func(t *testing.T) { + if got := ToLevenshteinDistance(tt.a, tt.b); got != tt.want { + t.Errorf("LevenshteinDistance() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/server/pkg/str/soundex.go b/server/pkg/str/soundex.go new file mode 100644 index 000000000..6c11bf652 --- /dev/null +++ b/server/pkg/str/soundex.go @@ -0,0 +1,104 @@ +package str + +import ( + "strings" +) + +// ToSoundex takes a word and returns the soundex code for it. +// https://en.wikipedia.org/wiki/Soundex +// +// 1. Retain the first letter of the name and drop all other occurrences of a, e, i, o, u, y, h, w. +// 2. Replace consonants with digits as follows (after the first letter): +// b, f, p, v → 1 +// c, g, j, k, q, s, x, z → 2 +// d, t → 3 +// l → 4 +// m, n → 5 +// r → 6 +// 3. If two or more letters with the same number are adjacent in the original name (before step 1), +// only retain the first letter; also two letters with the same number separated +// by 'h' or 'w' are coded as a single number, whereas such letters separated by a vowel are coded twice. +// This rule also applies to the first letter. +// 4. Iterate the previous step until you have one letter and three numbers. +// If you have too few letters in your word that you can't assign three numbers, append with zeros +// until there are three numbers. If you have more than 3 letters, just retain the first 3 numbers. 
+func ToSoundex(s string) string { + var ( + // soundex code + code string + // last code + lastCode string + // last rune + lastRune rune + // last rune is vowel + lastRuneIsVowel bool + ) + + // retain the first letter of the name and drop all other occurrences of a, e, i, o, u, y, h, w + for _, r := range s { + if r == 'a' || r == 'e' || r == 'i' || r == 'o' || r == 'u' || r == 'y' || r == 'h' || r == 'w' { + continue + } + + code = string(r) + break + } + + // replace consonants with digits as follows (after the first letter) + for _, r := range s { + if r == 'a' || r == 'e' || r == 'i' || r == 'o' || r == 'u' || r == 'y' || r == 'h' || r == 'w' { + lastRuneIsVowel = true + continue + } + + if lastRuneIsVowel { + lastRuneIsVowel = false + lastCode = "" + } + + switch r { + case 'b', 'f', 'p', 'v': + lastCode = "1" + case 'c', 'g', 'j', 'k', 'q', 's', 'x', 'z': + lastCode = "2" + case 'd', 't': + lastCode = "3" + case 'l': + lastCode = "4" + case 'm', 'n': + lastCode = "5" + case 'r': + lastCode = "6" + } + + if lastCode != "" && lastCode != string(lastRune) { + code += lastCode + } + + lastRune = r + } + + // if two or more letters with the same number are adjacent in the original name (before step 1), + // only retain the first letter + // also two letters with the same number separated by 'h' or 'w' are coded as a single number, + // whereas such letters separated by a vowel are coded twice + // this rule also applies to the first letter + code = strings.ReplaceAll(code, "11", "1") + code = strings.ReplaceAll(code, "22", "2") + code = strings.ReplaceAll(code, "33", "3") + code = strings.ReplaceAll(code, "44", "4") + code = strings.ReplaceAll(code, "55", "5") + code = strings.ReplaceAll(code, "66", "6") + + // iterate the previous step until you have one letter and three numbers + // if you have too few letters in your word that you can't assign three numbers, + // append with zeros until there are three numbers + // if you have more than 3 letters, just 
retain the first 3 numbers + if len(code) < 4 { + code += strings.Repeat("0", 4-len(code)) + } else { + code = code[:4] + } + + return code +} diff --git a/server/pkg/str/soundex_test.go b/server/pkg/str/soundex_test.go new file mode 100644 index 000000000..05423b10e --- /dev/null +++ b/server/pkg/str/soundex_test.go @@ -0,0 +1,64 @@ +package str + +import ( + "testing" +) + +func Test_soundex(t *testing.T) { + tests := []struct { + name string + want string + }{ + { + "Robert", + "R163", + }, + { + "Rupert", + "R163", + }, + { + "Rubin", + "R150", + }, + { + "Ashcraft", + "A261", + }, + { + "Ashcroft", + "A261", + }, + { + "Tymczak", + "T522", + }, + { + "Pfister", + "P123", + }, + { + "AH KEY", + "A000", + }, + { + "The quick brown fox", + "T221", + }, + { + "h3110 w021d", + "3000", + }, + { + "1337", + "1000", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := ToSoundex(tt.name); got != tt.want { + t.Errorf("soundex() = %v, want %v", got, tt.want) + } + }) + } +} diff --git a/server/pkg/str/str.go b/server/pkg/str/str.go new file mode 100644 index 000000000..8a5efd327 --- /dev/null +++ b/server/pkg/str/str.go @@ -0,0 +1,31 @@ +package str + +import ( + "strings" +) + +const ( + // DefaultLevenshteinDistance is the default levenshtein distance + DefaultLevenshteinDistance = 3 + + CaseInSensitiveMatch = iota + CaseSensitiveMatch + LevenshteinDistance + Soundex +) + +// Match will match string as per given algorithm +func Match(str1, str2 string, algorithm int) bool { + switch algorithm { + case LevenshteinDistance: + return ToLevenshteinDistance(str1, str2) <= DefaultLevenshteinDistance + case Soundex: + return ToSoundex(str1) == ToSoundex(str2) + case CaseSensitiveMatch: + return strings.Compare(str1, str2) == 0 + case CaseInSensitiveMatch: + return strings.EqualFold(str1, str2) + default: + return false + } +} diff --git a/server/store/adapters/rdbms/upgrade_fixes.go b/server/store/adapters/rdbms/upgrade_fixes.go index 
6e130731b..c97d68612 100644 --- a/server/store/adapters/rdbms/upgrade_fixes.go +++ b/server/store/adapters/rdbms/upgrade_fixes.go @@ -42,6 +42,7 @@ var ( fix_2022_09_00_addRevisionOnComposeRecords, fix_2022_09_00_addMetaOnComposeRecords, fix_2022_09_00_addMissingNodeIdOnFederationMapping, + fix_2023_03_00_migrateComposeModuleConfigForRecordDeDup, } ) @@ -208,7 +209,7 @@ func fix_2022_09_00_migrateOldComposeRecordValues(ctx context.Context, s *Store) err = func() (err error) { query = fmt.Sprintf(recordsPerModule, mod.NamespaceID, mod.ID, sliceLastRecordID, recordSliceSize) - //println(query) + // println(query) rows, err = s.DB.QueryContext(ctx, query) if err != nil { return @@ -237,7 +238,7 @@ func fix_2022_09_00_migrateOldComposeRecordValues(ctx context.Context, s *Store) } query = fmt.Sprintf(recValuesPerModule, strings.Join(recordIDs, ",")) - //println(query) + // println(query) rows, err = s.DB.QueryContext(ctx, query) if err != nil { return @@ -429,6 +430,131 @@ func fix_2022_09_00_addMissingNodeIdOnFederationMapping(ctx context.Context, s * ) } +func fix_2023_03_00_migrateComposeModuleConfigForRecordDeDup(ctx context.Context, s *Store) (err error) { + type ( + oldRule struct { + Name string `json:"name"` + Strict bool `json:"strict"` + Attributes []string `json:"attributes"` + } + rules struct { + Rules []oldRule `json:"rules"` + } + ) + + var ( + log = s.log(ctx) + query string + aux []byte + rr rules + rows *sql.Rows + modules types.ModuleSet + ) + + _, err = s.DataDefiner.TableLookup(ctx, model.Module.Ident) + if err != nil { + if errors.IsNotFound(err) { + log.Debug("skipping module config recordDeDup migration: compose_module table not found") + return nil + } + return err + } + + const ( + moduleConfigRecordDeDup = ` + SELECT compose_module.config -> 'recordDeDup' AS recordDeDup + FROM compose_module + WHERE compose_module.id = %d` + ) + + modules, _, err = s.SearchComposeModules(ctx, types.ModuleFilter{}) + if err != nil { + return + } + + // 1. 
Check if module has recordDeDup rules + // 2. If yes, migrate them to new format + // 3. Save module + for _, m := range modules { + var ( + migratedRules types.DeDupRuleSet + ) + + if err = s.Tx(ctx, func(ctx context.Context, s store.Storer) (err error) { + log.Info("collecting module.config.recordDeDup for module", zap.Uint64("id", m.ID)) + + query = fmt.Sprintf(moduleConfigRecordDeDup, m.ID) + rows, err = s.(*Store).DB.QueryContext(ctx, query) + if err != nil { + return + } + + defer func() { + // assign error to return value... + err = rows.Close() + }() + + for rows.Next() { + if err = rows.Err(); err != nil { + log.Info("failed to scan rows to migrated module.config.recordDeDup for module", + zap.Uint64("id", m.ID)) + return + } + + err = rows.Scan(&aux) + if err != nil { + continue + } + + err = json.Unmarshal(aux, &rr) + if err != nil { + continue + } + } + + for _, r := range rr.Rules { + if len(r.Attributes) == 0 { + continue + } + + var rcc types.DeDupRuleConstraintSet + for _, atr := range r.Attributes { + if len(atr) == 0 { + continue + } + + rcc = append(rcc, &types.DeDupRuleConstraint{ + Attribute: atr, + Modifier: "ignore-case", + MultiValue: "equal", + }) + } + + migratedRules = append(migratedRules, &types.DeDupRule{ + Strict: r.Strict, + ConstraintSet: rcc, + }) + } + + if len(migratedRules) > 0 { + m.Config.RecordDeDup.Rules = migratedRules + + log.Info("saving migrated module.config.recordDeDup for module", zap.Uint64("id", m.ID)) + if err = s.UpdateComposeModule(ctx, m); err != nil { + log.Info("error saving migrated module.config.recordDeDup for module", zap.Uint64("id", m.ID)) + return + } + } + + return + }); err != nil { + continue + } + } + + return +} + func count(ctx context.Context, s *Store, table string, ee ...goqu.Expression) (count int) { db := s.DB.(goqu.SQLDatabase)