From 7a963768655e77fe17317b79030ed76e8bb53d72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toma=C5=BE=20Jerman?= Date: Thu, 7 Oct 2021 20:11:22 +0200 Subject: [PATCH] Properly handle join cases where local key is not unique --- pkg/report/step_join_strategies.go | 19 +++++- tests/reporter/join_dup_keys_test.go | 65 +++++++++++++++++++ .../testdata/join_dup_keys/data_model/aa.csv | 6 ++ .../testdata/join_dup_keys/data_model/bb.csv | 6 ++ .../join_dup_keys/data_model/datamodel.yaml | 42 ++++++++++++ .../testdata/join_dup_keys/report.json | 38 +++++++++++ 6 files changed, 173 insertions(+), 3 deletions(-) create mode 100644 tests/reporter/join_dup_keys_test.go create mode 100644 tests/reporter/testdata/join_dup_keys/data_model/aa.csv create mode 100644 tests/reporter/testdata/join_dup_keys/data_model/bb.csv create mode 100644 tests/reporter/testdata/join_dup_keys/data_model/datamodel.yaml create mode 100644 tests/reporter/testdata/join_dup_keys/report.json diff --git a/pkg/report/step_join_strategies.go b/pkg/report/step_join_strategies.go index a33c973a5..1ee2a0355 100644 --- a/pkg/report/step_join_strategies.go +++ b/pkg/report/step_join_strategies.go @@ -361,17 +361,30 @@ func (d *joinedDataset) sortFrameBuffers(localLoader, foreignLoader *frameLoadCt // Each foreign frame is on the same index as the corresponding local row. // This simplifies later algorithms and removes the need for additional // mapping structures. + // + // We need to use a slice of indexes as we don't pull duplicated data; we need + // to do this here. + bucketsN := make(map[string][]int) + // + // use maxI for an easier determination of the maximum index. + maxI := -1 for _, l := range localBuffer { l.WalkRows(func(i int, r FrameRow) error { - buckets[cast.ToString(r[localLoader.keyColIndex].Get())] = i + k := cast.ToString(r[localLoader.keyColIndex].Get()) + bucketsN[k] = append(bucketsN[k], i) + if i > maxI { + maxI = i + } return nil }) } // // Update foreign frame order based on reordered buckets - aux := make([]*Frame, len(foreignBuffer)) + aux := make([]*Frame, maxI+1) for _, f := range foreignBuffer { - aux[buckets[f.RefValue]] = f + for _, i := range bucketsN[f.RefValue] { + aux[i] = f + } } return localBuffer, aux diff --git a/tests/reporter/join_dup_keys_test.go b/tests/reporter/join_dup_keys_test.go new file mode 100644 index 000000000..ce95da096 --- /dev/null +++ b/tests/reporter/join_dup_keys_test.go @@ -0,0 +1,65 @@ +package reporter + +import ( + "testing" + + "github.com/cortezaproject/corteza-server/pkg/report" +) + +func Test_join_dup_keys(t *testing.T) { + var ( + ctx, h, s = setup(t) + m, _, dd = loadScenarioOwnDM(ctx, s, t, h) + ff []*report.Frame + + local *report.Frame + def = dd[0] + ) + + // // // PAGE 1 + ff = loadNoErr(ctx, h, m, def) + h.a.Len(ff, 3) + local = ff[0] + + // local + h.a.Equal(2, local.Size()) + h.a.NotNil(local.Paging) + h.a.NotNil(local.Paging.NextPage) + checkRows(h, local, + "a, 11", + "a, 12") + + h.a.Equal("a", ff[1].RefValue) + h.a.Equal("a", ff[2].RefValue) + + // // // PAGE 2 + def.Paging.PageCursor = local.Paging.NextPage + ff = loadNoErr(ctx, h, m, def) + h.a.Len(ff, 3) + local = ff[0] + + // local + h.a.Equal(2, local.Size()) + h.a.NotNil(local.Paging) + h.a.NotNil(local.Paging.NextPage) + checkRows(h, local, + "a, 13", + "b, 14") + + h.a.Equal("a", ff[1].RefValue) + h.a.Equal("b", ff[2].RefValue) + + // // // PAGE 1 + def.Paging.PageCursor = local.Paging.NextPage + ff = loadNoErr(ctx, h, m, def) + h.a.Len(ff, 2) + local = ff[0] + + // local + h.a.Equal(1, local.Size()) + h.a.Nil(local.Paging) + checkRows(h, local, + "c, 15") + + h.a.Equal("c", ff[1].RefValue) +} diff --git a/tests/reporter/testdata/join_dup_keys/data_model/aa.csv b/tests/reporter/testdata/join_dup_keys/data_model/aa.csv new file mode 100644 index 000000000..836404268 --- /dev/null +++ b/tests/reporter/testdata/join_dup_keys/data_model/aa.csv @@ -0,0 +1,6 @@ +id,pk,value +1,a,11 +2,a,12 +3,a,13 +4,b,14 +5,c,15 diff --git a/tests/reporter/testdata/join_dup_keys/data_model/bb.csv b/tests/reporter/testdata/join_dup_keys/data_model/bb.csv new file mode 100644 index 000000000..c5ac22026 --- /dev/null +++ b/tests/reporter/testdata/join_dup_keys/data_model/bb.csv @@ -0,0 +1,6 @@ +id,fk,value +1,a,11 +2,a,12 +3,b,13 +4,b,14 +5,c,15 diff --git a/tests/reporter/testdata/join_dup_keys/data_model/datamodel.yaml b/tests/reporter/testdata/join_dup_keys/data_model/datamodel.yaml new file mode 100644 index 000000000..a6d5f15a2 --- /dev/null +++ b/tests/reporter/testdata/join_dup_keys/data_model/datamodel.yaml @@ -0,0 +1,42 @@ +namespaces: + ns: + name: ns name + +modules: + aa: + records: + source: aa.csv + key: id + mapping: + id: / + pk: + field: pk + value: + field: value + + fields: + pk: + label: pk label + kind: String + value: + label: value label + kind: Number + + bb: + records: + source: bb.csv + key: id + mapping: + id: / + fk: + field: fk + value: + field: value + + fields: + fk: + label: fk label + kind: String + value: + label: value label + kind: Number diff --git a/tests/reporter/testdata/join_dup_keys/report.json b/tests/reporter/testdata/join_dup_keys/report.json new file mode 100644 index 000000000..ba9adc13c --- /dev/null +++ b/tests/reporter/testdata/join_dup_keys/report.json @@ -0,0 +1,38 @@ +{ + "handle": "testing_report", + "sources": [ + { "step": { "load": { + "name": "aa", + "source": "composeRecords", + "definition": { + "module": "aa", + "namespace": "ns" + } + }}}, + { "step": { "load": { + "name": "bb", + "source": "composeRecords", + "definition": { + "module": "bb", + "namespace": "ns" + } + }}}, + + { "step": { "join": { + "name": "joined", + + "localSource": "aa", + "localColumn": "pk", + "foreignSource": "bb", + "foreignColumn": "fk" + }}} + ], + + "frames": [{ + "name": "result", + "source": "joined", + "paging": { + "limit": 2 + } + }] +}