From 81b229c08a3006cf4ad6abe146c5922cac5dc441 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sat, 17 Feb 2018 15:18:19 +0100 Subject: [PATCH] Make new c_append_helper a bit more AMD friendly --- OpenCL/inc_common.cl | 134 +++++++++++++++++-------------------- OpenCL/inc_rp_optimized.cl | 10 +-- 2 files changed, 65 insertions(+), 79 deletions(-) diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index ce5978024..2d3c3b407 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -427,11 +427,10 @@ DECLSPEC void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset) c_append_helper_mini[offset & 0xf][3] }; - switch (offset / 16) - { - case 0: append_helper_1x4 (w0, 0x80808080, v); break; - case 1: append_helper_1x4 (w1, 0x80808080, v); break; - } + const u32 offset16 = offset / 16; + + append_helper_1x4 (w0, ((offset16 == 0) ? 0x80808080 : 0), v); + append_helper_1x4 (w1, ((offset16 == 1) ? 0x80808080 : 0), v); } DECLSPEC void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset) @@ -444,12 +443,11 @@ DECLSPEC void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 off c_append_helper_mini[offset & 0xf][3] }; - switch (offset / 16) - { - case 0: append_helper_1x4 (w0, 0x80808080, v); break; - case 1: append_helper_1x4 (w1, 0x80808080, v); break; - case 2: append_helper_1x4 (w2, 0x80808080, v); break; - } + const u32 offset16 = offset / 16; + + append_helper_1x4 (w0, ((offset16 == 0) ? 0x80808080 : 0), v); + append_helper_1x4 (w1, ((offset16 == 1) ? 0x80808080 : 0), v); + append_helper_1x4 (w2, ((offset16 == 2) ? 0x80808080 : 0), v); } DECLSPEC void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) @@ -462,13 +460,12 @@ DECLSPEC void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], c c_append_helper_mini[offset & 0xf][3] }; - switch (offset / 16) - { - case 0: append_helper_1x4 (w0, 0x80808080, v); break; - case 1: append_helper_1x4 (w1, 0x80808080, v); break; - case 2: append_helper_1x4 (w2, 0x80808080, v); break; - case 3: append_helper_1x4 (w3, 0x80808080, v); break; - } + const u32 offset16 = offset / 16; + + append_helper_1x4 (w0, ((offset16 == 0) ? 0x80808080 : 0), v); + append_helper_1x4 (w1, ((offset16 == 1) ? 0x80808080 : 0), v); + append_helper_1x4 (w2, ((offset16 == 2) ? 0x80808080 : 0), v); + append_helper_1x4 (w3, ((offset16 == 3) ? 0x80808080 : 0), v); } DECLSPEC void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) @@ -481,17 +478,16 @@ DECLSPEC void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u c_append_helper_mini[offset & 0xf][3] }; - switch (offset / 16) - { - case 0: append_helper_1x4 (w0, 0x80808080, v); break; - case 1: append_helper_1x4 (w1, 0x80808080, v); break; - case 2: append_helper_1x4 (w2, 0x80808080, v); break; - case 3: append_helper_1x4 (w3, 0x80808080, v); break; - case 4: append_helper_1x4 (w4, 0x80808080, v); break; - case 5: append_helper_1x4 (w5, 0x80808080, v); break; - case 6: append_helper_1x4 (w6, 0x80808080, v); break; - case 7: append_helper_1x4 (w7, 0x80808080, v); break; - } + const u32 offset16 = offset / 16; + + append_helper_1x4 (w0, ((offset16 == 0) ? 0x80808080 : 0), v); + append_helper_1x4 (w1, ((offset16 == 1) ? 0x80808080 : 0), v); + append_helper_1x4 (w2, ((offset16 == 2) ? 0x80808080 : 0), v); + append_helper_1x4 (w3, ((offset16 == 3) ? 0x80808080 : 0), v); + append_helper_1x4 (w4, ((offset16 == 4) ? 0x80808080 : 0), v); + append_helper_1x4 (w5, ((offset16 == 5) ? 0x80808080 : 0), v); + append_helper_1x4 (w6, ((offset16 == 6) ? 0x80808080 : 0), v); + append_helper_1x4 (w7, ((offset16 == 7) ? 0x80808080 : 0), v); } DECLSPEC void append_0x80_1x16 (u32x w[16], const u32 offset) @@ -504,17 +500,12 @@ DECLSPEC void append_0x80_1x16 (u32x w[16], const u32 offset) c_append_helper_mini[offset & 0xf][3] }; - switch (offset / 16) - { - case 0: append_helper_1x4 (w + 0, 0x80808080, v); break; - case 1: append_helper_1x4 (w + 4, 0x80808080, v); break; - case 2: append_helper_1x4 (w + 8, 0x80808080, v); break; - case 3: append_helper_1x4 (w + 12, 0x80808080, v); break; - case 4: append_helper_1x4 (w + 16, 0x80808080, v); break; - case 5: append_helper_1x4 (w + 20, 0x80808080, v); break; - case 6: append_helper_1x4 (w + 24, 0x80808080, v); break; - case 7: append_helper_1x4 (w + 28, 0x80808080, v); break; - } + const u32 offset16 = offset / 16; + + append_helper_1x4 (w + 0, ((offset16 == 0) ? 0x80808080 : 0), v); + append_helper_1x4 (w + 4, ((offset16 == 1) ? 0x80808080 : 0), v); + append_helper_1x4 (w + 8, ((offset16 == 2) ? 0x80808080 : 0), v); + append_helper_1x4 (w + 12, ((offset16 == 3) ? 0x80808080 : 0), v); } DECLSPEC void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) @@ -30981,11 +30972,10 @@ DECLSPEC void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) c_append_helper_mini[offset & 0xf][3] }; - switch (offset / 16) - { - case 0: append_helper_1x4_S (w0, 0x01010101, v); break; - case 1: append_helper_1x4_S (w1, 0x01010101, v); break; - } + const u32 offset16 = offset / 16; + + append_helper_1x4_S (w0, ((offset16 == 0) ? 0x01010101 : 0), v); + append_helper_1x4_S (w1, ((offset16 == 1) ? 0x01010101 : 0), v); } DECLSPEC void append_0x80_1x4_S (u32 w0[4], const u32 offset) @@ -31011,11 +31001,10 @@ DECLSPEC void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) c_append_helper_mini[offset & 0xf][3] }; - switch (offset / 16) - { - case 0: append_helper_1x4_S (w0, 0x80808080, v); break; - case 1: append_helper_1x4_S (w1, 0x80808080, v); break; - } + const u32 offset16 = offset / 16; + + append_helper_1x4_S (w0, ((offset16 == 0) ? 0x80808080 : 0), v); + append_helper_1x4_S (w1, ((offset16 == 1) ? 0x80808080 : 0), v); } DECLSPEC void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) @@ -31028,12 +31017,11 @@ DECLSPEC void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offs c_append_helper_mini[offset & 0xf][3] }; - switch (offset / 16) - { - case 0: append_helper_1x4_S (w0, 0x80808080, v); break; - case 1: append_helper_1x4_S (w1, 0x80808080, v); break; - case 2: append_helper_1x4_S (w2, 0x80808080, v); break; - } + const u32 offset16 = offset / 16; + + append_helper_1x4_S (w0, ((offset16 == 0) ? 0x80808080 : 0), v); + append_helper_1x4_S (w1, ((offset16 == 1) ? 0x80808080 : 0), v); + append_helper_1x4_S (w2, ((offset16 == 2) ? 0x80808080 : 0), v); } DECLSPEC void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) @@ -31046,13 +31034,12 @@ DECLSPEC void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], con c_append_helper_mini[offset & 0xf][3] }; - switch (offset / 16) - { - case 0: append_helper_1x4_S (w0, 0x80808080, v); break; - case 1: append_helper_1x4_S (w1, 0x80808080, v); break; - case 2: append_helper_1x4_S (w2, 0x80808080, v); break; - case 3: append_helper_1x4_S (w3, 0x80808080, v); break; - } + const u32 offset16 = offset / 16; + + append_helper_1x4_S (w0, ((offset16 == 0) ? 0x80808080 : 0), v); + append_helper_1x4_S (w1, ((offset16 == 1) ? 0x80808080 : 0), v); + append_helper_1x4_S (w2, ((offset16 == 2) ? 0x80808080 : 0), v); + append_helper_1x4_S (w3, ((offset16 == 3) ? 0x80808080 : 0), v); } DECLSPEC void append_0x80_8x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) @@ -31065,17 +31052,16 @@ DECLSPEC void append_0x80_8x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c_append_helper_mini[offset & 0xf][3] }; - switch (offset / 16) - { - case 0: append_helper_1x4_S (w0, 0x80808080, v); break; - case 1: append_helper_1x4_S (w1, 0x80808080, v); break; - case 2: append_helper_1x4_S (w2, 0x80808080, v); break; - case 3: append_helper_1x4_S (w3, 0x80808080, v); break; - case 4: append_helper_1x4_S (w4, 0x80808080, v); break; - case 5: append_helper_1x4_S (w5, 0x80808080, v); break; - case 6: append_helper_1x4_S (w6, 0x80808080, v); break; - case 7: append_helper_1x4_S (w7, 0x80808080, v); break; - } + const u32 offset16 = offset / 16; + + append_helper_1x4_S (w0, ((offset16 == 0) ? 0x80808080 : 0), v); + append_helper_1x4_S (w1, ((offset16 == 1) ? 0x80808080 : 0), v); + append_helper_1x4_S (w2, ((offset16 == 2) ? 0x80808080 : 0), v); + append_helper_1x4_S (w3, ((offset16 == 3) ? 0x80808080 : 0), v); + append_helper_1x4_S (w4, ((offset16 == 4) ? 0x80808080 : 0), v); + append_helper_1x4_S (w5, ((offset16 == 5) ? 0x80808080 : 0), v); + append_helper_1x4_S (w6, ((offset16 == 6) ? 0x80808080 : 0), v); + append_helper_1x4_S (w7, ((offset16 == 7) ? 0x80808080 : 0), v); } DECLSPEC void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4]) diff --git a/OpenCL/inc_rp_optimized.cl b/OpenCL/inc_rp_optimized.cl index c79009cf9..576fda1bc 100644 --- a/OpenCL/inc_rp_optimized.cl +++ b/OpenCL/inc_rp_optimized.cl @@ -766,11 +766,11 @@ void append_block1 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_r0 c_append_helper_mini[offset & 0xf][3] }; - switch (offset / 16) - { - case 0: append_helper_1x4 (buf0, tmp, v); break; - case 1: append_helper_1x4 (buf1, tmp, v); break; - } + const u32 offset16 = offset / 16; + + append_helper_1x4_S (buf0, ((offset16 == 0) ? tmp : 0), v); + append_helper_1x4_S (buf1, ((offset16 == 1) ? tmp : 0), v); + } void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 src_l0[4], const u32 src_l1[4], const u32 src_r0[4], const u32 src_r1[4])