From 4e0972ce3af5848cc4085604d8d722ea58c692b5 Mon Sep 17 00:00:00 2001 From: jsteube Date: Fri, 14 Jul 2017 13:24:40 +0200 Subject: [PATCH] Add xxx_update_vector_swap(), xxx_update_vector_utf16le_swap() and xxx_update_vector_utf16beN() for later use --- OpenCL/inc_common.cl | 114 ++++++++---- OpenCL/inc_hash_md4.cl | 162 ++++++++++++++++ OpenCL/inc_hash_md5.cl | 162 ++++++++++++++++ OpenCL/inc_hash_ripemd160.cl | 162 ++++++++++++++++ OpenCL/inc_hash_sha1.cl | 204 ++++++++++++++++++++ OpenCL/inc_hash_sha256.cl | 204 ++++++++++++++++++++ OpenCL/inc_hash_sha384.cl | 352 +++++++++++++++++++++++++++++++++++ OpenCL/inc_hash_sha512.cl | 352 +++++++++++++++++++++++++++++++++++ OpenCL/inc_hash_whirlpool.cl | 162 ++++++++++++++++ 9 files changed, 1842 insertions(+), 32 deletions(-) diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index d6e0501be..3e3c3ce91 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -1990,14 +1990,39 @@ inline void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4]) #endif #if defined IS_AMD || defined IS_GENERIC - out2[3] = ((in[3] >> 0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00); - out2[2] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00); - out2[1] = ((in[2] >> 0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00); - out2[0] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00); - out1[3] = ((in[1] >> 0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00); - out1[2] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00); - out1[1] = ((in[0] >> 0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00); - out1[0] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00); + out2[3] = ((in[3] >> 0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00); + out2[2] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00); + out2[1] = ((in[2] >> 0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00); + out2[0] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00); + out1[3] = ((in[1] >> 0) & 0xFF000000) | ((in[1] >> 8) & 
0x0000FF00); + out1[2] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00); + out1[1] = ((in[0] >> 0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00); + out1[0] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00); + #endif +} + +inline void make_utf16beN (const u32x in[4], u32x out1[4], u32x out2[4]) +{ + #ifdef IS_NV + out2[3] = __byte_perm (in[3], 0, 0x1707); + out2[2] = __byte_perm (in[3], 0, 0x3727); + out2[1] = __byte_perm (in[2], 0, 0x1707); + out2[0] = __byte_perm (in[2], 0, 0x3727); + out1[3] = __byte_perm (in[1], 0, 0x1707); + out1[2] = __byte_perm (in[1], 0, 0x3727); + out1[1] = __byte_perm (in[0], 0, 0x1707); + out1[0] = __byte_perm (in[0], 0, 0x3727); + #endif + + #if defined IS_AMD || defined IS_GENERIC + out2[3] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00); + out2[2] = ((in[3] >> 0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00); + out2[1] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00); + out2[0] = ((in[2] >> 0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00); + out1[3] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00); + out1[2] = ((in[1] >> 0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00); + out1[1] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00); + out1[0] = ((in[0] >> 0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00); #endif } @@ -2015,14 +2040,14 @@ inline void make_utf16le (const u32x in[4], u32x out1[4], u32x out2[4]) #endif #if defined IS_AMD || defined IS_GENERIC - out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF); - out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF); - out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF); - out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF); - out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF); - out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF); - out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF); - out1[0] = ((in[0] << 8) & 
0x00FF0000) | ((in[0] >> 0) & 0x000000FF); + out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF); + out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF); + out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF); + out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF); + out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF); + out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF); + out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF); + out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF); #endif } @@ -14914,14 +14939,39 @@ inline void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4]) #endif #if defined IS_AMD || defined IS_GENERIC - out2[3] = ((in[3] >> 0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00); - out2[2] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00); - out2[1] = ((in[2] >> 0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00); - out2[0] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00); - out1[3] = ((in[1] >> 0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00); - out1[2] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00); - out1[1] = ((in[0] >> 0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00); - out1[0] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00); + out2[3] = ((in[3] >> 0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00); + out2[2] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00); + out2[1] = ((in[2] >> 0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00); + out2[0] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00); + out1[3] = ((in[1] >> 0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00); + out1[2] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00); + out1[1] = ((in[0] >> 0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00); + out1[0] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00); + #endif +} + +inline void make_utf16beN_S (const u32 in[4], u32 out1[4], u32 out2[4]) 
+{ + #ifdef IS_NV + out2[3] = __byte_perm_S (in[3], 0, 0x1707); + out2[2] = __byte_perm_S (in[3], 0, 0x3727); + out2[1] = __byte_perm_S (in[2], 0, 0x1707); + out2[0] = __byte_perm_S (in[2], 0, 0x3727); + out1[3] = __byte_perm_S (in[1], 0, 0x1707); + out1[2] = __byte_perm_S (in[1], 0, 0x3727); + out1[1] = __byte_perm_S (in[0], 0, 0x1707); + out1[0] = __byte_perm_S (in[0], 0, 0x3727); + #endif + + #if defined IS_AMD || defined IS_GENERIC + out2[3] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00); + out2[2] = ((in[3] >> 0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00); + out2[1] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00); + out2[0] = ((in[2] >> 0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00); + out1[3] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00); + out1[2] = ((in[1] >> 0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00); + out1[1] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00); + out1[0] = ((in[0] >> 0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00); #endif } @@ -14939,14 +14989,14 @@ inline void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4]) #endif #if defined IS_AMD || defined IS_GENERIC - out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF); - out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF); - out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF); - out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF); - out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF); - out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF); - out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF); - out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF); + out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF); + out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF); + out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF); + out2[0] = ((in[2] << 8) & 0x00FF0000) | 
((in[2] >> 0) & 0x000000FF); + out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF); + out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF); + out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF); + out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF); #endif } diff --git a/OpenCL/inc_hash_md4.cl b/OpenCL/inc_hash_md4.cl index b6204e717..ed6b28dab 100644 --- a/OpenCL/inc_hash_md4.cl +++ b/OpenCL/inc_hash_md4.cl @@ -1339,6 +1339,92 @@ void md4_update_vector (md4_ctx_vector_t *ctx, const u32x *w, const int len) md4_update_vector_64 (ctx, w0, w1, w2, w3, len - pos1); } +void md4_update_vector_swap (md4_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + md4_update_vector_64 (ctx, w0, w1, w2, w3, 64); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 
11];
+  w3[0] = w[pos4 + 12];
+  w3[1] = w[pos4 + 13];
+  w3[2] = w[pos4 + 14];
+  w3[3] = w[pos4 + 15];
+
+  w0[0] = swap32 (w0[0]);
+  w0[1] = swap32 (w0[1]);
+  w0[2] = swap32 (w0[2]);
+  w0[3] = swap32 (w0[3]);
+  w1[0] = swap32 (w1[0]);
+  w1[1] = swap32 (w1[1]);
+  w1[2] = swap32 (w1[2]);
+  w1[3] = swap32 (w1[3]);
+  w2[0] = swap32 (w2[0]);
+  w2[1] = swap32 (w2[1]);
+  w2[2] = swap32 (w2[2]);
+  w2[3] = swap32 (w2[3]);
+  w3[0] = swap32 (w3[0]);
+  w3[1] = swap32 (w3[1]);
+  w3[2] = swap32 (w3[2]);
+  w3[3] = swap32 (w3[3]);
+
+  md4_update_vector_64 (ctx, w0, w1, w2, w3, len - pos1);
+}
+
 void md4_update_vector_utf16le (md4_ctx_vector_t *ctx, const u32x *w, const int len)
 {
   u32x w0[4];
@@ -1381,6 +1467,82 @@ void md4_update_vector_utf16le (md4_ctx_vector_t *ctx, const u32x *w, const int
   md4_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2);
 }
 
+void md4_update_vector_utf16le_swap (md4_ctx_vector_t *ctx, const u32x *w, const int len)
+{ /* expands w with make_utf16le, swap32()s every word, then feeds the vector MD4 update; len is the input byte count before expansion */
+  u32x w0[4];
+  u32x w1[4];
+  u32x w2[4];
+  u32x w3[4];
+
+  int pos1;
+  int pos4;
+
+  for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8) /* 32 input bytes (8 words) become one 64-byte block */
+  {
+    w0[0] = w[pos4 + 0];
+    w0[1] = w[pos4 + 1];
+    w0[2] = w[pos4 + 2];
+    w0[3] = w[pos4 + 3];
+    w1[0] = w[pos4 + 4];
+    w1[1] = w[pos4 + 5];
+    w1[2] = w[pos4 + 6];
+    w1[3] = w[pos4 + 7];
+
+    make_utf16le (w1, w2, w3); /* expand w1 first: the next call overwrites w1 */
+    make_utf16le (w0, w0, w1);
+
+    w0[0] = swap32 (w0[0]);
+    w0[1] = swap32 (w0[1]);
+    w0[2] = swap32 (w0[2]);
+    w0[3] = swap32 (w0[3]);
+    w1[0] = swap32 (w1[0]);
+    w1[1] = swap32 (w1[1]);
+    w1[2] = swap32 (w1[2]);
+    w1[3] = swap32 (w1[3]);
+    w2[0] = swap32 (w2[0]);
+    w2[1] = swap32 (w2[1]);
+    w2[2] = swap32 (w2[2]);
+    w2[3] = swap32 (w2[3]);
+    w3[0] = swap32 (w3[0]);
+    w3[1] = swap32 (w3[1]);
+    w3[2] = swap32 (w3[2]);
+    w3[3] = swap32 (w3[3]);
+
+    md4_update_vector_64 (ctx, w0, w1, w2, w3, 32 * 2); /* ctx and w0..w3 are vector (u32x) types: the _vector_ update is required */
+  }
+
+  w0[0] = w[pos4 + 0];
+  w0[1] = w[pos4 + 1];
+  w0[2] = w[pos4 + 2];
+  w0[3] = w[pos4 + 3];
+  w1[0] = w[pos4 + 4];
+  w1[1] = w[pos4 + 5];
+  w1[2] = w[pos4 + 6];
+  w1[3] = w[pos4 + 7];
+
+  make_utf16le (w1, w2, w3); /* expand w1 first: the next call overwrites w1 */
+  make_utf16le (w0, w0, w1);
+
+  w0[0] = swap32 (w0[0]);
+  w0[1] = swap32 (w0[1]);
+  w0[2] = swap32 (w0[2]);
+  w0[3] = swap32 (w0[3]);
+  w1[0] = swap32 (w1[0]);
+  w1[1] = swap32 (w1[1]);
+  w1[2] = swap32 (w1[2]);
+  w1[3] = swap32 (w1[3]);
+  w2[0] = swap32 (w2[0]);
+  w2[1] = swap32 (w2[1]);
+  w2[2] = swap32 (w2[2]);
+  w2[3] = swap32 (w2[3]);
+  w3[0] = swap32 (w3[0]);
+  w3[1] = swap32 (w3[1]);
+  w3[2] = swap32 (w3[2]);
+  w3[3] = swap32 (w3[3]);
+
+  md4_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2); /* remaining input bytes, doubled by the expansion */
+}
+
 void md4_final_vector (md4_ctx_vector_t *ctx)
 {
   const int pos = ctx->len & 63;
diff --git a/OpenCL/inc_hash_md5.cl b/OpenCL/inc_hash_md5.cl
index 692c4a88d..8ac967281 100644
--- a/OpenCL/inc_hash_md5.cl
+++ b/OpenCL/inc_hash_md5.cl
@@ -1407,6 +1407,92 @@ void md5_update_vector (md5_ctx_vector_t *ctx, const u32x *w, const int len)
   md5_update_vector_64 (ctx, w0, w1, w2, w3, len - pos1);
 }
 
+void md5_update_vector_swap (md5_ctx_vector_t *ctx, const u32x *w, const int len)
+{
+  u32x w0[4];
+  u32x w1[4];
+  u32x w2[4];
+  u32x w3[4];
+
+  int pos1;
+  int pos4;
+
+  for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16)
+  {
+    w0[0] = w[pos4 + 0];
+    w0[1] = w[pos4 + 1];
+    w0[2] = w[pos4 + 2];
+    w0[3] = w[pos4 + 3];
+    w1[0] = w[pos4 + 4];
+    w1[1] = w[pos4 + 5];
+    w1[2] = w[pos4 + 6];
+    w1[3] = w[pos4 + 7];
+    w2[0] = w[pos4 + 8];
+    w2[1] = w[pos4 + 9];
+    w2[2] = w[pos4 + 10];
+    w2[3] = w[pos4 + 11];
+    w3[0] = w[pos4 + 12];
+    w3[1] = w[pos4 + 13];
+    w3[2] = w[pos4 + 14];
+    w3[3] = w[pos4 + 15];
+
+    w0[0] = swap32 (w0[0]);
+    w0[1] = swap32 (w0[1]);
+    w0[2] = swap32 (w0[2]);
+    w0[3] = swap32 (w0[3]);
+    w1[0] = swap32 (w1[0]);
+    w1[1] = swap32 (w1[1]);
+    w1[2] = swap32 (w1[2]);
+    w1[3] = swap32 (w1[3]);
+    w2[0] = swap32 (w2[0]);
+    w2[1] = swap32 (w2[1]);
+    w2[2] = swap32 (w2[2]);
+    w2[3] = swap32 (w2[3]);
+    w3[0] = swap32 (w3[0]);
+    w3[1] = swap32 (w3[1]);
+    w3[2] = swap32 (w3[2]);
+    w3[3] = swap32 (w3[3]);
+
+    md5_update_vector_64 (ctx,
w0, w1, w2, w3, 64);
+  }
+
+  w0[0] = w[pos4 + 0];
+  w0[1] = w[pos4 + 1];
+  w0[2] = w[pos4 + 2];
+  w0[3] = w[pos4 + 3];
+  w1[0] = w[pos4 + 4];
+  w1[1] = w[pos4 + 5];
+  w1[2] = w[pos4 + 6];
+  w1[3] = w[pos4 + 7];
+  w2[0] = w[pos4 + 8];
+  w2[1] = w[pos4 + 9];
+  w2[2] = w[pos4 + 10];
+  w2[3] = w[pos4 + 11];
+  w3[0] = w[pos4 + 12];
+  w3[1] = w[pos4 + 13];
+  w3[2] = w[pos4 + 14];
+  w3[3] = w[pos4 + 15];
+
+  w0[0] = swap32 (w0[0]);
+  w0[1] = swap32 (w0[1]);
+  w0[2] = swap32 (w0[2]);
+  w0[3] = swap32 (w0[3]);
+  w1[0] = swap32 (w1[0]);
+  w1[1] = swap32 (w1[1]);
+  w1[2] = swap32 (w1[2]);
+  w1[3] = swap32 (w1[3]);
+  w2[0] = swap32 (w2[0]);
+  w2[1] = swap32 (w2[1]);
+  w2[2] = swap32 (w2[2]);
+  w2[3] = swap32 (w2[3]);
+  w3[0] = swap32 (w3[0]);
+  w3[1] = swap32 (w3[1]);
+  w3[2] = swap32 (w3[2]);
+  w3[3] = swap32 (w3[3]);
+
+  md5_update_vector_64 (ctx, w0, w1, w2, w3, len - pos1);
+}
+
 void md5_update_vector_utf16le (md5_ctx_vector_t *ctx, const u32x *w, const int len)
 {
   u32x w0[4];
@@ -1449,6 +1535,82 @@ void md5_update_vector_utf16le (md5_ctx_vector_t *ctx, const u32x *w, const int
   md5_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2);
 }
 
+void md5_update_vector_utf16le_swap (md5_ctx_vector_t *ctx, const u32x *w, const int len)
+{ /* expands w with make_utf16le, swap32()s every word, then feeds the vector MD5 update; len is the input byte count before expansion */
+  u32x w0[4];
+  u32x w1[4];
+  u32x w2[4];
+  u32x w3[4];
+
+  int pos1;
+  int pos4;
+
+  for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8) /* 32 input bytes (8 words) become one 64-byte block */
+  {
+    w0[0] = w[pos4 + 0];
+    w0[1] = w[pos4 + 1];
+    w0[2] = w[pos4 + 2];
+    w0[3] = w[pos4 + 3];
+    w1[0] = w[pos4 + 4];
+    w1[1] = w[pos4 + 5];
+    w1[2] = w[pos4 + 6];
+    w1[3] = w[pos4 + 7];
+
+    make_utf16le (w1, w2, w3); /* expand w1 first: the next call overwrites w1 */
+    make_utf16le (w0, w0, w1);
+
+    w0[0] = swap32 (w0[0]);
+    w0[1] = swap32 (w0[1]);
+    w0[2] = swap32 (w0[2]);
+    w0[3] = swap32 (w0[3]);
+    w1[0] = swap32 (w1[0]);
+    w1[1] = swap32 (w1[1]);
+    w1[2] = swap32 (w1[2]);
+    w1[3] = swap32 (w1[3]);
+    w2[0] = swap32 (w2[0]);
+    w2[1] = swap32 (w2[1]);
+    w2[2] = swap32 (w2[2]);
+    w2[3] = swap32 (w2[3]);
+    w3[0] = swap32 (w3[0]);
+    w3[1] = swap32 (w3[1]);
+    w3[2] = swap32 (w3[2]);
+    w3[3] = swap32 (w3[3]);
+
+    md5_update_vector_64 (ctx, w0, w1, w2, w3, 32 * 2); /* ctx and w0..w3 are vector (u32x) types: the _vector_ update is required */
+  }
+
+  w0[0] = w[pos4 + 0];
+  w0[1] = w[pos4 + 1];
+  w0[2] = w[pos4 + 2];
+  w0[3] = w[pos4 + 3];
+  w1[0] = w[pos4 + 4];
+  w1[1] = w[pos4 + 5];
+  w1[2] = w[pos4 + 6];
+  w1[3] = w[pos4 + 7];
+
+  make_utf16le (w1, w2, w3); /* expand w1 first: the next call overwrites w1 */
+  make_utf16le (w0, w0, w1);
+
+  w0[0] = swap32 (w0[0]);
+  w0[1] = swap32 (w0[1]);
+  w0[2] = swap32 (w0[2]);
+  w0[3] = swap32 (w0[3]);
+  w1[0] = swap32 (w1[0]);
+  w1[1] = swap32 (w1[1]);
+  w1[2] = swap32 (w1[2]);
+  w1[3] = swap32 (w1[3]);
+  w2[0] = swap32 (w2[0]);
+  w2[1] = swap32 (w2[1]);
+  w2[2] = swap32 (w2[2]);
+  w2[3] = swap32 (w2[3]);
+  w3[0] = swap32 (w3[0]);
+  w3[1] = swap32 (w3[1]);
+  w3[2] = swap32 (w3[2]);
+  w3[3] = swap32 (w3[3]);
+
+  md5_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2); /* remaining input bytes, doubled by the expansion */
+}
+
 void md5_final_vector (md5_ctx_vector_t *ctx)
 {
   const int pos = ctx->len & 63;
diff --git a/OpenCL/inc_hash_ripemd160.cl b/OpenCL/inc_hash_ripemd160.cl
index 91ab5e42c..8e3d738da 100644
--- a/OpenCL/inc_hash_ripemd160.cl
+++ b/OpenCL/inc_hash_ripemd160.cl
@@ -1608,6 +1608,92 @@ void ripemd160_update_vector (ripemd160_ctx_vector_t *ctx, const u32x *w, const
   ripemd160_update_vector_64 (ctx, w0, w1, w2, w3, len - pos1);
 }
 
+void ripemd160_update_vector_swap (ripemd160_ctx_vector_t *ctx, const u32x *w, const int len)
+{
+  u32x w0[4];
+  u32x w1[4];
+  u32x w2[4];
+  u32x w3[4];
+
+  int pos1;
+  int pos4;
+
+  for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16)
+  {
+    w0[0] = w[pos4 + 0];
+    w0[1] = w[pos4 + 1];
+    w0[2] = w[pos4 + 2];
+    w0[3] = w[pos4 + 3];
+    w1[0] = w[pos4 + 4];
+    w1[1] = w[pos4 + 5];
+    w1[2] = w[pos4 + 6];
+    w1[3] = w[pos4 + 7];
+    w2[0] = w[pos4 + 8];
+    w2[1] = w[pos4 + 9];
+    w2[2] = w[pos4 + 10];
+    w2[3] = w[pos4 + 11];
+    w3[0] = w[pos4 + 12];
+    w3[1] = w[pos4 + 13];
+    w3[2] = w[pos4 + 14];
+    w3[3] = w[pos4 + 15];
+
+    w0[0] = swap32 (w0[0]);
+    w0[1] = swap32 (w0[1]);
+    w0[2] = swap32 (w0[2]);
+    w0[3] = swap32 (w0[3]);
+ w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + ripemd160_update_vector_64 (ctx, w0, w1, w2, w3, 64); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + ripemd160_update_vector_64 (ctx, w0, w1, w2, w3, len - pos1); +} + void ripemd160_update_vector_utf16le (ripemd160_ctx_vector_t *ctx, const u32x *w, const int len) { u32x w0[4]; @@ -1650,6 +1736,82 @@ void ripemd160_update_vector_utf16le (ripemd160_ctx_vector_t *ctx, const u32x *w ripemd160_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2); } +void ripemd160_update_vector_utf16le_swap (ripemd160_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16le (w1, w2, w3); 
+ make_utf16le (w0, w0, w1); + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + ripemd160_update_vector_64 (ctx, w0, w1, w2, w3, 32 * 2); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16le (w1, w2, w3); + make_utf16le (w0, w0, w1); + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + ripemd160_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2); +} + void ripemd160_final_vector (ripemd160_ctx_vector_t *ctx) { const int pos = ctx->len & 63; diff --git a/OpenCL/inc_hash_sha1.cl b/OpenCL/inc_hash_sha1.cl index 63f41ec0f..e385abd52 100644 --- a/OpenCL/inc_hash_sha1.cl +++ b/OpenCL/inc_hash_sha1.cl @@ -1477,6 +1477,92 @@ void sha1_update_vector (sha1_ctx_vector_t *ctx, const u32x *w, const int len) sha1_update_vector_64 (ctx, w0, w1, w2, w3, len - pos1); } +void sha1_update_vector_swap (sha1_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; 
+ w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + sha1_update_vector_64 (ctx, w0, w1, w2, w3, 64); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + sha1_update_vector_64 (ctx, w0, w1, w2, w3, len - pos1); +} + void sha1_update_vector_utf16le (sha1_ctx_vector_t *ctx, const u32x *w, const int len) { u32x w0[4]; @@ -1519,6 +1605,124 @@ void sha1_update_vector_utf16le (sha1_ctx_vector_t *ctx, const u32x *w, const in sha1_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2); } +void sha1_update_vector_utf16le_swap (sha1_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + 
u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16le (w1, w2, w3); + make_utf16le (w0, w0, w1); + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + sha1_update_vector_64 (ctx, w0, w1, w2, w3, 32 * 2); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16le (w1, w2, w3); + make_utf16le (w0, w0, w1); + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + sha1_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2); +} + +void sha1_update_vector_utf16beN (sha1_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 
+ 6]; + w1[3] = w[pos4 + 7]; + + make_utf16beN (w1, w2, w3); + make_utf16beN (w0, w0, w1); + + sha1_update_vector_64 (ctx, w0, w1, w2, w3, 32 * 2); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16beN (w1, w2, w3); + make_utf16beN (w0, w0, w1); + + sha1_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2); +} + void sha1_final_vector (sha1_ctx_vector_t *ctx) { int pos = ctx->len & 63; diff --git a/OpenCL/inc_hash_sha256.cl b/OpenCL/inc_hash_sha256.cl index 9f29c9289..089f0c971 100644 --- a/OpenCL/inc_hash_sha256.cl +++ b/OpenCL/inc_hash_sha256.cl @@ -1430,6 +1430,92 @@ void sha256_update_vector (sha256_ctx_vector_t *ctx, const u32x *w, const int le sha256_update_vector_64 (ctx, w0, w1, w2, w3, len - pos1); } +void sha256_update_vector_swap (sha256_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + sha256_update_vector_64 (ctx, w0, w1, w2, w3, 64); + } + + w0[0] = w[pos4 + 
0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + sha256_update_vector_64 (ctx, w0, w1, w2, w3, len - pos1); +} + void sha256_update_vector_utf16le (sha256_ctx_vector_t *ctx, const u32x *w, const int len) { u32x w0[4]; @@ -1472,6 +1558,124 @@ void sha256_update_vector_utf16le (sha256_ctx_vector_t *ctx, const u32x *w, cons sha256_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2); } +void sha256_update_vector_utf16le_swap (sha256_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16le (w1, w2, w3); + make_utf16le (w0, w0, w1); + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 
(w3[2]); + w3[3] = swap32 (w3[3]); + + sha256_update_vector_64 (ctx, w0, w1, w2, w3, 32 * 2); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16le (w1, w2, w3); + make_utf16le (w0, w0, w1); + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + sha256_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2); +} + +void sha256_update_vector_utf16beN (sha256_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16beN (w1, w2, w3); + make_utf16beN (w0, w0, w1); + + sha256_update_vector_64 (ctx, w0, w1, w2, w3, 32 * 2); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16beN (w1, w2, w3); + make_utf16beN (w0, w0, w1); + + sha256_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2); +} + void sha256_final_vector (sha256_ctx_vector_t *ctx) { int pos = ctx->len & 63; diff --git a/OpenCL/inc_hash_sha384.cl b/OpenCL/inc_hash_sha384.cl index 6ec730aa1..13ac34d0a 100644 --- a/OpenCL/inc_hash_sha384.cl +++ b/OpenCL/inc_hash_sha384.cl @@ -2209,6 +2209,160 @@ void 
sha384_update_vector (sha384_ctx_vector_t *ctx, const u32x *w, const int le sha384_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - pos1); } +void sha384_update_vector_swap (sha384_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + u32x w4[4]; + u32x w5[4]; + u32x w6[4]; + u32x w7[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 128; pos1 += 128, pos4 += 32) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + w4[0] = swap32 (w4[0]); + w4[1] = swap32 (w4[1]); + w4[2] = swap32 (w4[2]); + w4[3] = swap32 (w4[3]); + w5[0] = swap32 (w5[0]); + w5[1] = swap32 (w5[1]); + w5[2] = swap32 (w5[2]); + w5[3] = swap32 (w5[3]); + w6[0] = swap32 (w6[0]); + w6[1] = swap32 (w6[1]); + w6[2] = swap32 (w6[2]); + w6[3] = swap32 (w6[3]); + w7[0] = swap32 (w7[0]); + w7[1] = swap32 (w7[1]); 
+ w7[2] = swap32 (w7[2]); + w7[3] = swap32 (w7[3]); + + sha384_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 128); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + w4[0] = swap32 (w4[0]); + w4[1] = swap32 (w4[1]); + w4[2] = swap32 (w4[2]); + w4[3] = swap32 (w4[3]); + w5[0] = swap32 (w5[0]); + w5[1] = swap32 (w5[1]); + w5[2] = swap32 (w5[2]); + w5[3] = swap32 (w5[3]); + w6[0] = swap32 (w6[0]); + w6[1] = swap32 (w6[1]); + w6[2] = swap32 (w6[2]); + w6[3] = swap32 (w6[3]); + w7[0] = swap32 (w7[0]); + w7[1] = swap32 (w7[1]); + w7[2] = swap32 (w7[2]); + w7[3] = swap32 (w7[3]); + + sha384_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - pos1); +} + void sha384_update_vector_utf16le (sha384_ctx_vector_t *ctx, const u32x *w, const int len) { u32x w0[4]; @@ -2275,6 +2429,204 @@ void sha384_update_vector_utf16le (sha384_ctx_vector_t *ctx, 
const u32x *w, cons sha384_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, (len - pos1) * 2); } +void sha384_update_vector_utf16le_swap (sha384_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + u32x w4[4]; + u32x w5[4]; + u32x w6[4]; + u32x w7[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + make_utf16le (w3, w6, w7); + make_utf16le (w2, w4, w5); + make_utf16le (w1, w2, w3); + make_utf16le (w0, w0, w1); + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + w4[0] = swap32 (w4[0]); + w4[1] = swap32 (w4[1]); + w4[2] = swap32 (w4[2]); + w4[3] = swap32 (w4[3]); + w5[0] = swap32 (w5[0]); + w5[1] = swap32 (w5[1]); + w5[2] = swap32 (w5[2]); + w5[3] = swap32 (w5[3]); + w6[0] = swap32 (w6[0]); + w6[1] = swap32 (w6[1]); + w6[2] = swap32 (w6[2]); + w6[3] = swap32 (w6[3]); + w7[0] = swap32 (w7[0]); + w7[1] = swap32 (w7[1]); + w7[2] = swap32 (w7[2]); + w7[3] = swap32 (w7[3]); + + sha384_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 64 * 2); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + 
w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + make_utf16le (w3, w6, w7); + make_utf16le (w2, w4, w5); + make_utf16le (w1, w2, w3); + make_utf16le (w0, w0, w1); + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + w4[0] = swap32 (w4[0]); + w4[1] = swap32 (w4[1]); + w4[2] = swap32 (w4[2]); + w4[3] = swap32 (w4[3]); + w5[0] = swap32 (w5[0]); + w5[1] = swap32 (w5[1]); + w5[2] = swap32 (w5[2]); + w5[3] = swap32 (w5[3]); + w6[0] = swap32 (w6[0]); + w6[1] = swap32 (w6[1]); + w6[2] = swap32 (w6[2]); + w6[3] = swap32 (w6[3]); + w7[0] = swap32 (w7[0]); + w7[1] = swap32 (w7[1]); + w7[2] = swap32 (w7[2]); + w7[3] = swap32 (w7[3]); + + sha384_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, (len - pos1) * 2); +} + +void sha384_update_vector_utf16beN (sha384_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + u32x w4[4]; + u32x w5[4]; + u32x w6[4]; + u32x w7[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + make_utf16beN (w3, w6, w7); + make_utf16beN 
(w2, w4, w5); + make_utf16beN (w1, w2, w3); + make_utf16beN (w0, w0, w1); + + sha384_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 64 * 2); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + make_utf16beN (w3, w6, w7); + make_utf16beN (w2, w4, w5); + make_utf16beN (w1, w2, w3); + make_utf16beN (w0, w0, w1); + + sha384_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, (len - pos1) * 2); +} + void sha384_final_vector (sha384_ctx_vector_t *ctx) { int pos = ctx->len & 127; diff --git a/OpenCL/inc_hash_sha512.cl b/OpenCL/inc_hash_sha512.cl index 603f42ee9..3d9a19dd3 100644 --- a/OpenCL/inc_hash_sha512.cl +++ b/OpenCL/inc_hash_sha512.cl @@ -2209,6 +2209,160 @@ void sha512_update_vector (sha512_ctx_vector_t *ctx, const u32x *w, const int le sha512_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - pos1); } +void sha512_update_vector_swap (sha512_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + u32x w4[4]; + u32x w5[4]; + u32x w6[4]; + u32x w7[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 128; pos1 += 128, pos4 += 32) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 
+ 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + w4[0] = swap32 (w4[0]); + w4[1] = swap32 (w4[1]); + w4[2] = swap32 (w4[2]); + w4[3] = swap32 (w4[3]); + w5[0] = swap32 (w5[0]); + w5[1] = swap32 (w5[1]); + w5[2] = swap32 (w5[2]); + w5[3] = swap32 (w5[3]); + w6[0] = swap32 (w6[0]); + w6[1] = swap32 (w6[1]); + w6[2] = swap32 (w6[2]); + w6[3] = swap32 (w6[3]); + w7[0] = swap32 (w7[0]); + w7[1] = swap32 (w7[1]); + w7[2] = swap32 (w7[2]); + w7[3] = swap32 (w7[3]); + + sha512_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 128); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + w4[0] = w[pos4 + 16]; + w4[1] = w[pos4 + 17]; + w4[2] = w[pos4 + 18]; + w4[3] = w[pos4 + 19]; + w5[0] = w[pos4 + 20]; + w5[1] = w[pos4 + 21]; + w5[2] = w[pos4 + 22]; + w5[3] = w[pos4 + 23]; + w6[0] = w[pos4 + 24]; + w6[1] = w[pos4 + 25]; + w6[2] = w[pos4 + 26]; + w6[3] = w[pos4 + 27]; + w7[0] = w[pos4 + 28]; + w7[1] = w[pos4 + 29]; + w7[2] = w[pos4 + 30]; + w7[3] = w[pos4 + 31]; + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + 
w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + w4[0] = swap32 (w4[0]); + w4[1] = swap32 (w4[1]); + w4[2] = swap32 (w4[2]); + w4[3] = swap32 (w4[3]); + w5[0] = swap32 (w5[0]); + w5[1] = swap32 (w5[1]); + w5[2] = swap32 (w5[2]); + w5[3] = swap32 (w5[3]); + w6[0] = swap32 (w6[0]); + w6[1] = swap32 (w6[1]); + w6[2] = swap32 (w6[2]); + w6[3] = swap32 (w6[3]); + w7[0] = swap32 (w7[0]); + w7[1] = swap32 (w7[1]); + w7[2] = swap32 (w7[2]); + w7[3] = swap32 (w7[3]); + + sha512_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, len - pos1); +} + void sha512_update_vector_utf16le (sha512_ctx_vector_t *ctx, const u32x *w, const int len) { u32x w0[4]; @@ -2275,6 +2429,204 @@ void sha512_update_vector_utf16le (sha512_ctx_vector_t *ctx, const u32x *w, cons sha512_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, (len - pos1) * 2); } +void sha512_update_vector_utf16le_swap (sha512_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + u32x w4[4]; + u32x w5[4]; + u32x w6[4]; + u32x w7[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + make_utf16le (w3, w6, w7); + make_utf16le (w2, w4, w5); + make_utf16le (w1, w2, w3); + make_utf16le (w0, w0, w1); + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + 
w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + w4[0] = swap32 (w4[0]); + w4[1] = swap32 (w4[1]); + w4[2] = swap32 (w4[2]); + w4[3] = swap32 (w4[3]); + w5[0] = swap32 (w5[0]); + w5[1] = swap32 (w5[1]); + w5[2] = swap32 (w5[2]); + w5[3] = swap32 (w5[3]); + w6[0] = swap32 (w6[0]); + w6[1] = swap32 (w6[1]); + w6[2] = swap32 (w6[2]); + w6[3] = swap32 (w6[3]); + w7[0] = swap32 (w7[0]); + w7[1] = swap32 (w7[1]); + w7[2] = swap32 (w7[2]); + w7[3] = swap32 (w7[3]); + + sha512_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 64 * 2); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + make_utf16le (w3, w6, w7); + make_utf16le (w2, w4, w5); + make_utf16le (w1, w2, w3); + make_utf16le (w0, w0, w1); + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + w4[0] = swap32 (w4[0]); + w4[1] = swap32 (w4[1]); + w4[2] = swap32 (w4[2]); + w4[3] = swap32 (w4[3]); + w5[0] = swap32 (w5[0]); + w5[1] = swap32 (w5[1]); + w5[2] = swap32 (w5[2]); + w5[3] = swap32 (w5[3]); + w6[0] = 
swap32 (w6[0]); + w6[1] = swap32 (w6[1]); + w6[2] = swap32 (w6[2]); + w6[3] = swap32 (w6[3]); + w7[0] = swap32 (w7[0]); + w7[1] = swap32 (w7[1]); + w7[2] = swap32 (w7[2]); + w7[3] = swap32 (w7[3]); + + sha512_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, (len - pos1) * 2); +} + +void sha512_update_vector_utf16beN (sha512_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + u32x w4[4]; + u32x w5[4]; + u32x w6[4]; + u32x w7[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + make_utf16beN (w3, w6, w7); + make_utf16beN (w2, w4, w5); + make_utf16beN (w1, w2, w3); + make_utf16beN (w0, w0, w1); + + sha512_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, 64 * 2); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + make_utf16beN (w3, w6, w7); + make_utf16beN (w2, w4, w5); + make_utf16beN (w1, w2, w3); + make_utf16beN (w0, w0, w1); + + sha512_update_vector_128 (ctx, w0, w1, w2, w3, w4, w5, w6, w7, (len - pos1) * 2); +} + void sha512_final_vector (sha512_ctx_vector_t *ctx) { int pos = ctx->len & 127; diff --git a/OpenCL/inc_hash_whirlpool.cl b/OpenCL/inc_hash_whirlpool.cl index ed57bc771..6b319b462 100644 --- a/OpenCL/inc_hash_whirlpool.cl 
+++ b/OpenCL/inc_hash_whirlpool.cl @@ -2717,6 +2717,92 @@ void whirlpool_update_vector (whirlpool_ctx_vector_t *ctx, const u32x *w, const whirlpool_update_vector_64 (ctx, w0, w1, w2, w3, len - pos1); } +void whirlpool_update_vector_swap (whirlpool_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 64; pos1 += 64, pos4 += 16) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + whirlpool_update_vector_64 (ctx, w0, w1, w2, w3, 64); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + w2[0] = w[pos4 + 8]; + w2[1] = w[pos4 + 9]; + w2[2] = w[pos4 + 10]; + w2[3] = w[pos4 + 11]; + w3[0] = w[pos4 + 12]; + w3[1] = w[pos4 + 13]; + w3[2] = w[pos4 + 14]; + w3[3] = w[pos4 + 15]; + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); 
+ w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + whirlpool_update_vector_64 (ctx, w0, w1, w2, w3, len - pos1); +} + void whirlpool_update_vector_utf16le (whirlpool_ctx_vector_t *ctx, const u32x *w, const int len) { u32x w0[4]; @@ -2759,6 +2845,82 @@ void whirlpool_update_vector_utf16le (whirlpool_ctx_vector_t *ctx, const u32x *w whirlpool_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2); } +void whirlpool_update_vector_utf16le_swap (whirlpool_ctx_vector_t *ctx, const u32x *w, const int len) +{ + u32x w0[4]; + u32x w1[4]; + u32x w2[4]; + u32x w3[4]; + + int pos1; + int pos4; + + for (pos1 = 0, pos4 = 0; pos1 < len - 32; pos1 += 32, pos4 += 8) + { + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16le (w1, w2, w3); + make_utf16le (w0, w0, w1); + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 (w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + whirlpool_update_vector_64 (ctx, w0, w1, w2, w3, 32 * 2); + } + + w0[0] = w[pos4 + 0]; + w0[1] = w[pos4 + 1]; + w0[2] = w[pos4 + 2]; + w0[3] = w[pos4 + 3]; + w1[0] = w[pos4 + 4]; + w1[1] = w[pos4 + 5]; + w1[2] = w[pos4 + 6]; + w1[3] = w[pos4 + 7]; + + make_utf16le (w1, w2, w3); + make_utf16le (w0, w0, w1); + + w0[0] = swap32 (w0[0]); + w0[1] = swap32 (w0[1]); + w0[2] = swap32 (w0[2]); + w0[3] = swap32 (w0[3]); + w1[0] = swap32 (w1[0]); + w1[1] = swap32 (w1[1]); + w1[2] = swap32 (w1[2]); + w1[3] = swap32 (w1[3]); + w2[0] = swap32 (w2[0]); + w2[1] = swap32 (w2[1]); + w2[2] = swap32 (w2[2]); + w2[3] = swap32 
(w2[3]); + w3[0] = swap32 (w3[0]); + w3[1] = swap32 (w3[1]); + w3[2] = swap32 (w3[2]); + w3[3] = swap32 (w3[3]); + + whirlpool_update_vector_64 (ctx, w0, w1, w2, w3, (len - pos1) * 2); +} + void whirlpool_final_vector (whirlpool_ctx_vector_t *ctx) { int pos = ctx->len & 63;