From 63f6ca5114efa2e27ab7da126fc414886df64d13 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Tue, 5 Sep 2017 16:45:20 +0200 Subject: [PATCH] Do not use __local memory for whirlpool if running on a device without physical shared memory --- OpenCL/inc_cipher_aes.cl | 6 ---- OpenCL/inc_hash_whirlpool.cl | 30 +++++++++--------- OpenCL/inc_vendor.cl | 6 ++++ OpenCL/m06100_a0-optimized.cl | 20 +++++++++++- OpenCL/m06100_a0.cl | 18 +++++++++++ OpenCL/m06100_a1-optimized.cl | 20 +++++++++++- OpenCL/m06100_a1.cl | 18 +++++++++++ OpenCL/m06100_a3-optimized.cl | 60 +++++++++++++++++++++++++++++++++-- OpenCL/m06100_a3.cl | 18 +++++++++++ OpenCL/m06231.cl | 40 ++++++++++++++++------- OpenCL/m06232.cl | 40 ++++++++++++++++------- OpenCL/m06233.cl | 40 ++++++++++++++++------- 12 files changed, 257 insertions(+), 59 deletions(-) diff --git a/OpenCL/inc_cipher_aes.cl b/OpenCL/inc_cipher_aes.cl index a393a8605..dff681730 100644 --- a/OpenCL/inc_cipher_aes.cl +++ b/OpenCL/inc_cipher_aes.cl @@ -690,12 +690,6 @@ __constant u32a rcon[] = 0x1b000000, 0x36000000, }; -#ifdef REAL_SHM -#define SHM_TYPE __local -#else -#define SHM_TYPE __constant -#endif - // 128 bit key static void aes128_ExpandKey (u32 *ks, const u32 *ukey, SHM_TYPE u32 *s_te0, SHM_TYPE u32 *s_te1, SHM_TYPE u32 *s_te2, SHM_TYPE u32 *s_te3, SHM_TYPE u32 *s_te4) diff --git a/OpenCL/inc_hash_whirlpool.cl b/OpenCL/inc_hash_whirlpool.cl index 371d98a4d..4b3b1c03a 100644 --- a/OpenCL/inc_hash_whirlpool.cl +++ b/OpenCL/inc_hash_whirlpool.cl @@ -1125,12 +1125,12 @@ typedef struct whirlpool_ctx int len; - __local u32 (*s_Ch)[256]; - __local u32 (*s_Cl)[256]; + SHM_TYPE u32 (*s_Ch)[256]; + SHM_TYPE u32 (*s_Cl)[256]; } whirlpool_ctx_t; -static void whirlpool_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[16], __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +static void whirlpool_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[16], SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { u32 Kh[8]; u32 Kl[8]; @@ -1300,7 +1300,7 @@ static void whirlpool_transform (const u32 w0[4], const u32 w1[4], const u32 w2[ digest[15] ^= statel[7] ^ w3[3]; } -static void whirlpool_init (whirlpool_ctx_t *ctx, __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +static void whirlpool_init (whirlpool_ctx_t *ctx, SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { ctx->h[ 0] = 0; ctx->h[ 1] = 0; @@ -1975,7 +1975,7 @@ typedef struct whirlpool_hmac_ctx } whirlpool_hmac_ctx_t; -static void whirlpool_hmac_init_64 (whirlpool_hmac_ctx_t *ctx, const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +static void whirlpool_hmac_init_64 (whirlpool_hmac_ctx_t *ctx, const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { u32 t0[4]; u32 t1[4]; @@ -2029,7 +2029,7 @@ static void whirlpool_hmac_init_64 (whirlpool_hmac_ctx_t *ctx, const u32 w0[4], whirlpool_update_64 (&ctx->opad, t0, t1, t2, t3, 64); } -static void whirlpool_hmac_init (whirlpool_hmac_ctx_t *ctx, const u32 *w, const int len, __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +static void whirlpool_hmac_init (whirlpool_hmac_ctx_t *ctx, const u32 *w, const int len, SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { u32 w0[4]; u32 w1[4]; @@ -2086,7 +2086,7 @@ static void whirlpool_hmac_init (whirlpool_hmac_ctx_t *ctx, const u32 *w, const whirlpool_hmac_init_64 (ctx, w0, w1, w2, w3, s_Ch, s_Cl); } -static void whirlpool_hmac_init_swap (whirlpool_hmac_ctx_t *ctx, const u32 *w, const int len, __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +static void whirlpool_hmac_init_swap (whirlpool_hmac_ctx_t *ctx, const u32 *w, const int len, SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { u32 w0[4]; u32 w1[4]; @@ -2143,7 +2143,7 @@ static void whirlpool_hmac_init_swap (whirlpool_hmac_ctx_t *ctx, const u32 *w, c whirlpool_hmac_init_64 (ctx, w0, w1, w2, w3, s_Ch, s_Cl); } -static void whirlpool_hmac_init_global (whirlpool_hmac_ctx_t *ctx, __global const u32 *w, const int len, __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +static void whirlpool_hmac_init_global (whirlpool_hmac_ctx_t *ctx, __global const u32 *w, const int len, SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { u32 w0[4]; u32 w1[4]; @@ -2200,7 +2200,7 @@ static void whirlpool_hmac_init_global (whirlpool_hmac_ctx_t *ctx, __global cons whirlpool_hmac_init_64 (ctx, w0, w1, w2, w3, s_Ch, s_Cl); } -static void whirlpool_hmac_init_global_swap (whirlpool_hmac_ctx_t *ctx, __global const u32 *w, const int len, __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +static void whirlpool_hmac_init_global_swap (whirlpool_hmac_ctx_t *ctx, __global const u32 *w, const int len, SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { u32 w0[4]; u32 w1[4]; @@ -2346,12 +2346,12 @@ typedef struct whirlpool_ctx_vector int len; - __local u32 (*s_Ch)[256]; - __local u32 (*s_Cl)[256]; + SHM_TYPE u32 (*s_Ch)[256]; + SHM_TYPE u32 (*s_Cl)[256]; } whirlpool_ctx_vector_t; -static void whirlpool_transform_vector (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[16], __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +static void whirlpool_transform_vector (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[16], SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { u32x Kh[8]; u32x Kl[8]; @@ -2521,7 +2521,7 @@ static void whirlpool_transform_vector (const u32x w0[4], const u32x w1[4], cons digest[15] ^= statel[7] ^ w3[3]; } -static void whirlpool_init_vector (whirlpool_ctx_vector_t *ctx, __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +static void whirlpool_init_vector (whirlpool_ctx_vector_t *ctx, SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { ctx->h[ 0] = 0; ctx->h[ 1] = 0; @@ -2982,7 +2982,7 @@ typedef struct whirlpool_hmac_ctx_vector } whirlpool_hmac_ctx_vector_t; -static void whirlpool_hmac_init_vector_64 (whirlpool_hmac_ctx_vector_t *ctx, const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +static void whirlpool_hmac_init_vector_64 (whirlpool_hmac_ctx_vector_t *ctx, const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { u32x t0[4]; u32x t1[4]; @@ -3036,7 +3036,7 @@ static void whirlpool_hmac_init_vector_64 (whirlpool_hmac_ctx_vector_t *ctx, con whirlpool_update_vector_64 (&ctx->opad, t0, t1, t2, t3, 64); } -static void whirlpool_hmac_init_vector (whirlpool_hmac_ctx_vector_t *ctx, const u32x *w, const int len, __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +static void whirlpool_hmac_init_vector (whirlpool_hmac_ctx_vector_t *ctx, const u32x *w, const int len, SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { u32x w0[4]; u32x w1[4]; diff --git a/OpenCL/inc_vendor.cl b/OpenCL/inc_vendor.cl index f843fd3bf..2c30f9935 100644 --- a/OpenCL/inc_vendor.cl +++ b/OpenCL/inc_vendor.cl @@ -25,6 +25,12 @@ #elif DEVICE_TYPE == DEVICE_TYPE_ACCEL #endif +#ifdef REAL_SHM +#define SHM_TYPE __local +#else +#define SHM_TYPE __constant +#endif + /** * vendor specific */ diff --git a/OpenCL/m06100_a0-optimized.cl b/OpenCL/m06100_a0-optimized.cl index 270e0cd34..dcb16a7b9 100644 --- a/OpenCL/m06100_a0-optimized.cl +++ b/OpenCL/m06100_a0-optimized.cl @@ -15,7 +15,7 @@ #include "inc_simd.cl" #include "inc_hash_whirlpool.cl" -void whirlpool_transform_transport_vector (const u32x w[16], u32x digest[16], __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +void whirlpool_transform_transport_vector (const u32x w[16], u32x digest[16], SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { whirlpool_transform_vector (w + 0, w + 4, w + 8, w + 12, digest, s_Ch, s_Cl); } @@ -34,6 +34,8 @@ __kernel void m06100_m04 (__global pw_t *pws, __constant const kernel_rule_t *ru * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -60,6 +62,13 @@ __kernel void m06100_m04 (__global pw_t *pws, __constant const kernel_rule_t *ru barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** @@ -165,6 +174,8 @@ __kernel void m06100_s04 (__global pw_t *pws, __constant const kernel_rule_t *ru * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -191,6 +202,13 @@ __kernel void m06100_s04 (__global pw_t *pws, __constant const kernel_rule_t *ru barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** diff --git a/OpenCL/m06100_a0.cl b/OpenCL/m06100_a0.cl index baf8b2c14..e8d0178c9 100644 --- a/OpenCL/m06100_a0.cl +++ b/OpenCL/m06100_a0.cl @@ -29,6 +29,8 @@ __kernel void m06100_mxx (__global pw_t *pws, __constant const kernel_rule_t *ru * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -55,6 +57,13 @@ __kernel void m06100_mxx (__global pw_t *pws, __constant const kernel_rule_t *ru barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** @@ -104,6 +113,8 @@ __kernel void m06100_sxx (__global pw_t *pws, __constant const kernel_rule_t *ru * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -130,6 +141,13 @@ __kernel void m06100_sxx (__global pw_t *pws, __constant const kernel_rule_t *ru barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** diff --git a/OpenCL/m06100_a1-optimized.cl b/OpenCL/m06100_a1-optimized.cl index bc7b8b956..6da1a8819 100644 --- a/OpenCL/m06100_a1-optimized.cl +++ b/OpenCL/m06100_a1-optimized.cl @@ -13,7 +13,7 @@ #include "inc_simd.cl" #include "inc_hash_whirlpool.cl" -void whirlpool_transform_transport_vector (const u32x w[16], u32x digest[16], __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +void whirlpool_transform_transport_vector (const u32x w[16], u32x digest[16], SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { whirlpool_transform_vector (w + 0, w + 4, w + 8, w + 12, digest, s_Ch, s_Cl); } @@ -32,6 +32,8 @@ __kernel void m06100_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -58,6 +60,13 @@ __kernel void m06100_m04 (__global pw_t *pws, __global const kernel_rule_t *rule barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** @@ -221,6 +230,8 @@ __kernel void m06100_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -247,6 +258,13 @@ __kernel void m06100_s04 (__global pw_t *pws, __global const kernel_rule_t *rule barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** diff --git a/OpenCL/m06100_a1.cl b/OpenCL/m06100_a1.cl index 0081e5463..e7697a48b 100644 --- a/OpenCL/m06100_a1.cl +++ b/OpenCL/m06100_a1.cl @@ -27,6 +27,8 @@ __kernel void m06100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -53,6 +55,13 @@ __kernel void m06100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** @@ -100,6 +109,8 @@ __kernel void m06100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -126,6 +137,13 @@ __kernel void m06100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** diff --git a/OpenCL/m06100_a3-optimized.cl b/OpenCL/m06100_a3-optimized.cl index d14c65cfd..291285cc6 100644 --- a/OpenCL/m06100_a3-optimized.cl +++ b/OpenCL/m06100_a3-optimized.cl @@ -13,12 +13,12 @@ #include "inc_simd.cl" #include "inc_hash_whirlpool.cl" -void whirlpool_transform_transport_vector (const u32x w[16], u32x digest[16], __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +void whirlpool_transform_transport_vector (const u32x w[16], u32x digest[16], SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { whirlpool_transform_vector (w + 0, w + 4, w + 8, w + 12, digest, s_Ch, s_Cl); } -void m06100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 (*s_Cl)[256], __local u32 (*s_Ch)[256]) +void m06100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, SHM_TYPE u32 (*s_Cl)[256], SHM_TYPE u32 (*s_Ch)[256]) { /** * modifier @@ -87,7 +87,7 @@ void m06100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __gl } } -void m06100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 (*s_Cl)[256], __local u32 (*s_Ch)[256]) +void m06100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global const kernel_rule_t *rules_buf, __global const pw_t *combs_buf, __global const bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global const u32 *bitmaps_buf_s1_a, __global const u32 *bitmaps_buf_s1_b, __global const u32 *bitmaps_buf_s1_c, __global const u32 *bitmaps_buf_s1_d, __global const u32 *bitmaps_buf_s2_a, __global const u32 *bitmaps_buf_s2_b, __global const u32 *bitmaps_buf_s2_c, __global const u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global const digest_t *digests_buf, __global u32 *hashes_shown, __global const salt_t *salt_bufs, __global const void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV0_buf, __global u32 *d_scryptV1_buf, __global u32 *d_scryptV2_buf, __global u32 *d_scryptV3_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 il_cnt, const u32 digests_cnt, const u32 digests_offset, SHM_TYPE u32 (*s_Cl)[256], SHM_TYPE u32 (*s_Ch)[256]) { /** * modifier @@ -182,6 +182,8 @@ __kernel void m06100_m04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -208,6 +210,13 @@ __kernel void m06100_m04 (__global pw_t *pws, __global const kernel_rule_t *rule barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** @@ -265,6 +274,8 @@ __kernel void m06100_m08 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -291,6 +302,13 @@ __kernel void m06100_m08 (__global pw_t *pws, __global const kernel_rule_t *rule barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** @@ -348,6 +366,8 @@ __kernel void m06100_m16 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -374,6 +394,13 @@ __kernel void m06100_m16 (__global pw_t *pws, __global const kernel_rule_t *rule barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** @@ -431,6 +458,8 @@ __kernel void m06100_s04 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -457,6 +486,13 @@ __kernel void m06100_s04 (__global pw_t *pws, __global const kernel_rule_t *rule barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** @@ -514,6 +550,8 @@ __kernel void m06100_s08 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -540,6 +578,13 @@ __kernel void m06100_s08 (__global pw_t *pws, __global const kernel_rule_t *rule barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** @@ -597,6 +642,8 @@ __kernel void m06100_s16 (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -623,6 +670,13 @@ __kernel void m06100_s16 (__global pw_t *pws, __global const kernel_rule_t *rule barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** diff --git a/OpenCL/m06100_a3.cl b/OpenCL/m06100_a3.cl index 72d45fa29..6842cf061 100644 --- a/OpenCL/m06100_a3.cl +++ b/OpenCL/m06100_a3.cl @@ -27,6 +27,8 @@ __kernel void m06100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -53,6 +55,13 @@ __kernel void m06100_mxx (__global pw_t *pws, __global const kernel_rule_t *rule barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** @@ -113,6 +122,8 @@ __kernel void m06100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -139,6 +150,13 @@ __kernel void m06100_sxx (__global pw_t *pws, __global const kernel_rule_t *rule barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; /** diff --git a/OpenCL/m06231.cl b/OpenCL/m06231.cl index d22e6712c..c978bd38a 100644 --- a/OpenCL/m06231.cl +++ b/OpenCL/m06231.cl @@ -45,7 +45,7 @@ u32 u8add (const u32 a, const u32 b) return r; } -void hmac_whirlpool_run_V (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[16], u32x opad[16], u32x digest[16], __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +void hmac_whirlpool_run_V (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[16], u32x opad[16], u32x digest[16], SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { digest[ 0] = ipad[ 0]; digest[ 1] = ipad[ 1]; @@ -155,6 +155,8 @@ __kernel void m06231_init (__global pw_t *pws, __global const kernel_rule_t *rul * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -181,6 +183,13 @@ __kernel void m06231_init (__global pw_t *pws, __global const kernel_rule_t *rul barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; u32 w0[4]; @@ -354,6 +363,8 @@ __kernel void m06231_loop (__global pw_t *pws, __global const kernel_rule_t *rul * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -380,6 +391,13 @@ __kernel void m06231_loop (__global pw_t *pws, __global const kernel_rule_t *rul barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if ((gid * VECT_SIZE) >= gid_max) return; u32x ipad[16]; @@ -550,17 +568,17 @@ __kernel void m06231_comp (__global pw_t *pws, __global const kernel_rule_t *rul #ifdef REAL_SHM - __local u32 s_td0[256]; - __local u32 s_td1[256]; - __local u32 s_td2[256]; - __local u32 s_td3[256]; - __local u32 s_td4[256]; + SHM_TYPE u32 s_td0[256]; + SHM_TYPE u32 s_td1[256]; + SHM_TYPE u32 s_td2[256]; + SHM_TYPE u32 s_td3[256]; + SHM_TYPE u32 s_td4[256]; - __local u32 s_te0[256]; - __local u32 s_te1[256]; - __local u32 s_te2[256]; - __local u32 s_te3[256]; - __local u32 s_te4[256]; + SHM_TYPE u32 s_te0[256]; + SHM_TYPE u32 s_te1[256]; + SHM_TYPE u32 s_te2[256]; + SHM_TYPE u32 s_te3[256]; + SHM_TYPE u32 s_te4[256]; for (u32 i = lid; i < 256; i += lsz) { diff --git a/OpenCL/m06232.cl b/OpenCL/m06232.cl index f7a777b6e..eaac27c99 100644 --- a/OpenCL/m06232.cl +++ b/OpenCL/m06232.cl @@ -45,7 +45,7 @@ u32 u8add (const u32 a, const u32 b) return r; } -void hmac_whirlpool_run_V (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[16], u32x opad[16], u32x digest[16], __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +void hmac_whirlpool_run_V (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[16], u32x opad[16], u32x digest[16], SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { digest[ 0] = ipad[ 0]; digest[ 1] = ipad[ 1]; @@ -155,6 +155,8 @@ __kernel void m06232_init (__global pw_t *pws, __global const kernel_rule_t *rul * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -181,6 +183,13 @@ __kernel void m06232_init (__global pw_t *pws, __global const kernel_rule_t *rul barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; u32 w0[4]; @@ -354,6 +363,8 @@ __kernel void m06232_loop (__global pw_t *pws, __global const kernel_rule_t *rul * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -380,6 +391,13 @@ __kernel void m06232_loop (__global pw_t *pws, __global const kernel_rule_t *rul barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if ((gid * VECT_SIZE) >= gid_max) return; u32x ipad[16]; @@ -550,17 +568,17 @@ __kernel void m06232_comp (__global pw_t *pws, __global const kernel_rule_t *rul #ifdef REAL_SHM - __local u32 s_td0[256]; - __local u32 s_td1[256]; - __local u32 s_td2[256]; - __local u32 s_td3[256]; - __local u32 s_td4[256]; + SHM_TYPE u32 s_td0[256]; + SHM_TYPE u32 s_td1[256]; + SHM_TYPE u32 s_td2[256]; + SHM_TYPE u32 s_td3[256]; + SHM_TYPE u32 s_td4[256]; - __local u32 s_te0[256]; - __local u32 s_te1[256]; - __local u32 s_te2[256]; - __local u32 s_te3[256]; - __local u32 s_te4[256]; + SHM_TYPE u32 s_te0[256]; + SHM_TYPE u32 s_te1[256]; + SHM_TYPE u32 s_te2[256]; + SHM_TYPE u32 s_te3[256]; + SHM_TYPE u32 s_te4[256]; for (u32 i = lid; i < 256; i += lsz) { diff --git a/OpenCL/m06233.cl b/OpenCL/m06233.cl index edd91377d..bcf479528 100644 --- a/OpenCL/m06233.cl +++ b/OpenCL/m06233.cl @@ -45,7 +45,7 @@ u32 u8add (const u32 a, const u32 b) return r; } -void hmac_whirlpool_run_V (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[16], u32x opad[16], u32x digest[16], __local u32 (*s_Ch)[256], __local u32 (*s_Cl)[256]) +void hmac_whirlpool_run_V (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[16], u32x opad[16], u32x digest[16], SHM_TYPE u32 (*s_Ch)[256], SHM_TYPE u32 (*s_Cl)[256]) { digest[ 0] = ipad[ 0]; digest[ 1] = ipad[ 1]; @@ -155,6 +155,8 @@ __kernel void m06233_init (__global pw_t *pws, __global const kernel_rule_t *rul * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -181,6 +183,13 @@ __kernel void m06233_init (__global pw_t *pws, __global const kernel_rule_t *rul barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if (gid >= gid_max) return; u32 w0[4]; @@ -354,6 +363,8 @@ __kernel void m06233_loop (__global pw_t *pws, __global const kernel_rule_t *rul * shared */ + #ifdef REAL_SHM + __local u32 s_Ch[8][256]; __local u32 s_Cl[8][256]; @@ -380,6 +391,13 @@ __kernel void m06233_loop (__global pw_t *pws, __global const kernel_rule_t *rul barrier (CLK_LOCAL_MEM_FENCE); + #else + + __constant u32 (*s_Ch)[256] = Ch; + __constant u32 (*s_Cl)[256] = Cl; + + #endif + if ((gid * VECT_SIZE) >= gid_max) return; u32x ipad[16]; @@ -550,17 +568,17 @@ __kernel void m06233_comp (__global pw_t *pws, __global const kernel_rule_t *rul #ifdef REAL_SHM - __local u32 s_td0[256]; - __local u32 s_td1[256]; - __local u32 s_td2[256]; - __local u32 s_td3[256]; - __local u32 s_td4[256]; + SHM_TYPE u32 s_td0[256]; + SHM_TYPE u32 s_td1[256]; + SHM_TYPE u32 s_td2[256]; + SHM_TYPE u32 s_td3[256]; + SHM_TYPE u32 s_td4[256]; - __local u32 s_te0[256]; - __local u32 s_te1[256]; - __local u32 s_te2[256]; - __local u32 s_te3[256]; - __local u32 s_te4[256]; + SHM_TYPE u32 s_te0[256]; + SHM_TYPE u32 s_te1[256]; + SHM_TYPE u32 s_te2[256]; + SHM_TYPE u32 s_te3[256]; + SHM_TYPE u32 s_te4[256]; for (u32 i = lid; i < 256; i += lsz) {