diff --git a/OpenCL/m03200-pure.cl b/OpenCL/m03200-pure.cl index b6dce0a80..b950da688 100644 --- a/OpenCL/m03200-pure.cl +++ b/OpenCL/m03200-pure.cl @@ -299,16 +299,21 @@ __constant u32a c_sbox3[256] = 0xb74e6132, 0xce77e25b, 0x578fdfe3, 0x3ac372e6 }; -#define BF_ROUND(L,R,N) \ -{ \ - u32 tmp; \ - \ - tmp = S0[hc_bfe_S ((L), 24, 8)]; \ - tmp += S1[hc_bfe_S ((L), 16, 8)]; \ - tmp ^= S2[hc_bfe_S ((L), 8, 8)]; \ - tmp += S3[hc_bfe_S ((L), 0, 8)]; \ - \ - (R) ^= tmp ^ P[(N)]; \ +#define BF_ROUND(L,R,N) \ +{ \ + u32 tmp; \ + \ + const u32 r0 = hc_bfe_S ((L), 24, 8); \ + const u32 r1 = hc_bfe_S ((L), 16, 8); \ + const u32 r2 = hc_bfe_S ((L), 8, 8); \ + const u32 r3 = hc_bfe_S ((L), 0, 8); \ + \ + tmp = S0[r0]; \ + tmp += S1[r1]; \ + tmp ^= S2[r2]; \ + tmp += S3[r3]; \ + \ + (R) ^= tmp ^ P[(N)]; \ } #define BF_ENCRYPT(L,R) \ @@ -361,7 +366,7 @@ DECLSPEC void expand_key (u32 *E, u32 *W, const int len) } } -__kernel void m03200_init (KERN_ATTR_TMPS (bcrypt_tmp_t)) +__kernel void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03200_init (KERN_ATTR_TMPS (bcrypt_tmp_t)) { /** * base @@ -438,10 +443,10 @@ __kernel void m03200_init (KERN_ATTR_TMPS (bcrypt_tmp_t)) * do the key setup */ - __local u32 S0_all[8][256]; - __local u32 S1_all[8][256]; - __local u32 S2_all[8][256]; - __local u32 S3_all[8][256]; + __local u32 S0_all[FIXED_LOCAL_SIZE][256]; + __local u32 S1_all[FIXED_LOCAL_SIZE][256]; + __local u32 S2_all[FIXED_LOCAL_SIZE][256]; + __local u32 S3_all[FIXED_LOCAL_SIZE][256]; __local u32 *S0 = S0_all[lid]; __local u32 *S1 = S1_all[lid]; @@ -580,7 +585,7 @@ __kernel void m03200_init (KERN_ATTR_TMPS (bcrypt_tmp_t)) } } -__kernel void m03200_loop (KERN_ATTR_TMPS (bcrypt_tmp_t)) +__kernel void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03200_loop (KERN_ATTR_TMPS (bcrypt_tmp_t)) { /** * base @@ -607,10 +612,10 @@ __kernel void m03200_loop (KERN_ATTR_TMPS (bcrypt_tmp_t)) P[i] = tmps[gid].P[i]; } - __local u32 S0_all[8][256]; - __local u32 S1_all[8][256]; - __local u32 S2_all[8][256]; - __local u32 S3_all[8][256]; + __local u32 S0_all[FIXED_LOCAL_SIZE][256]; + __local u32 S1_all[FIXED_LOCAL_SIZE][256]; + __local u32 S2_all[FIXED_LOCAL_SIZE][256]; + __local u32 S3_all[FIXED_LOCAL_SIZE][256]; __local u32 *S0 = S0_all[lid]; __local u32 *S1 = S1_all[lid]; @@ -778,7 +783,7 @@ __kernel void m03200_loop (KERN_ATTR_TMPS (bcrypt_tmp_t)) } } -__kernel void m03200_comp (KERN_ATTR_TMPS (bcrypt_tmp_t)) +__kernel void __attribute__((reqd_work_group_size(FIXED_LOCAL_SIZE, 1, 1))) m03200_comp (KERN_ATTR_TMPS (bcrypt_tmp_t)) { /** * base @@ -798,10 +803,10 @@ __kernel void m03200_comp (KERN_ATTR_TMPS (bcrypt_tmp_t)) P[i] = tmps[gid].P[i]; } - __local u32 S0_all[8][256]; - __local u32 S1_all[8][256]; - __local u32 S2_all[8][256]; - __local u32 S3_all[8][256]; + __local u32 S0_all[FIXED_LOCAL_SIZE][256]; + __local u32 S1_all[FIXED_LOCAL_SIZE][256]; + __local u32 S2_all[FIXED_LOCAL_SIZE][256]; + __local u32 S3_all[FIXED_LOCAL_SIZE][256]; __local u32 *S0 = S0_all[lid]; __local u32 *S1 = S1_all[lid]; diff --git a/docs/changes.txt b/docs/changes.txt index 4ae77d52c..f4d3760de 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -43,6 +43,7 @@ ## Improvements ## +- Cracking bcrypt: Use a feedback from the OpenCL runtime to dynamically find out optimal thread count - Bitcoin Wallet: Be more user friendly by allowing a larger data range for ckey and public_key - Building: Updated BUILD.md - My Wallet: Added additional plaintext pattern used in newer versions diff --git a/src/modules/module_03200.c b/src/modules/module_03200.c index ba35a802c..a9d4efc0c 100644 --- a/src/modules/module_03200.c +++ b/src/modules/module_03200.c @@ -72,18 +72,34 @@ u64 module_tmp_size (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED c return tmp_size; } -u32 module_kernel_threads_min (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) +char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra, MAYBE_UNUSED const hashes_t *hashes, MAYBE_UNUSED const hc_device_param_t *device_param) { - const u32 kernel_threads_min = 8; + char *jit_build_options = NULL; - return kernel_threads_min; -} + // this uses some nice feedback effect. + // based on the device_local_mem_size the reqd_work_group_size in the kernel is set to some value + // which is then is read from the opencl host in the kernel_preferred_wgs_multiple1/2/3 result. + // therefore we do not need to set module_kernel_threads_min/max except for CPU, where the threads are set to fixed 1. + // note we need to use device_param->device_local_mem_size - 4 because opencl jit returns with: + // Entry function '...' uses too much shared data (0xc004 bytes, 0xc000 max) + // on my development system. no clue where the 4 bytes are spent. + // I did some research on this and it seems to be related with the datatype. + // For example, if i used u8 instead, there's only 1 byte wasted. -u32 module_kernel_threads_max (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED const user_options_t *user_options, MAYBE_UNUSED const user_options_extra_t *user_options_extra) -{ - const u32 kernel_threads_max = 8; + u32 fixed_local_size = 0; - return kernel_threads_max; + if (device_param->device_type & CL_DEVICE_TYPE_CPU) + { + fixed_local_size = 1; + } + else + { + fixed_local_size = (device_param->device_local_mem_size - 4) / 4096; + } + + hc_asprintf (&jit_build_options, "-D FIXED_LOCAL_SIZE=%u", fixed_local_size); + + return jit_build_options; } int module_hash_decode (MAYBE_UNUSED const hashconfig_t *hashconfig, MAYBE_UNUSED void *digest_buf, MAYBE_UNUSED salt_t *salt, MAYBE_UNUSED void *esalt_buf, MAYBE_UNUSED void *hook_salt_buf, MAYBE_UNUSED hashinfo_t *hash_info, const char *line_buf, MAYBE_UNUSED const int line_len) @@ -236,14 +252,14 @@ void module_init (module_ctx_t *module_ctx) module_ctx->module_hook23 = MODULE_DEFAULT; module_ctx->module_hook_salt_size = MODULE_DEFAULT; module_ctx->module_hook_size = MODULE_DEFAULT; - module_ctx->module_jit_build_options = MODULE_DEFAULT; + module_ctx->module_jit_build_options = module_jit_build_options; module_ctx->module_jit_cache_disable = MODULE_DEFAULT; module_ctx->module_kernel_accel_max = MODULE_DEFAULT; module_ctx->module_kernel_accel_min = MODULE_DEFAULT; module_ctx->module_kernel_loops_max = MODULE_DEFAULT; module_ctx->module_kernel_loops_min = MODULE_DEFAULT; - module_ctx->module_kernel_threads_max = module_kernel_threads_max; - module_ctx->module_kernel_threads_min = module_kernel_threads_min; + module_ctx->module_kernel_threads_max = MODULE_DEFAULT; + module_ctx->module_kernel_threads_min = MODULE_DEFAULT; module_ctx->module_kern_type = module_kern_type; module_ctx->module_kern_type_dynamic = MODULE_DEFAULT; module_ctx->module_opti_type = module_opti_type;