diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h
index 8f0d6db..e988cfd 100644
--- a/compat/ccminer-config.h
+++ b/compat/ccminer-config.h
@@ -164,7 +164,7 @@
 #define PACKAGE_URL "http://github.com/zcoinofficial/ccminer"
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "1.2.9-djm34-beta"
+#define PACKAGE_VERSION "1.2.9-djm34"
 
 /* If using the C implementation of alloca, define if you know the
    direction of stack growth for your system; otherwise it will be
diff --git a/configure.ac b/configure.ac
index aaf8706..51bb16b 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [1.2.9L-djm34-beta], [], [ccminer], [http://github.com/zcoinofficial/ccminer])
+AC_INIT([ccminer], [1.2.9L-djm34], [], [ccminer], [http://github.com/zcoinofficial/ccminer])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cuda_mtp/cuda_mtp_forlib.cu b/cuda_mtp/cuda_mtp_forlib.cu
index fd8b48a..be2e6dd 100644
--- a/cuda_mtp/cuda_mtp_forlib.cu
+++ b/cuda_mtp/cuda_mtp_forlib.cu
@@ -986,7 +986,7 @@ uint32_t get_tpb_mtp(int thr_id)
 
 
 __host__
-void mtp_setBlockTarget(int thr_id, const void* pDataIn, const void *pTargetIn, const void * zElement)
+void mtp_setBlockTarget_old(int thr_id, const void* pDataIn, const void *pTargetIn, const void * zElement)
 {
 //	cudaSetDevice(device_map[thr_id]);
 
@@ -997,7 +997,7 @@ void mtp_setBlockTarget(int thr_id, const void* pDataIn, const void *pTargetIn,
 }
 
 __host__
-void mtp_setBlockTarget_test(int thr_id, const void* pDataIn, const void *pTargetIn, const void * zElement,cudaStream_t s0)
+void mtp_setBlockTarget(int thr_id, const void* pDataIn, const void *pTargetIn, const void * zElement,cudaStream_t s0)
 {
 	//	cudaSetDevice(device_map[thr_id]);
 
@@ -1024,12 +1024,11 @@ void mtp_fill(uint32_t dev_id, const uint64_t *Block, uint32_t offset, uint32_t
 }
 
 __host__
-uint32_t mtp_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce)
+uint32_t mtp_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce,cudaStream_t s0)
 {
 //	cudaSetDevice(device_map[thr_id]);
 	uint32_t result = UINT32_MAX;
-	CUDA_SAFE_CALL(cudaMemset(d_MinNonces[thr_id], 0xff, sizeof(uint32_t)));
-//	int dev_id = device_map[thr_id % MAX_GPUS];
+	cudaMemsetAsync(d_MinNonces[thr_id], 0xff, sizeof(uint32_t), s0);
 
 
 	uint32_t tpb = TPB_MTP; //TPB52;
@@ -1040,11 +1039,10 @@ uint32_t mtp_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce)
 	dim3 blockyloop(tpb);
 
 	//yloop_init <<<gridyloop, blockyloop>>>(thr_id, threads, startNounce, GYLocal[thr_id]);
+	cudaStreamSynchronize(s0);
 
 	mtp_yloop << < gridyloop, blockyloop >> >(thr_id, threads, startNounce, (Type*)HBlock[thr_id],  d_MinNonces[thr_id]);
-
-
-	CUDA_SAFE_CALL(cudaMemcpy(h_MinNonces[thr_id], d_MinNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost));
+	cudaStreamSynchronize(s0);
 
 	result = *h_MinNonces[thr_id];
 	return result; 
@@ -1785,11 +1783,11 @@ __global__  void mtp_fc2(uint32_t threads, uint4  *  DBlock, uint2 *a) {
 
 
 
-__host__ void get_tree(int thr_id, uint8_t* d) {
+__host__ void get_tree_old(int thr_id, uint8_t* d) {
 	CUDA_SAFE_CALL(cudaMemcpy(d, buffer_a[thr_id], sizeof(uint2) * 2 * 1048576 * 4, cudaMemcpyDeviceToHost));
 }
 
-__host__ void get_tree_test(int thr_id, uint8_t* d, cudaStream_t s0) {
+__host__ void get_tree(int thr_id, uint8_t* d, cudaStream_t s0) {
 	CUDA_SAFE_CALL(cudaMemcpyAsync(d, buffer_a[thr_id], sizeof(uint2) * 2 * 1048576 * 4, cudaMemcpyDeviceToHost, s0));
 }
 
@@ -1913,7 +1911,7 @@ void mtp_fill_1b(int thr_id, uint64_t *Block, uint32_t block_nr)
 }
 
 __host__
-void mtp_fill_1c_test(int thr_id, uint64_t *Block, uint32_t block_nr, cudaStream_t s0)
+void mtp_fill_1c(int thr_id, uint64_t *Block, uint32_t block_nr, cudaStream_t s0)
 {
 //	cudaSetDevice(device_map[thr_id]);
 	//	uint4 *Blockptr = &HBlock[thr_id][block_nr * 64];
@@ -1934,7 +1932,7 @@ void mtp_fill_1c_test(int thr_id, uint64_t *Block, uint32_t block_nr, cudaStream
 }
 
 __host__
-void mtp_fill_1c(int thr_id, uint64_t *Block, uint32_t block_nr)
+void mtp_fill_1c_old(int thr_id, uint64_t *Block, uint32_t block_nr)
 {
 	//	cudaSetDevice(device_map[thr_id]);
 	//	uint4 *Blockptr = &HBlock[thr_id][block_nr * 64];
diff --git a/cuda_mtp/mtp-tcr.cu b/cuda_mtp/mtp-tcr.cu
index 2ac2f0d..569c09c 100644
--- a/cuda_mtp/mtp-tcr.cu
+++ b/cuda_mtp/mtp-tcr.cu
@@ -13,22 +13,13 @@
 #define memcost 4*1024*1024
 
 extern void mtp_cpu_init(int thr_id, uint32_t threads);
-
 extern uint32_t mtptcr_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, cudaStream_t s0);
-
-extern void mtp_setBlockTarget(int thr_id,const void* pDataIn, const void *pTargetIn, const void * zElement);
-extern void mtp_setBlockTarget_test(int thr_id, const void* pDataIn, const void *pTargetIn, const void * zElement,cudaStream_t s0);
-extern void mtp_fill(uint32_t d, const uint64_t *Block, uint32_t offset, uint32_t datachunk);
-extern void mtp_fill_1b(int thr_id, uint64_t *Block, uint32_t block_nr);
+extern void mtp_setBlockTarget(int thr_id, const void* pDataIn, const void *pTargetIn, const void * zElement,cudaStream_t s0);
 extern uint32_t get_tpb_mtp(int thr_id);
-
-extern void mtp_fill_1c(int thr_id, uint64_t *Block, uint32_t block_nr);
-extern void mtp_fill_1c_test(int thr_id, uint64_t *Block, uint32_t block_nr, cudaStream_t s0);
-//extern void mtp_i_cpu(int thr_id, uint32_t *block_header);
-extern void mtp_i_cpu(int thr_id, uint32_t *block_header, cudaStream_t s0);
+extern void mtp_fill_1c(int thr_id, uint64_t *Block, uint32_t block_nr, cudaStream_t s0);
 extern void mtp_i_cpu2(int thr_id, uint32_t *block_header, cudaStream_t s0);
-void get_tree(int thr_id, uint8_t* d);
-void get_tree_test(int thr_id, uint8_t* d, cudaStream_t s0);
+void get_tree(int thr_id, uint8_t* d, cudaStream_t s0);
+
 #define HASHLEN 32
 #define SALTLEN 16
 #define PWD "password"
@@ -60,7 +51,7 @@ extern "C" int scanhash_mtptcr(int nthreads,int thr_id, struct work* work, uint3
 //if (JobId==0)
 //	pthread_barrier_init(&barrier, NULL, nthreads);
 
-		cudaStream_t s0;
+	cudaStream_t s0;
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 
@@ -131,18 +122,18 @@ if (JobId[thr_id] != work->data[16] || XtraNonce2[thr_id] != ((uint64_t*)work->x
 context[thr_id] = init_argon2d_param((const char*)endiandata);
 
 argon2_ctx_from_mtp(&context[thr_id], &instance[thr_id]);
-	mtp_fill_1c_test(thr_id, instance[thr_id].memory[0 + 0].v, 0 + 0,s0);
-	mtp_fill_1c_test(thr_id, instance[thr_id].memory[0 + 1].v, 0 + 1,s0);
-	mtp_fill_1c_test(thr_id, instance[thr_id].memory[2 + 0].v, 1048576 + 0,s0);
-	mtp_fill_1c_test(thr_id, instance[thr_id].memory[2 + 1].v, 1048576 + 1,s0);
-	mtp_fill_1c_test(thr_id, instance[thr_id].memory[4 + 0].v, 2097152 + 0,s0);
-	mtp_fill_1c_test(thr_id, instance[thr_id].memory[4 + 1].v, 2097152 + 1,s0);
-	mtp_fill_1c_test(thr_id, instance[thr_id].memory[6 + 0].v, 3145728 + 0,s0);
-	mtp_fill_1c_test(thr_id, instance[thr_id].memory[6 + 1].v, 3145728 + 1,s0);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[0 + 0].v, 0 + 0,s0);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[0 + 1].v, 0 + 1,s0);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[2 + 0].v, 1048576 + 0,s0);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[2 + 1].v, 1048576 + 1,s0);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[4 + 0].v, 2097152 + 0,s0);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[4 + 1].v, 2097152 + 1,s0);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[6 + 0].v, 3145728 + 0,s0);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[6 + 1].v, 3145728 + 1,s0);
 
 	mtp_i_cpu2(thr_id, instance[thr_id].block_header,s0);
 
-	get_tree_test(thr_id,dx[thr_id],s0);
+	get_tree(thr_id,dx[thr_id],s0);
 
 	cudaStreamSynchronize(s0);
 
@@ -154,7 +145,7 @@ argon2_ctx_from_mtp(&context[thr_id], &instance[thr_id]);
 
 	std::copy(root.begin(), root.end(), TheMerkleRoot[thr_id]);
 
-	mtp_setBlockTarget_test(thr_id, endiandata, ptarget, &TheMerkleRoot[thr_id],s0);
+	mtp_setBlockTarget(thr_id, endiandata, ptarget, &TheMerkleRoot[thr_id],s0);
 
 	root.resize(0);
 }
@@ -185,7 +176,7 @@ argon2_ctx_from_mtp(&context[thr_id], &instance[thr_id]);
 			blockS nBlockMTP[MTP_L *2] = {0};
 			unsigned char nProofMTP[MTP_L * 3 * 353 ] = {0};
 
-			uint32_t is_sol = mtptcr_solver_test(thr_id,foundNonce, &instance[thr_id], nBlockMTP,nProofMTP, TheMerkleRoot[thr_id], mtpHashValue, *ordered_tree[thr_id], endiandata,TheUint256Target[0],s0);
+			uint32_t is_sol = mtptcr_solver(thr_id,foundNonce, &instance[thr_id], nBlockMTP,nProofMTP, TheMerkleRoot[thr_id], mtpHashValue, *ordered_tree[thr_id], endiandata,TheUint256Target[0],s0);
 
 			if (JobId[thr_id] != work->data[16] || XtraNonce2[thr_id] != ((uint64_t*)work->xnonce2)[0])
 				return 0; // if work has changed stop and go back to the initialization
@@ -318,19 +309,20 @@ extern "C" int scanhash_mtptcr_solo(int nthreads, int thr_id, struct work* work,
 
 		argon2_ctx_from_mtp(&context[thr_id], &instance[thr_id]);
 
-		mtp_fill_1c(thr_id, instance[thr_id].memory[0 + 0].v, 0 + 0);
-		mtp_fill_1c(thr_id, instance[thr_id].memory[0 + 1].v, 0 + 1);
-		mtp_fill_1c(thr_id, instance[thr_id].memory[2 + 0].v, 1048576 + 0);
-		mtp_fill_1c(thr_id, instance[thr_id].memory[2 + 1].v, 1048576 + 1);
-		mtp_fill_1c(thr_id, instance[thr_id].memory[4 + 0].v, 2097152 + 0);
-		mtp_fill_1c(thr_id, instance[thr_id].memory[4 + 1].v, 2097152 + 1);
-		mtp_fill_1c(thr_id, instance[thr_id].memory[6 + 0].v, 3145728 + 0);
-		mtp_fill_1c(thr_id, instance[thr_id].memory[6 + 1].v, 3145728 + 1);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[0 + 0].v, 0 + 0,s0);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[0 + 1].v, 0 + 1,s0);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[2 + 0].v, 1048576 + 0,s0);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[2 + 1].v, 1048576 + 1,s0);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[4 + 0].v, 2097152 + 0,s0);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[4 + 1].v, 2097152 + 1,s0);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[6 + 0].v, 3145728 + 0,s0);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[6 + 1].v, 3145728 + 1,s0);
 
 		mtp_i_cpu2(thr_id, instance[thr_id].block_header,s0);
 
-		get_tree(thr_id, dx[thr_id]);
+		get_tree(thr_id, dx[thr_id],s0);
 
+		cudaStreamSynchronize(s0);
 		//	printf("Step 2 : Compute the root Φ of the Merkle hash tree \n");
 		//  sleep(10);
 
@@ -342,7 +334,7 @@ extern "C" int scanhash_mtptcr_solo(int nthreads, int thr_id, struct work* work,
 
 		std::copy(root.begin(), root.end(), TheMerkleRoot[thr_id]);
 
-		mtp_setBlockTarget(thr_id, endiandata, ptarget, &TheMerkleRoot[thr_id]);
+		mtp_setBlockTarget(thr_id, endiandata, ptarget, &TheMerkleRoot[thr_id],s0);
 		root.resize(0);
 	}
 
@@ -371,7 +363,7 @@ extern "C" int scanhash_mtptcr_solo(int nthreads, int thr_id, struct work* work,
 			blockS nBlockMTP[MTP_L * 2] = { 0 };
 			unsigned char nProofMTP[MTP_L * 3 * 353] = { 0 };
 
-			uint32_t is_sol = mtptcr_solver(thr_id, foundNonce, &instance[thr_id], nBlockMTP, nProofMTP, TheMerkleRoot[thr_id], mtpHashValue, *ordered_tree[thr_id], endiandata, TheUint256Target[0]);
+			uint32_t is_sol = mtptcr_solver(thr_id, foundNonce, &instance[thr_id], nBlockMTP, nProofMTP, TheMerkleRoot[thr_id], mtpHashValue, *ordered_tree[thr_id], endiandata, TheUint256Target[0],s0);
 
 			if (is_sol == 1 /*&& fulltest(vhash64, ptarget)*/) {
 
diff --git a/cuda_mtp/mtp.cu b/cuda_mtp/mtp.cu
index 17d4bec..7be0335 100644
--- a/cuda_mtp/mtp.cu
+++ b/cuda_mtp/mtp.cu
@@ -10,20 +10,13 @@
 #define memcost 4*1024*1024
 
 extern void mtp_cpu_init(int thr_id, uint32_t threads);
-
-extern uint32_t mtp_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce);
-
-extern void mtp_setBlockTarget(int thr_id,const void* pDataIn, const void *pTargetIn, const void * zElement);
-extern void mtp_fill(uint32_t d, const uint64_t *Block, uint32_t offset, uint32_t datachunk);
-extern void mtp_fill_1b(int thr_id, uint64_t *Block, uint32_t block_nr);
+extern uint32_t mtp_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, cudaStream_t s0);
+extern void mtp_setBlockTarget(int thr_id,const void* pDataIn, const void *pTargetIn, const void * zElement, cudaStream_t s0);
 extern uint32_t get_tpb_mtp(int thr_id);
-
-extern void mtp_fill_1c(int thr_id, uint64_t *Block, uint32_t block_nr);
-
-//extern void mtp_i_cpu(int thr_id, uint32_t *block_header);
-extern void mtp_i_cpu(int thr_id, uint32_t *block_header, cudaStream_t s0);
+extern void mtp_fill_1c(int thr_id, uint64_t *Block, uint32_t block_nr, cudaStream_t s0);
 extern void mtp_i_cpu2(int thr_id, uint32_t *block_header, cudaStream_t s0);
-void get_tree(int thr_id, uint8_t* d);
+void get_tree(int thr_id, uint8_t* d, cudaStream_t s0);
+
 #define HASHLEN 32
 #define SALTLEN 16
 #define PWD "password" 
@@ -51,11 +44,11 @@ extern "C" int scanhash_mtp(int nthreads,int thr_id, struct work* work, uint32_t
 {
 
 	unsigned char mtpHashValue[32];
-	cudaStream_t s0;
+
 //if (JobId==0)
 //	pthread_barrier_init(&barrier, NULL, nthreads);
 
-
+	cudaStream_t s0;
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 
@@ -124,22 +117,20 @@ if (JobId[thr_id] != work->data[16] || XtraNonce2[thr_id] != ((uint64_t*)work->x
 context[thr_id] = init_argon2d_param((const char*)endiandata);
 
 argon2_ctx_from_mtp(&context[thr_id], &instance[thr_id]);
-	mtp_fill_1c(thr_id, instance[thr_id].memory[0 + 0].v, 0 + 0);
-	mtp_fill_1c(thr_id, instance[thr_id].memory[0 + 1].v, 0 + 1);
-	mtp_fill_1c(thr_id, instance[thr_id].memory[2 + 0].v, 1048576 + 0);
-	mtp_fill_1c(thr_id, instance[thr_id].memory[2 + 1].v, 1048576 + 1);
-	mtp_fill_1c(thr_id, instance[thr_id].memory[4 + 0].v, 2097152 + 0);
-	mtp_fill_1c(thr_id, instance[thr_id].memory[4 + 1].v, 2097152 + 1);
-	mtp_fill_1c(thr_id, instance[thr_id].memory[6 + 0].v, 3145728 + 0);
-	mtp_fill_1c(thr_id, instance[thr_id].memory[6 + 1].v, 3145728 + 1);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[0 + 0].v, 0 + 0,s0);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[0 + 1].v, 0 + 1,s0);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[2 + 0].v, 1048576 + 0,s0);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[2 + 1].v, 1048576 + 1,s0);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[4 + 0].v, 2097152 + 0,s0);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[4 + 1].v, 2097152 + 1,s0);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[6 + 0].v, 3145728 + 0,s0);
+	mtp_fill_1c(thr_id, instance[thr_id].memory[6 + 1].v, 3145728 + 1,s0);
 
 	mtp_i_cpu2(thr_id, instance[thr_id].block_header,s0);
 
+	get_tree(thr_id,dx[thr_id],s0);
 
-
-	get_tree(thr_id,dx[thr_id]);
-
-
+	cudaStreamSynchronize(s0);
 //  sleep(10);
 
 	ordered_tree[thr_id] = new MerkleTree(dx[thr_id], true);
@@ -150,7 +141,7 @@ argon2_ctx_from_mtp(&context[thr_id], &instance[thr_id]);
 
 	std::copy(root.begin(), root.end(), TheMerkleRoot[thr_id]);
 
-	mtp_setBlockTarget(thr_id, endiandata, ptarget, &TheMerkleRoot[thr_id]);
+	mtp_setBlockTarget(thr_id, endiandata, ptarget, &TheMerkleRoot[thr_id],s0);
 	root.resize(0);
 }
 
@@ -162,7 +153,7 @@ argon2_ctx_from_mtp(&context[thr_id], &instance[thr_id]);
 //		cudaProfilerStart();
 //		cudaProfilerStop();
 		*hashes_done = pdata[19] - first_nonce + throughput;
-		foundNonce = mtp_cpu_hash_32(thr_id, throughput, pdata[19]);
+		foundNonce = mtp_cpu_hash_32(thr_id, throughput, pdata[19],s0);
 //		cudaProfilerStop();
 		uint32_t _ALIGN(64) vhash64[8];
 		if (foundNonce != UINT32_MAX)
@@ -181,7 +172,7 @@ argon2_ctx_from_mtp(&context[thr_id], &instance[thr_id]);
 			blockS nBlockMTP[MTP_L *2] = {0};
 			unsigned char nProofMTP[MTP_L * 3 * 353 ] = {0};
 
-			uint32_t is_sol = mtp_solver(thr_id,foundNonce, &instance[thr_id], nBlockMTP,nProofMTP, TheMerkleRoot[thr_id], mtpHashValue, *ordered_tree[thr_id], endiandata,TheUint256Target[0]);
+			uint32_t is_sol = mtp_solver(thr_id,foundNonce, &instance[thr_id], nBlockMTP,nProofMTP, TheMerkleRoot[thr_id], mtpHashValue, *ordered_tree[thr_id], endiandata,TheUint256Target[0],s0);
 
 			if (is_sol==1 /*&& fulltest(vhash64, ptarget)*/) {
 
@@ -315,22 +306,22 @@ extern "C" int scanhash_mtp_solo(int nthreads, int thr_id, struct work* work, ui
 
 		argon2_ctx_from_mtp(&context[thr_id], &instance[thr_id]);
 
-		mtp_fill_1c(thr_id, instance[thr_id].memory[0 + 0].v, 0 + 0);
-		mtp_fill_1c(thr_id, instance[thr_id].memory[0 + 1].v, 0 + 1);
-		mtp_fill_1c(thr_id, instance[thr_id].memory[2 + 0].v, 1048576 + 0);
-		mtp_fill_1c(thr_id, instance[thr_id].memory[2 + 1].v, 1048576 + 1);
-		mtp_fill_1c(thr_id, instance[thr_id].memory[4 + 0].v, 2097152 + 0);
-		mtp_fill_1c(thr_id, instance[thr_id].memory[4 + 1].v, 2097152 + 1);
-		mtp_fill_1c(thr_id, instance[thr_id].memory[6 + 0].v, 3145728 + 0);
-		mtp_fill_1c(thr_id, instance[thr_id].memory[6 + 1].v, 3145728 + 1);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[0 + 0].v, 0 + 0,s0);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[0 + 1].v, 0 + 1,s0);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[2 + 0].v, 1048576 + 0,s0);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[2 + 1].v, 1048576 + 1,s0);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[4 + 0].v, 2097152 + 0,s0);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[4 + 1].v, 2097152 + 1,s0);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[6 + 0].v, 3145728 + 0,s0);
+		mtp_fill_1c(thr_id, instance[thr_id].memory[6 + 1].v, 3145728 + 1,s0);
 
 		mtp_i_cpu2(thr_id, instance[thr_id].block_header,s0);
 
-		get_tree(thr_id, dx[thr_id]);
+		get_tree(thr_id, dx[thr_id],s0);
 
 		//	printf("Step 2 : Compute the root Φ of the Merkle hash tree \n");
 		//  sleep(10);
-
+		cudaStreamSynchronize(s0);
 		ordered_tree[thr_id] = new MerkleTree(dx[thr_id], true);
 
 		JobId[thr_id] = work->data[17];
@@ -339,7 +330,7 @@ extern "C" int scanhash_mtp_solo(int nthreads, int thr_id, struct work* work, ui
 
 		std::copy(root.begin(), root.end(), TheMerkleRoot[thr_id]);
 
-		mtp_setBlockTarget(thr_id, endiandata, ptarget, &TheMerkleRoot[thr_id]);
+		mtp_setBlockTarget(thr_id, endiandata, ptarget, &TheMerkleRoot[thr_id],s0);
 		root.resize(0);
 	}
 
@@ -355,7 +346,7 @@ extern "C" int scanhash_mtp_solo(int nthreads, int thr_id, struct work* work, ui
 		uint32_t foundNonce;
 
 		*hashes_done = pdata[19] - first_nonce + throughput;
-		foundNonce = mtp_cpu_hash_32(thr_id, throughput, pdata[19]);
+		foundNonce = mtp_cpu_hash_32(thr_id, throughput, pdata[19],s0);
 		
 		uint32_t _ALIGN(64) vhash64[8];
 		if (foundNonce != UINT32_MAX)
@@ -368,7 +359,7 @@ extern "C" int scanhash_mtp_solo(int nthreads, int thr_id, struct work* work, ui
 			blockS nBlockMTP[MTP_L * 2] = { 0 };
 			unsigned char nProofMTP[MTP_L * 3 * 353] = { 0 };
 
-			uint32_t is_sol = mtp_solver(thr_id, foundNonce, &instance[thr_id], nBlockMTP, nProofMTP, TheMerkleRoot[thr_id], mtpHashValue, *ordered_tree[thr_id], endiandata, TheUint256Target[0]);
+			uint32_t is_sol = mtp_solver(thr_id, foundNonce, &instance[thr_id], nBlockMTP, nProofMTP, TheMerkleRoot[thr_id], mtpHashValue, *ordered_tree[thr_id], endiandata, TheUint256Target[0],s0);
 
 			if (is_sol == 1 /*&& fulltest(vhash64, ptarget)*/) {
 
diff --git a/merkletree/mtp.cpp b/merkletree/mtp.cpp
index 64113e0..30b53a9 100644
--- a/merkletree/mtp.cpp
+++ b/merkletree/mtp.cpp
@@ -697,7 +697,7 @@ MerkleTree TheTree,uint32_t* input, uint256 hashTarget) {
 
 
 
-int mtp_solver(int thr_id, uint32_t TheNonce, argon2_instance_t *instance,
+int mtp_solver_old(int thr_id, uint32_t TheNonce, argon2_instance_t *instance,
 	blockS *nBlockMTP /*[72 * 2][128]*/, unsigned char* nProofMTP, unsigned char* resultMerkleRoot, unsigned char* mtpHashValue,
 	MerkleTree TheTree, uint32_t* input, uint256 hashTarget) {
 
@@ -895,7 +895,7 @@ int mtp_solver(int thr_id, uint32_t TheNonce, argon2_instance_t *instance,
 
 
 
-int mtptcr_solver(int thr_id, uint32_t TheNonce, argon2_instance_t *instance,
+int mtptcr_solver_old(int thr_id, uint32_t TheNonce, argon2_instance_t *instance,
 	blockS *nBlockMTP /*[72 * 2][128]*/, unsigned char* nProofMTP, unsigned char* resultMerkleRoot, unsigned char* mtpHashValue,
 	MerkleTree TheTree, uint32_t* input, uint256 hashTarget) {
 
@@ -1090,10 +1090,203 @@ int mtptcr_solver(int thr_id, uint32_t TheNonce, argon2_instance_t *instance,
 	return 0;
 }
 
+int mtp_solver(int thr_id, uint32_t TheNonce, argon2_instance_t *instance,
+	blockS *nBlockMTP /*[72 * 2][128]*/, unsigned char* nProofMTP, unsigned char* resultMerkleRoot, unsigned char* mtpHashValue,
+	MerkleTree TheTree, uint32_t* input, uint256 hashTarget, cudaStream_t s0) {
+	// Builds the proof data for TheNonce: nBlockMTP entries are handed to get_block_test (presumably filled from GPU memory - confirm), Merkle proofs are copied into nProofMTP, and on success Y[L] is copied to mtpHashValue.
+	const uint8_t L = 64;
+	// Returns 1 when Y[L] <= hashTarget; returns 0 when instance is NULL, when the chain selects an excluded index, or when Y[L] > hashTarget.
+	if (instance != NULL) {
+		//		input[19]=0x01000000;
+		uint256 Y[L + 1];
+		//		std::string proof_blocks[L * 3];
+		memset(&Y, 0, sizeof(Y));
+		// Y[0] is the 32-byte ablake2b digest of: 80 bytes of input, 16 bytes of resultMerkleRoot, then the nonce.
+		ablake2b_state BlakeHash;
+		ablake2b_init(&BlakeHash, 32);
+
+
+
+
+		ablake2b_update(&BlakeHash, (unsigned char*)&input[0], 80);
+		ablake2b_update(&BlakeHash, (unsigned char*)&resultMerkleRoot[0], 16);
+		ablake2b_update(&BlakeHash, &TheNonce, sizeof(unsigned int));
+		ablake2b_final(&BlakeHash, (unsigned char*)&Y[0], 32);
+
+
+
+		// Walk the L-step chain: the first 32-bit word of Y[j-1] selects block ij, whose bytes are hashed together with Y[j-1] into Y[j].
+
+		///////////////////////////////
+		bool init_blocks = false;
+		// init_blocks is set when a selected index ij satisfies ij % (m_cost / lanes) < 2;
+		// the loop then stops early and the nonce is rejected (return 0 after the loop).
+		// NOTE(review): presumably these are the per-lane initial blocks - confirm.
+		for (uint8_t j = 1; j <= L; j++) {
+
+			uint32_t ij = (((uint32_t*)(&Y[j - 1]))[0]) % (instance->context_ptr->m_cost);
+			uint32_t except_index = (uint32_t)(instance->context_ptr->m_cost / instance->context_ptr->lanes);
+			if (ij %except_index == 0 || ij%except_index == 1) {
+				init_blocks = true;
+				break;
+			}
+
+			uint32_t prev_index;
+			uint32_t ref_index;
+			getblockindex_test(thr_id, ij, instance, &prev_index, &ref_index,s0);
+
+			//			copy_blockS(&nBlockMTP[j * 2 - 2], &instance->memory[prev_index]);
+			get_block_test(thr_id, /*(uint8_t*)*/nBlockMTP[j * 2 - 2].v, prev_index,s0);
+			//ref block
+			//			copy_blockS(&nBlockMTP[j * 2 - 1], &instance->memory[ref_index]);
+			get_block_test(thr_id, /*(uint8_t*)*/nBlockMTP[j * 2 - 1].v, ref_index,s0);
+			block blockhash;
+			uint8_t blockhash_bytes[ARGON2_BLOCK_SIZE];
+			//			copy_block(&blockhash, &instance->memory[ij]);
+			get_block_test(thr_id, /*(uint8_t*)*/&blockhash.v, ij,s0);
+
 
+			store_block(&blockhash_bytes, &blockhash);
 
+			ablake2b_state BlakeHash2;
+			ablake2b_init(&BlakeHash2, 32);
+			ablake2b_update(&BlakeHash2, &Y[j - 1], sizeof(uint256));
+			ablake2b_update(&BlakeHash2, blockhash_bytes, ARGON2_BLOCK_SIZE);
+			ablake2b_final(&BlakeHash2, (unsigned char*)&Y[j], 32);
+			////////////////////////////////////////////////////////////////
+			// current block
+			clear_internal_memory(blockhash.v, ARGON2_BLOCK_SIZE);
+			clear_internal_memory(blockhash_bytes, ARGON2_BLOCK_SIZE);
 
-int mtptcr_solver_test(int thr_id, uint32_t TheNonce, argon2_instance_t *instance,
+			// Leaf digest (ablake2b4rounds over the stored block bytes) and ordered proof for block ij, written to proof slot j*3-3.
+			block blockhash_curr;
+			uint8_t blockhash_curr_bytes[ARGON2_BLOCK_SIZE];
+			//			copy_block(&blockhash_curr, &instance->memory[ij]);
+			get_block_test(thr_id, /*(uint8_t*)*/&blockhash_curr.v, ij,s0);
+			store_block(&blockhash_curr_bytes, &blockhash_curr);
+			ablake2b_state state_curr;
+			ablake2b_init(&state_curr, MERKLE_TREE_ELEMENT_SIZE_B);
+			ablake2b4rounds_update(&state_curr, blockhash_curr_bytes, ARGON2_BLOCK_SIZE);
+			uint8_t digest_curr[MERKLE_TREE_ELEMENT_SIZE_B];
+			ablake2b4rounds_final(&state_curr, digest_curr, sizeof(digest_curr));
+			MerkleTree::Buffer hash_curr = MerkleTree::Buffer(digest_curr, digest_curr + sizeof(digest_curr));
+			clear_internal_memory(blockhash_curr.v, ARGON2_BLOCK_SIZE);
+			clear_internal_memory(blockhash_curr_bytes, ARGON2_BLOCK_SIZE);
+
+
+			std::deque<std::vector<uint8_t>> zProofMTP = TheTree.getProofOrdered(hash_curr, ij + 1);
+
+			nProofMTP[(j * 3 - 3) * 353] = (unsigned char)(zProofMTP.size());
+
+			int k1 = 0;
+			for (const std::vector<uint8_t> &mtpData : zProofMTP) {
+				std::copy(mtpData.begin(), mtpData.end(), nProofMTP + ((j * 3 - 3) * 353 + 1 + k1 * mtpData.size()));
+				k1++;
+			}
+
+			//prev proof
+			// Same leaf digest and ordered proof for the block at prev_index, written to proof slot j*3-2.
+			block blockhash_prev;
+			uint8_t blockhash_prev_bytes[ARGON2_BLOCK_SIZE];
+			//			copy_block(&blockhash_prev, &instance->memory[prev_index]);
+			get_block_test(thr_id, /*(uint8_t*)*/&blockhash_prev.v, prev_index,s0);
+			store_block(&blockhash_prev_bytes, &blockhash_prev);
+			ablake2b_state state_prev;
+			ablake2b_init(&state_prev, MERKLE_TREE_ELEMENT_SIZE_B);
+			ablake2b4rounds_update(&state_prev, blockhash_prev_bytes, ARGON2_BLOCK_SIZE);
+			uint8_t digest_prev[MERKLE_TREE_ELEMENT_SIZE_B];
+
+
+			ablake2b4rounds_final(&state_prev, digest_prev, sizeof(digest_prev));
+
+
+			MerkleTree::Buffer hash_prev = MerkleTree::Buffer(digest_prev, digest_prev + sizeof(digest_prev));
+			clear_internal_memory(blockhash_prev.v, ARGON2_BLOCK_SIZE);
+			clear_internal_memory(blockhash_prev_bytes, ARGON2_BLOCK_SIZE);
+
+			std::deque<std::vector<uint8_t>> zProofMTP2 = TheTree.getProofOrdered(hash_prev, prev_index + 1);
+
+			nProofMTP[(j * 3 - 2) * 353] = (unsigned char)(zProofMTP2.size());
+
+			int k2 = 0;
+			for (const std::vector<uint8_t> &mtpData : zProofMTP2) {
+				std::copy(mtpData.begin(), mtpData.end(), nProofMTP + ((j * 3 - 2) * 353 + 1 + k2 * mtpData.size()));
+				k2++;
+			}
+
+
+			//ref proof
+			// Same leaf digest and ordered proof for the block at ref_index, written to proof slot j*3-1.
+			block blockhash_ref;
+			uint8_t blockhash_ref_bytes[ARGON2_BLOCK_SIZE];
+			//			copy_block(&blockhash_ref, &instance->memory[ref_index]);
+			get_block_test(thr_id, /*(uint8_t*)*/&blockhash_ref.v, ref_index,s0);
+			store_block(&blockhash_ref_bytes, &blockhash_ref);
+			ablake2b_state state_ref;
+			ablake2b_init(&state_ref, MERKLE_TREE_ELEMENT_SIZE_B);
+			ablake2b4rounds_update(&state_ref, blockhash_ref_bytes, ARGON2_BLOCK_SIZE);
+			uint8_t digest_ref[MERKLE_TREE_ELEMENT_SIZE_B];
+			ablake2b4rounds_final(&state_ref, digest_ref, sizeof(digest_ref));
+			MerkleTree::Buffer hash_ref = MerkleTree::Buffer(digest_ref, digest_ref + sizeof(digest_ref));
+			clear_internal_memory(blockhash_ref.v, ARGON2_BLOCK_SIZE);
+			clear_internal_memory(blockhash_ref_bytes, ARGON2_BLOCK_SIZE);
+
+			std::deque<std::vector<uint8_t>> zProofMTP3 = TheTree.getProofOrdered(hash_ref, ref_index + 1);
+
+			nProofMTP[(j * 3 - 1) * 353] = (unsigned char)(zProofMTP3.size());
+
+			int k3 = 0;
+			for (const std::vector<uint8_t> &mtpData : zProofMTP3) {
+				std::copy(mtpData.begin(), mtpData.end(), nProofMTP + ((j * 3 - 1) * 353 + 1 + k3 * mtpData.size()));
+				k3++;
+			}
+
+
+			/////////////////////////////////////////////////////////////////////
+			/////////////////////////////////////////////////////////////////////
+		}
+
+		if (init_blocks) {
+
+			return 0;
+		}
+
+
+		// Compare the end of the chain, Y[L], against the target.
+
+		if (Y[L] > hashTarget) {
+			// Recomputed hash is above the target: report the rejected nonce together with the hash that was actually compared.
+			printf("False positive. Nonce=%08x Hash:", TheNonce);
+			for (int n = 0; n < 32; n++) {
+				printf("%02x", ((unsigned char*)&Y[L])[n]);
+			}
+			printf("\n");
+		}
+		else {
+			for (int i = 0; i<32; i++)
+				mtpHashValue[i] = (((unsigned char*)(&Y[L]))[i]);
+
+			// Found a solution
+			/*
+			printf("Found a solution. Nonce=%08x Hash:", TheNonce);
+			for (int n = 0; n < 32; n++) {
+			printf("%02x", ((unsigned char*)&Y[L])[n]);
+			}
+			printf("\n");
+			*/
+			return 1;
+
+
+		}
+
+	}
+
+
+	return 0;
+}
+
+
+int mtptcr_solver(int thr_id, uint32_t TheNonce, argon2_instance_t *instance,
 	blockS *nBlockMTP /*[72 * 2][128]*/, unsigned char* nProofMTP, unsigned char* resultMerkleRoot, unsigned char* mtpHashValue,
 	MerkleTree TheTree, uint32_t* input, uint256 hashTarget,cudaStream_t s0 ) {
 
diff --git a/merkletree/mtp.h b/merkletree/mtp.h
index fe62ff2..b7a5027 100644
--- a/merkletree/mtp.h
+++ b/merkletree/mtp.h
@@ -69,17 +69,17 @@ int mtp_solver_orig(uint32_t TheNonce, argon2_instance_t *instance,
 
 int mtp_solver(int thr_id, uint32_t TheNonce, argon2_instance_t *instance,
 	blockS *nBlockMTP /*[72 * 2][128]*/, unsigned char *nProofMTP, unsigned char* resultMerkleRoot, unsigned char* mtpHashValue,
-	MerkleTree TheTree, uint32_t* input, uint256 hashTarget);
+	MerkleTree TheTree, uint32_t* input, uint256 hashTarget,cudaStream_t s0);
 
 //int mtp_solver_test(int thr_id, uint32_t TheNonce, argon2_instance_t *instance,
 //	blockS *nBlockMTP /*[72 * 2][128]*/, unsigned char *nProofMTP, unsigned char* resultMerkleRoot, unsigned char* mtpHashValue,
 //	MerkleTree TheTree, uint32_t* input, uint256 hashTarget);
 
-int mtptcr_solver(int thr_id, uint32_t TheNonce, argon2_instance_t *instance,
+int mtptcr_solver_old(int thr_id, uint32_t TheNonce, argon2_instance_t *instance,
 	blockS *nBlockMTP /*[72 * 2][128]*/, unsigned char* nProofMTP, unsigned char* resultMerkleRoot, unsigned char* mtpHashValue,
 	MerkleTree TheTree, uint32_t* input, uint256 hashTarget);
 
-int mtptcr_solver_test(int thr_id, uint32_t TheNonce, argon2_instance_t *instance,
+int mtptcr_solver(int thr_id, uint32_t TheNonce, argon2_instance_t *instance,
 	blockS *nBlockMTP /*[72 * 2][128]*/, unsigned char* nProofMTP, unsigned char* resultMerkleRoot, unsigned char* mtpHashValue,
 	MerkleTree TheTree, uint32_t* input, uint256 hashTarget,cudaStream_t s0);
 
diff --git a/run-tcr.cmd b/run-tcr.cmd
index 4a5df59..ec2483f 100644
--- a/run-tcr.cmd
+++ b/run-tcr.cmd
@@ -1,8 +1,9 @@
 rem solo mining:
 rem x64\Release\ccminer -a mtp-tcr -o  http://192.168.0.171:8382   -u djm34 -p password --coinbase-addr TPkxM1Aw872FL9gs4udCDzy5hAG7M7sVSE --no-getwork --no-stratum --quiet -i 20
 rem pool mining:
-   x64\Release\ccminer -a mtp-tcr -o stratum+tcp://pool.tecracoin.io:4556 -u TPkxM1Aw872FL9gs4udCDzy5hAG7M7sVSE -p 0,minpayout=0.6  -i 26 
+   x64\Release\ccminer -a mtp-tcr -o stratum+tcp://pool.tecracoin.io:4556 -u TPkxM1Aw872FL9gs4udCDzy5hAG7M7sVSE -p 0,minpayout=0.6  -i 26
 rem   x64\Release\ccminer -a mtp-tcr -o stratum+tcp://dev.pool.tecracoin.io:4557 -u GJVkKsPFdBsavo5wn5WGiCSScgFZE2F27C  -p 0,d=0.128,minpayout=0.6 --cpu-affinity 2
+