diff --git a/kernel.cu b/kernel.cu
index 532bc24..836379f 100644
--- a/kernel.cu
+++ b/kernel.cu
@@ -34,29 +34,10 @@
 #endif
 #define ROTR64(x, n) (((x) >> (n)) | ((x) << (64 - (n))))
 #define ROTR(x,n) ROTR64(x,n)
-#define ROTL64(x, n) (((x) << (n)) | ((x) >> (64 - (n))))
-
-#define cuda_swab64(x) \
-    ((uint64_t)((((uint64_t)(x) & 0xff00000000000000ULL) >> 56) | \
-    (((uint64_t)(x) & 0x00ff000000000000ULL) >> 40) | \
-    (((uint64_t)(x) & 0x0000ff0000000000ULL) >> 24) | \
-    (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \
-    (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \
-    (((uint64_t)(x) & 0x0000000000ff0000ULL) << 24) | \
-    (((uint64_t)(x) & 0x000000000000ff00ULL) << 40) | \
-    (((uint64_t)(x) & 0x00000000000000ffULL) << 56)))
-__device__ __forceinline__
-uint64_t SWAPDWORDS(uint64_t value)
-{
-#if __CUDA_ARCH__ >= 320
-    uint2 temp;
-    asm("mov.b64 {%0, %1}, %2; ": "=r"(temp.x), "=r"(temp.y) : "l"(value));
-    asm("mov.b64 %0, {%1, %2}; ": "=l"(value) : "r"(temp.y), "r"(temp.x));
-    return value;
-#else
-    return ROTL64(value, 32);
-#endif
-}
+
+
+__constant__ static uint64_t __align__(8) c_512[16];
+__constant__ static uint64_t __align__(8) c_vblake[8];
 
 #define B2B_G(v,a,b,c,d,x,y,c1,c2) { \
     v[a] = v[a] + v[b] + (x ^ c1); \
@@ -100,8 +81,7 @@ static const uint8_t c_sigma_big[16][16] = {
     { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
 };
 
-__device__ __constant__
-static const uint64_t c_u512[16] =
+static const uint64_t cpu_u512[16] =
 {
     0xA51B6A89D489E800ULL, 0xD35B2E0E0B723800ULL,
     0xA47B39A2AE9F9000ULL, 0x0C0EFA33E77E6488ULL,
@@ -113,38 +93,48 @@ static const uint64_t c_u512[16] = {
     0xD859E6F081AAE000ULL, 0x63D980597B560E6BULL
 };
 
-__device__ __constant__
-static const uint64_t vBlake_iv[8] = {
+static const uint64_t cpu_vBlake_iv[8] = {
     0x4BBF42C1F006AD9Dull, 0x5D11A8C3B5AEB12Eull,
     0xA64AB78DC2774652ull, 0xC67595724658F253ull,
     0xB8864E79CB891E56ull, 0x12ED593E29FB41A1ull,
     0xB1DA3AB63C60BAA8ull, 0x6D20E50C1F954DEDull
 };
+
 __device__
-void vblake512_compress(uint64_t *h, const uint64_t *block, const uint8_t((*sigma)[16]), const uint64_t *u512)
+uint64_t vBlake2(const uint64_t h0, const uint64_t h1, const uint64_t h2, const uint64_t h3, const uint64_t h4, const uint64_t h5, const uint64_t h6, const uint64_t h7, const uint64_t* u512, const uint64_t* s_vBlake, const uint8_t((*sigma)[16]))
 {
-    uint64_t v[16];
-    uint64_t m[16];
+    uint64_t h[8];
+    uint64_t v[16];
+    uint64_t m[16] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 };
 
-    //#pragma unroll 8
-    for (int i = 0; i < 8; i++) {
-        v[i] = h[i];
-        v[i + 8] = vBlake_iv[i];
-    }
+    h[0] = v[8] = s_vBlake[0];
+
+    h[0] ^= (uint64_t)(0x01010000 ^ 0x18);
+    v[0] = h[0];
+    v[9] = v[1] = s_vBlake[1];
+    v[10] = v[2] = s_vBlake[2];
+    h[3] = v[11] = v[3] = s_vBlake[3];
+    v[12] = v[4] = s_vBlake[4];
+    v[13] = v[5] = s_vBlake[5];
+    h[6] = v[14] = v[6] = s_vBlake[6];
+    v[15] = v[7] = s_vBlake[7];
+
+
+    m[0] = h0;
+    m[1] = h1;
+    m[2] = h2;
+    m[3] = h3;
+    m[4] = h4;
+    m[5] = h5;
+    m[6] = h6;
+    m[7] = h7;
+    //vblake512_compress(h, b, c_sigma_big, s_u512);
     v[12] ^= 64;
-    //v[13] ^= 0;
     v[14] ^= (uint64_t)(0xffffffffffffffffull);// (long)(-1);
-    //v[15] ^= 0;
-//#pragma unroll 8
-    for (int i = 0; i < 8; i++) {
-        m[i] = block[i]; // cuda_swab64(block[i]); orORINGNAL BLAKE
-    }
-
-
-    //#pragma unroll 16
+    #pragma unroll 16
     for (int i = 0; i < 16; i++) {
         B2B_G(v, 0, 4, 8, 12, m[sigma[i][1]], m[sigma[i][0]], u512[sigma[i][1]], u512[sigma[i][0]]);
@@ -172,43 +162,12 @@ void vblake512_compress(uint64_t *h, const uint64_t *block, const uint8_t((*sigm
     }
 
     h[0] ^= v[0] ^ v[8];
-//  h[1] ^= v[1] ^ v[9];
-//  h[2] ^= v[2] ^ v[10];
+    h[3] ^= v[3] ^ v[11];
-//  h[4] ^= v[4] ^ v[12];
-//  h[5] ^= v[5] ^ v[13];
+    h[6] ^= v[6] ^ v[14];
-//  h[7] ^= v[7] ^ v[15];
 
     h[0] ^= h[3] ^ h[6]; //copied from the java
-    //h[1] ^= h[4] ^ h[7];
-    //h[2] ^= h[5];
-}
-__device__ __forceinline__
-uint64_t vBlake2(const uint64_t h0, const uint64_t h1, const uint64_t h2, const uint64_t h3, const uint64_t h4, const uint64_t h5, const uint64_t h6, const uint64_t h7)
-{
-    uint64_t b[8];
-    uint64_t h[8];
-
-    for (int i = 0; i < 8; i++) {
-        h[i] = vBlake_iv[i];
-    }
-    h[0] ^= (uint64_t)(0x01010000 ^ 0x18);
-
-    b[0] = h0;
-    b[1] = h1;
-    b[2] = h2;
-    b[3] = h3;
-    b[4] = h4;
-    b[5] = h5;
-    b[6] = h6;
-    b[7] = h7;
-
-    vblake512_compress(h, b, c_sigma_big, c_u512);
-
-    //for (int i = 0; i < 8; i++) {
-    //  b[0] = cuda_swab64(h[0]);
-    //}
     return h[0];
 }
 
@@ -216,11 +175,11 @@ uint64_t vBlake2(const uint64_t h0, const uint64_t h1, const uint64_t h2, const
 #if CPU_SHARES
 #define WORK_PER_THREAD 256
 #else
-#define WORK_PER_THREAD 256
+#define WORK_PER_THREAD 1
 #endif
 
 #if HIGH_RESOURCE
-#define DEFAULT_BLOCKSIZE 512
+#define DEFAULT_BLOCKSIZE 0x80000
 #define DEFAULT_THREADS_PER_BLOCK 1024
 #else
 #define DEFAULT_BLOCKSIZE 512
@@ -235,30 +194,25 @@ bool verboseOutput = false;
 
 /*
 * Kernel function to search a range of nonces for a solution falling under the macro-configured difficulty (CPU=2^24, GPU=2^32).
 */
-__launch_bounds__(256, 2)
+//__launch_bounds__(256, 2)
 __global__ void vblakeHasher(uint32_t *nonceStart, uint32_t *nonceOut, uint64_t *hashStartOut, uint64_t const *headerIn)
 {
     // Generate a unique starting nonce for each thread that doesn't overlap with the work of any other thread
-    const uint32_t workStart = ((blockDim.x * blockIdx.x + threadIdx.x) * WORK_PER_THREAD) + nonceStart[0];
-
+    const uint32_t workStart = ((blockDim.x * blockIdx.x + threadIdx.x)) + nonceStart[0];
+    __shared__ uint64_t s_u512[16],s_vblake[8];
+
+    if (threadIdx.x < 16U) s_u512[threadIdx.x] = c_512[threadIdx.x];
+    if (threadIdx.x < 8U) s_vblake[threadIdx.x] = c_vblake[threadIdx.x];
     uint64_t nonceHeaderSection = headerIn[7];
-
-    // Run the hash WORK_PER_THREAD times
-    for (unsigned int nonce = workStart; nonce < workStart + WORK_PER_THREAD; nonce++) {
+    unsigned int nonce = workStart;
+    //for (unsigned int nonce = workStart; nonce < workStart + WORK_PER_THREAD; nonce++) {
        // Zero out nonce position and write new nonce to last 32 bits of prototype header
        nonceHeaderSection &= 0x00000000FFFFFFFFu;
        nonceHeaderSection |= (((uint64_t)nonce) << 32);
-       uint64_t hashStart = vBlake2(headerIn[0], headerIn[1], headerIn[2], headerIn[3], headerIn[4], headerIn[5], headerIn[6], nonceHeaderSection);
+       uint64_t hashStart = vBlake2(headerIn[0], headerIn[1], headerIn[2], headerIn[3], headerIn[4], headerIn[5], headerIn[6], nonceHeaderSection, s_u512, s_vblake, c_sigma_big);
 
-       if ((hashStart &
-
-#if CPU_SHARES
-           0x0000000000FFFFFFu // 2^24 difficulty
-#else
-           0x00000000FFFFFFFFu // 2^32 difficulty
-#endif
-           ) == 0) {
+       if ((hashStart & 0x00000000FFFFFFFFu) == 0) {
           // Check that found solution is better than existing solution if one has already been found on this run of the kernel (always send back highest-quality work)
          if (hashStartOut[0] > hashStart || hashStartOut[0] == 0) {
              nonceOut[0] = nonce;
@@ -266,9 +220,9 @@ __global__ void vblakeHasher(uint32_t *nonceStart, uint32_t *nonceOut, uint64_t
          }
          // exit loop early
-         nonce = workStart + WORK_PER_THREAD;
+         //nonce = workStart + WORK_PER_THREAD;
       }
-    }
+    //}
 }
 
 void promptExit(int exitCode)
 {
@@ -987,6 +941,9 @@ cudaError_t grindNonces(uint32_t *nonceResult, uint64_t *hashStart, const uint64
        goto Error;
    }
 
+   cudaMemcpyToSymbol(c_512, cpu_u512, sizeof(cpu_u512), 0, cudaMemcpyHostToDevice);
+   cudaMemcpyToSymbol(c_vblake, cpu_vBlake_iv, sizeof(cpu_vBlake_iv), 0, cudaMemcpyHostToDevice);
+
    // Launch a kernel on the GPU with one thread for each element.
    vblakeHasher << < blocksize, threadsPerBlock >> >(dev_nonceStart, dev_nonceResult, dev_hashStart, dev_header);
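
Note on the new shared-memory staging in vblakeHasher: only threads 0-15 of a block copy c_512 into s_u512, and only threads 0-7 copy c_vblake into s_vblake, yet every thread then reads both tables inside vBlake2(). As the hunk stands there is no barrier between those writes and the first read, so with blocks larger than one warp other warps can call vBlake2() before the tables are populated. A minimal sketch of the kernel prologue with a __syncthreads() added (identifiers taken from the patch; the rest of the kernel is unchanged):

__global__ void vblakeHasher(uint32_t *nonceStart, uint32_t *nonceOut, uint64_t *hashStartOut, uint64_t const *headerIn)
{
    __shared__ uint64_t s_u512[16], s_vblake[8];

    // The first 16 / 8 threads of the block stage the constant tables into shared memory.
    if (threadIdx.x < 16U) s_u512[threadIdx.x] = c_512[threadIdx.x];
    if (threadIdx.x < 8U)  s_vblake[threadIdx.x] = c_vblake[threadIdx.x];

    // Barrier: no thread may call vBlake2() until both tables are fully written.
    __syncthreads();

    // ... nonce setup and hashing continue exactly as in the patch ...
}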
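
The two cudaMemcpyToSymbol calls added to grindNonces are not error-checked, while the context line just above them shows the surrounding code jumping to the Error label on failure. A hedged sketch of the same uploads with return codes checked; the cudaStatus variable is assumed to be the cudaError_t already used elsewhere in grindNonces and is not visible in this hunk:

    // Upload the vBlake round constants and IV into __constant__ memory before the launch.
    cudaStatus = cudaMemcpyToSymbol(c_512, cpu_u512, sizeof(cpu_u512), 0, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpyToSymbol(c_512) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }
    cudaStatus = cudaMemcpyToSymbol(c_vblake, cpu_vBlake_iv, sizeof(cpu_vBlake_iv), 0, cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpyToSymbol(c_vblake) failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

Since cpu_u512 and cpu_vBlake_iv never change at runtime, the upload could also be hoisted out of grindNonces and performed once at startup rather than on every grind.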