diff --git a/CMakeLists.txt b/CMakeLists.txt
index f1fb146e5..79a0fab34 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -542,10 +542,19 @@ target_link_libraries(xmr-stak-rx ${MHTD} ${LIBS} xmr-stak-rx-backend)
 # Install
 ################################################################################
 
+# install booster script
+if(NOT WIN32)
+    install(DIRECTORY "${CMAKE_SOURCE_DIR}/scripts/" DESTINATION ${CMAKE_INSTALL_PREFIX}/${LIBRARY_OUTPUT_PATH}
+        FILES_MATCHING PATTERN "*.sh"
+        PERMISSIONS OWNER_EXECUTE OWNER_READ OWNER_WRITE GROUP_READ GROUP_EXECUTE
+        PATTERN .git EXCLUDE
+    )
+endif()
+
 # do not install the binary if the project and install are equal
 if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR )
     install(TARGETS xmr-stak-rx
-            RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/${EXECUTABLE_OUTPUT_PATH}")
+        RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/${EXECUTABLE_OUTPUT_PATH}")
     if(CUDA_FOUND)
         if(WIN32)
             install(TARGETS xmrstakrx_cuda_backend
diff --git a/doc/tuning.md b/doc/tuning.md
index 19427b4b2..d7151a61f 100644
--- a/doc/tuning.md
+++ b/doc/tuning.md
@@ -2,6 +2,7 @@
 
 ## Content Overview
 * [Fast Startup](#fast-startup)
+* [Booster Script](#booster-script)
 * [Huge Page Support](#huge-page-support)
 * [Benchmark](#benchmark)
 * [Windows](#windows)
@@ -24,9 +25,19 @@
 ## Fast Startup
 You can disable the miner self test performed on each miner start by adding the command line option `--noTest`
 
+## Booster Script
+
+The linux booster script manipulates the CPU caching behavior and activates huge pages.
+
+Call `sudo ./randomx_booster.sh`. 
+
+The booster script will try to install all dependencies if need. 
+If the script can not install the dependencies (e.g. unknown systems) please install the tools `wrmsr` and `numactl`.
+
 ## Huge Page Support
 
 In Linux you can enable 2 MiB huge pages with the following command.
+In linux you can use our [booster script](#booster-script) to active huge pages
 
 ```
 sudo sysctl -w vm.nr_hugepages=1300
diff --git a/scripts/randomx_booster.sh b/scripts/randomx_booster.sh
new file mode 100755
index 000000000..f36f38c19
--- /dev/null
+++ b/scripts/randomx_booster.sh
@@ -0,0 +1,111 @@
+#!/usr/bin/env bash
+# based on xmrig's randomx_boost.sh script
+# lifted by psychocrypt
+
+function help()
+{
+    echo "$(basename $0) modifies caching behaviors of your CPU"
+    echo "and activates huge pages."
+    echo "Reboot your system to revert the changes."
+    echo ""
+    echo "must be called with administrative privileges e.g. 'sudo $(basename $0)'"
+}
+
+if [ $# -ge 1 ] ; then
+    help
+    exit 1
+fi
+
+hasAptGet=$(which apt-get >/dev/null && { echo 1; } || { echo 0; })
+hasApt=$(which apt >/dev/null && { echo 1; } || { echo 0; })
+hasYum=$(which yum >/dev/null && { echo 1; } || { echo 0; })
+
+tools=$(which wrmsr >/dev/null || { echo "msr-tools "; })$(which numactl >/dev/null || { echo " numactl"; })
+
+if [ -n "$tools" ] ; then
+    echo "tool '$tools' not found, $(basename $0) is trying to install the dependency"
+    if [ $hasAptGet -eq 1 ] ; then
+        comm="apt-get --no-install-recommends --yes install $tools"
+        echo "execute: $comm"
+        $comm
+    elif [ $hasApt -eq 1 ] ; then
+        comm="apt-get --no-install-recommends --yes install $tools"
+        echo "execute: $comm"
+        $comm
+    elif [ $hasYum -eq 1 ] ; then
+        comm="yum install -y $tools"
+        echo "execute: $comm"
+        $comm
+    else
+        echo "package manager unknown, please install '$tools' by hand" >&2
+        exit 1
+    fi
+fi
+
+hasWrmsr=$(which wrmsr >/dev/null && { echo 1; } || { echo 0; })
+if [ $hasWrmsr -eq 0 ] ; then
+    echo "dependency 'wrmsr' not found, tool failed" >&2
+exit 1
+fi
+
+hasNumactl=$(which numactl >/dev/null && { echo 1; } || { echo 0; })
+if [ $hasNumactl -eq 0 ] ; then
+    echo "dependency 'numactl' not found, tool failed" >&2
+    exit 1
+fi
+
+echo ""
+modprobe msr
+
+if cat /proc/cpuinfo | grep -q "AMD Ryzen" ; then
+    echo "Detected Ryzen"
+    wrmsr -a 0xc0011022 0x510000
+    wrmsr -a 0xc001102b 0x1808cc16
+    wrmsr -a 0xc0011020 0
+    echo "MSR register values for Ryzen applied"
+    echo "WARNING: MSR register changes can result into stability issues!"
+    echo "Reboot your system to revert the changes."
+elif cat /proc/cpuinfo | grep -q "Intel" ; then
+    echo "Detected Intel"
+    wrmsr -a 0x1a4 7
+    echo "MSR register values for Intel applied"
+    echo "WARNING: MSR register changes can result into stability issues!"
+    echo "Reboot your system to revert the changes."
+else
+    echo "No supported CPU detected"
+fi
+
+echo ""
+
+### begin enable huge pages
+required_num_huge_pages=1280
+num_huge_pages=$(cat /proc/meminfo | grep "HugePages_Free" | sed 's/ \{2,\}/ /g' | cut -d" " -f2)
+
+if [ $num_huge_pages -lt $required_num_huge_pages ] ; then
+    echo "active 2 MiB pages"
+    echo "execute: sysctl -w vm.nr_hugepages=$required_num_huge_pages"
+    sysctl -w vm.nr_hugepages="$required_num_huge_pages"
+fi
+# verify number of huge pages
+num_huge_pages=$(cat /proc/meminfo | grep "HugePages_Free" | sed 's/ \{2,\}/ /g' | cut -d" " -f2)
+num_memsets=$((num_huge_pages/required_num_huge_pages))
+
+if [ $num_memsets -eq 0 ] ; then
+    echo "Error: not enough 2 MiB pages $num_huge_pages/$required_num_huge_pages" >&2
+fi
+
+# apply gigabyte pages last because 2MiB pages will give more performance
+numNodes=$(numactl --hardware | grep available | cut -d" " -f2)
+freeGigPages=$(cat /sys/kernel/mm/hugepages/hugepages-1048576kB/free_hugepages)
+neededGigPages=$((numNodes * 3))
+
+if [ $freeGigPages -lt $neededGigPages ] ; then
+    echo ""
+    echo "activate 1 GiB pages"
+    comm="echo $neededGigPages > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages"
+    echo "execute: $comm"
+    echo "$neededGigPages" > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages
+fi
+### end enable huge pages
+
+exit 0
diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp
index 7d9484999..c15b1cb4d 100644
--- a/xmrstak/backend/amd/amd_gpu/gpu.cpp
+++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp
@@ -164,7 +164,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	size_t scratchPadSize = 0;
 	for(const auto algo : neededAlgorithms)
 	{
-		scratchPadSize = std::max(scratchPadSize, algo.Mem());
+		scratchPadSize = std::max(scratchPadSize, algo.L3());
 	}
 
 	size_t g_thd = ctx->rawIntensity;
@@ -182,7 +182,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 			ctx->rx_dataset[ctx->deviceIdx] = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, dataset_size, nullptr, &ret);
 		}
 		else {
-			void* dataset = getRandomXDataset();
+			void* dataset = getRandomXDataset(0);
 			ctx->rx_dataset[ctx->deviceIdx] = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, dataset_size, dataset, &ret);
 		}
 
@@ -193,7 +193,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 		}
 	}
 
-	ctx->rx_scratchpads = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, (user_algo.Mem() + 64) * g_thd, nullptr, &ret);
+	ctx->rx_scratchpads = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, (user_algo.L3() + 64) * g_thd, nullptr, &ret);
 	if(ret != CL_SUCCESS)
 	{
 		printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create RandomX scratchpads.", err_to_str(ret));
@@ -294,9 +294,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_
 	for(const auto miner_algo : neededAlgorithms)
 	{
 		// scratchpad size for the selected mining algorithm
-		size_t hashMemSize = miner_algo.Mem();
-		int threadMemMask = miner_algo.Mask();
-		int hashIterations = miner_algo.Iter();
+		size_t hashMemSize = miner_algo.L3();
 
 		std::string options;
 		options += " -DALGO=" + std::to_string(miner_algo.Id());
@@ -1364,7 +1362,7 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment)
 size_t RXSetJob(GpuContext *ctx, uint8_t *input, size_t input_len, uint64_t target, const uint8_t* seed_hash, const xmrstak_algo& miner_algo)
 {
 	cl_int ret;
-	void* dataset = getRandomXDataset();
+	void* dataset = getRandomXDataset(0);
 	const size_t dataset_size = getRandomXDatasetSize();
 
 	if((memcmp(ctx->rx_dataset_seedhash, seed_hash, sizeof(ctx->rx_dataset_seedhash)) != 0))
diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index 6b9c95462..5167815cc 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -84,7 +84,7 @@ class autoAdjust
 		size_t hashMemSize = 0;
 		for(const auto algo : neededAlgorithms)
 		{
-			hashMemSize = std::max(hashMemSize, algo.Mem());
+			hashMemSize = std::max(hashMemSize, algo.L3());
 		}
 
 		std::string conf;
@@ -171,9 +171,10 @@ class autoAdjust
 				ctx.gcnAsm = false;
 
 
-			if(hashMemSize < CN_MEMORY)
+			size_t _2MiB = 2llu * 1024 * 1024;
+			if(hashMemSize < _2MiB)
 			{
-				size_t factor = CN_MEMORY / hashMemSize;
+				size_t factor = _2MiB / hashMemSize;
 				// increase all intensity relative to the original scratchpad size
 				maxThreads *= factor;
 			}
diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp
index 3631e3e1d..08d80ca42 100644
--- a/xmrstak/backend/amd/minethd.cpp
+++ b/xmrstak/backend/amd/minethd.cpp
@@ -27,7 +27,7 @@
 
 #include "xmrstak/backend/cpu/crypto/cryptonight.h"
 #include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h"
-#include "xmrstak/backend/cpu/hwlocMemory.hpp"
+#include "xmrstak/backend/cpu/hwlocHelper.hpp"
 #include "xmrstak/backend/cpu/minethd.hpp"
 #include "xmrstak/jconf.hpp"
 #include "xmrstak/misc/configEditor.hpp"
@@ -68,9 +68,11 @@ minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::th
 
 	order_guard.wait();
 
+#if defined(CONF_NO_HWLOC) || defined(_WIN32)
 	if(affinity >= 0) //-1 means no affinity
 		if(!cpu::minethd::thd_setaffinity(oWorkThd.native_handle(), affinity))
 			printer::inst()->print_msg(L1, "WARNING setting affinity failed.");
+#endif
 }
 
 extern "C"
@@ -164,7 +166,7 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 void minethd::work_main()
 {
 	if(affinity >= 0) //-1 means no affinity
-		bindMemoryToNUMANode(affinity);
+		hwlocBind(affinity);
 
 	order_fix.set_value();
 	std::unique_lock<std::mutex> lck(thd_aff_set);
@@ -173,6 +175,8 @@ void minethd::work_main()
 
 	cryptonight_ctx* cpu_ctx;
 	cpu_ctx = cpu::minethd::minethd_alloc_ctx();
+	cpu_ctx->numa = affinity < 0 ? 0 : numdaId(affinity);
+	randomX_global_ctx::inst().init(cpu_ctx->numa);
 
 	if(cpu_ctx == nullptr)
 	{
diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp
index caa3530ac..1602e631f 100644
--- a/xmrstak/backend/cpu/autoAdjust.hpp
+++ b/xmrstak/backend/cpu/autoAdjust.hpp
@@ -31,7 +31,7 @@ class autoAdjust
 		size_t hashMemSize = 0;
 		for(const auto algo : neededAlgorithms)
 		{
-			hashMemSize = std::max(hashMemSize, algo.Mem());
+			hashMemSize = std::max(hashMemSize, algo.L3());
 		}
 		const size_t hashMemSizeKB = hashMemSize / 1024u;
 
diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
index f06244c8a..61dacad54 100644
--- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp
+++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp
@@ -4,6 +4,7 @@
 #include "xmrstak/misc/configEditor.hpp"
 #include "xmrstak/misc/console.hpp"
 #include "xmrstak/params.hpp"
+#include "xmrstak/backend/cpu/hwlocHelper.hpp"
 
 #ifdef _WIN32
 #include <windows.h>
@@ -15,6 +16,7 @@
 
 #include <hwloc.h>
 #include <stdio.h>
+#include <algorithm>
 
 namespace xmrstak
 {
@@ -30,9 +32,9 @@ class autoAdjustHwloc
 
 		for(const auto algo : neededAlgorithms)
 		{
-			hashMemSize = std::max(hashMemSize, algo.Mem());
+			l3MemRequire = std::max(l3MemRequire, algo.L3());
+			l2MemRequire = std::max(l2MemRequire, algo.L2());
 		}
-		halfHashMemSize = hashMemSize / 2u;
 	}
 
 	bool printConfig()
@@ -40,7 +42,8 @@ class autoAdjustHwloc
 
 		hwloc_topology_t topology;
 		hwloc_topology_init(&topology);
-		hwloc_topology_load(topology);
+		if(hwloc_topology_load(topology) < 0)
+			return false;
 
 		std::string conf;
 		configEditor configTpl{};
@@ -54,25 +57,24 @@ class autoAdjustHwloc
 		bool is_successful = true;
 		try
 		{
-			std::vector<hwloc_obj_t> tlcs;
-			tlcs.reserve(16);
-			results.reserve(16);
 
+			std::vector<hwloc_obj_t> tlcs;
 			findChildrenCaches(hwloc_get_root_obj(topology),
 				[&tlcs](hwloc_obj_t found) { tlcs.emplace_back(found); });
 
 			if(tlcs.size() == 0)
 				throw(std::runtime_error("The CPU doesn't seem to have a cache."));
-
+			printer::inst()->print_msg(LDEBUG,"process %u cache elements", uint32_t(tlcs.size()));
 			for(hwloc_obj_t obj : tlcs)
 				processTopLevelCache(obj);
 
-			for(uint32_t id : results)
+
+			for(const auto& thd : threads)
 			{
 				conf += std::string("    { \"low_power_mode\" : ");
-				conf += std::string((id & 0x8000000) != 0 ? "true" : "false");
+				conf += std::to_string(thd.num_hashes);
 				conf += std::string(", \"affine_to_cpu\" : ");
-				conf += std::to_string(id & 0x7FFFFFF);
+				conf += std::to_string(thd.core_id);
 				conf += std::string(" },\n");
 			}
 		}
@@ -92,10 +94,20 @@ class autoAdjustHwloc
 	}
 
   private:
-	size_t hashMemSize = 0;
-	size_t halfHashMemSize = 0;
+	size_t l3MemRequire = 0;
+	size_t l2MemRequire = 0;
+
+	struct Thread
+	{
+		Thread(const uint32_t c_id, const uint32_t n_hash) :
+			core_id(c_id), num_hashes(n_hash)
+		{}
+
+		uint32_t core_id = 0;
+		uint32_t num_hashes = 1;
+	};
 
-	std::vector<uint32_t> results;
+	std::vector<Thread> threads;
 
 	template <typename func>
 	inline void findChildrenByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambda)
@@ -143,16 +155,16 @@ class autoAdjustHwloc
 		if(obj->attr == nullptr)
 			throw(std::runtime_error("Cache object hasn't got attributes."));
 
-		size_t PUs = 0;
-		findChildrenByType(obj, HWLOC_OBJ_PU, [&PUs](hwloc_obj_t found) { PUs++; });
+		size_t numPUs = 0;
+		findChildrenByType(obj, HWLOC_OBJ_PU, [&numPUs](hwloc_obj_t found) { numPUs++; });
 
 		//Strange case, but we will handle it silently, surely there must be one PU somewhere?
-		if(PUs == 0)
+		if(numPUs == 0)
 			return;
 
 		if(obj->attr->cache.size == 0)
 		{
-			//We will always have one child if PUs > 0
+			//We will always have one child if numPUs > 0
 			if(!isCacheObject(obj->children[0]))
 				throw(std::runtime_error("The CPU doesn't seem to have a cache."));
 
@@ -162,27 +174,58 @@ class autoAdjustHwloc
 			return;
 		}
 
-		size_t cacheSize = obj->attr->cache.size;
-		if(isCacheExclusive(obj))
+		size_t l3CacheSize = obj->attr->cache.size;
+		size_t numL2Caches = obj->arity;
+		bool isExclusive = isCacheExclusive(obj);
+		size_t l2CacheSize = 0u;
+		if(obj->attr->cache.depth == 3)
 		{
-			for(size_t i = 0; i < obj->arity; i++)
+			for(size_t i = 0; i < numL2Caches; i++)
 			{
 				hwloc_obj_t l2obj = obj->children[i];
-				//If L2 is exclusive and greater or equal to 2MB add room for one more hash
-				if(isCacheObject(l2obj) && l2obj->attr != nullptr && l2obj->attr->cache.size >= hashMemSize)
-					cacheSize += hashMemSize;
+				if(isCacheObject(l2obj) && l2obj->attr)
+				{
+					//If L3 is exclusive and greater or equal to 2MB add room for one more hash
+					if(isExclusive && l2obj->attr->cache.size >= l3MemRequire)
+						l3CacheSize += l3MemRequire;
+					else
+						l2CacheSize += l2obj->attr->cache.size;
+				}
 			}
 		}
 
+		size_t l2CacheSizePerHash = l2CacheSize / numL2Caches;
+		printer::inst()->print_msg(LDEBUG,"%u L3 cache, required per hash %u", uint32_t(l3CacheSize), uint32_t(l3MemRequire));
+		printer::inst()->print_msg(LDEBUG,"%u L2 cache, required per hash %u", uint32_t(l2CacheSize), uint32_t(l2MemRequire));
+
+		size_t l3CacheHashes = std::max(l3CacheSize / l3MemRequire, size_t(1u));
+		size_t l2CacheHashes = std::max(l2CacheSizePerHash / l2MemRequire, size_t(1u)) * numL2Caches;
+
+		// we have no lvl2 cache or our top lvl cache is L2
+		if(l2CacheSize == 0u)
+			l2CacheHashes = l3CacheHashes;
+
 		std::vector<hwloc_obj_t> cores;
 		cores.reserve(16);
 		findChildrenByType(obj, HWLOC_OBJ_CORE, [&cores](hwloc_obj_t found) { cores.emplace_back(found); });
 
-		size_t cacheHashes = (cacheSize + halfHashMemSize) / hashMemSize;
+		printer::inst()->print_msg(LDEBUG,"%u L3 hash limit", uint32_t(l3CacheHashes));
+		printer::inst()->print_msg(LDEBUG,"%u L2 hash limit", uint32_t(l2CacheHashes));
+		printer::inst()->print_msg(LDEBUG,"%u PU(s) available", uint32_t(numPUs));
+		size_t numHashCacheLimited = std::min(l2CacheHashes, l3CacheHashes);
+		// do not use more PUs than available
+		size_t usePus = std::min(numHashCacheLimited, numPUs);
+
+		// currently do not use multi hash per PU (all tests has shown it is slower)
+		//size_t numHashesPerPu = std::max(numHashCacheLimited / numPUs, size_t(1u));
+		size_t numHashesPerPu = 1u;
+
+		printer::inst()->print_msg(LDEBUG,"use %u PU(s)", uint32_t(usePus));
+		printer::inst()->print_msg(LDEBUG,"use %u hashe(s) per pu", uint32_t(numHashesPerPu));
 
 		//Firstly allocate PU 0 of every CORE, then PU 1 etc.
 		size_t pu_id = 0;
-		while(cacheHashes > 0 && PUs > 0)
+		while(usePus > 0)
 		{
 			bool allocated_pu = false;
 			for(hwloc_obj_t core : cores)
@@ -192,19 +235,12 @@ class autoAdjustHwloc
 
 				size_t os_id = core->children[pu_id]->os_index;
 
-				if(cacheHashes > PUs)
-				{
-					cacheHashes -= 2;
-					os_id |= 0x8000000; //double hash marker bit
-				}
-				else
-					cacheHashes--;
-				PUs--;
-
 				allocated_pu = true;
-				results.emplace_back(os_id);
+				threads.emplace_back(Thread(os_id, numHashesPerPu));
+
+				usePus--;
 
-				if(cacheHashes == 0)
+				if(usePus == 0)
 					break;
 			}
 
diff --git a/xmrstak/backend/cpu/crypto/common/VirtualMemory_unix.cpp b/xmrstak/backend/cpu/crypto/common/VirtualMemory_unix.cpp
index 68e970839..e53aced5e 100644
--- a/xmrstak/backend/cpu/crypto/common/VirtualMemory_unix.cpp
+++ b/xmrstak/backend/cpu/crypto/common/VirtualMemory_unix.cpp
@@ -37,6 +37,11 @@
 #   include <mach/vm_statistics.h>
 #endif
 
+#if defined(__linux__) && !defined(MAP_HUGE_SHIFT)
+#	include <asm-generic/mman-common.h>
+#endif
+
+#include "xmrstak/misc/console.hpp"
 
 int xmrstak::VirtualMemory::m_globalFlags = 0;
 
@@ -109,8 +114,7 @@ void *xmrstak::VirtualMemory::allocateLargePagesMemory(size_t size, size_t page_
 		page_size_flags |= MAP_HUGE_2MB;
 	else if(page_size == 1024u)
 		page_size_flags |= MAP_HUGE_1GB;
-	#define MAP_HUGE_2MB    (21 << MAP_HUGE_SHIFT)
-	#define MAP_HUGE_1GB    (30 << MAP_HUGE_SHIFT)
+
     void *mem = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE | page_size_flags, 0, 0);
 #   endif
 
@@ -128,7 +132,16 @@ void xmrstak::VirtualMemory::flushInstructionCache(void *p, size_t size)
 
 void xmrstak::VirtualMemory::freeLargePagesMemory(void *p, size_t size)
 {
-    munmap(p, size);
+    if(munmap(p, size) != 0)
+	{
+		printer::inst()->print_msg(LDEBUG,"munmap failed %llu", (uint64_t)size);
+		size_t page3gib = 3llu*1024*1024*1024;
+		printer::inst()->print_msg(LDEBUG,"try to unmap ", page3gib);
+		if(munmap(p, page3gib) != 0)
+		{
+			printer::inst()->print_msg(LDEBUG,"munmap failed %llu", (uint64_t)page3gib);
+		}
+	}
 }
 
 
diff --git a/xmrstak/backend/cpu/crypto/cryptonight.h b/xmrstak/backend/cpu/crypto/cryptonight.h
index 8b9e9e39f..0b5ec04e2 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight.h
@@ -13,10 +13,16 @@
 
 #if defined _MSC_VER
 #define ABI_ATTRIBUTE
+#include <winsock2.h>
+#include <windows.h>
+#include <ntsecapi.h>
 #else
 #define ABI_ATTRIBUTE __attribute__((ms_abi))
 #endif
 
+#include "xmrstak/backend/cpu/hwlocHelper.hpp"
+#include "cryptonight_1.h"
+
 struct cryptonight_ctx;
 
 typedef void (*cn_mainloop_fun)(cryptonight_ctx* ctx);
@@ -34,6 +40,7 @@ struct cryptonight_ctx
 	cn_hash_fun hash_fn = nullptr;
 	uint8_t* fun_data = nullptr;
 	xmrstak_algo last_algo = invalid_algo;
+	uint32_t numa = 0;
 
 	randomx_vm* m_rx_vm = nullptr;
 };
@@ -53,9 +60,9 @@ struct randomX_global_ctx
 		return *env.pGlobalCtx;
 	}
 
-	randomx_dataset* getDataset()
+	randomx_dataset* getDataset(size_t numaId)
 	{
-		return m_rx_dataset;
+		return m_rx_datasets[numaId];
 	}
 
 	void updateDataset(const std::array<uint8_t, 32>& seed_hash, const uint32_t num_threads)
@@ -75,7 +82,7 @@ struct randomX_global_ctx
 
 		// One of the threads updates cache
 		{
-			std::lock_guard<std::mutex> g(m_rx_dataset_lock);
+			std::lock_guard<std::mutex> g(m_rx_cache_lock);
 			if(m_rx_seed_hash != seed_hash)
 			{
 				m_rx_seed_hash = seed_hash;
@@ -87,7 +94,15 @@ struct randomX_global_ctx
 		const uint32_t a = (randomx_dataset_item_count() * static_cast<uint64_t>(thread_id)) / num_threads;
 		const uint32_t b = (randomx_dataset_item_count() * (static_cast<uint64_t>(thread_id) + 1u)) / num_threads;
 		printer::inst()->print_msg(LDEBUG,"Thread %u start updating RandomX dataset %u %u", thread_id, a, b);
-		randomx_init_dataset(m_rx_dataset, m_rx_cache, a, b - a);
+		size_t numElements = b - a;
+		randomx_init_dataset(m_rx_datasets[0], m_rx_cache, a, numElements);
+		for(size_t i = 1; i < m_rx_datasets.size(); ++i)
+		{
+			if(m_rx_datasets[i] != nullptr)
+			{
+				memcpy((uint8_t*)getRandomXDataset(i) + a * 64u, (uint8_t*)getRandomXDataset(0) + a * 64u, numElements * 64u);
+			}
+		}
 
 		printer::inst()->print_msg(LDEBUG,"Thread %u finished updating RandomX dataset", thread_id);
 
@@ -98,42 +113,88 @@ struct randomX_global_ctx
 		} while (m_rx_dataset_init_thread_counter.load() != 0);
 	}
 
-private:
-	randomX_global_ctx() : m_rx_dataset_init_thread_counter(0u)
+	void init(size_t numaId)
 	{
-#ifdef __linux__
-		randomx_dataset* dataset = randomx_alloc_dataset(static_cast<randomx_flags>(RANDOMX_FLAG_LARGE_PAGES | RANDOMX_FLAG_LARGE_PAGES_1G));
-		if (!dataset)
 		{
-			printer::inst()->print_msg(LDEBUG,"Warning: dataset allocation with 1 GiB pages failed");
-#else
-			randomx_dataset* dataset = nullptr;
-#endif
-			dataset = randomx_alloc_dataset(RANDOMX_FLAG_LARGE_PAGES);
+			std::unique_lock<std::mutex> lck(dataset_locks[numaId]);
+			if(m_rx_datasets[numaId])
+			{
+				printer::inst()->print_msg(LDEBUG,"dataset/cache already created for numa %u", uint32_t(numaId));
+				return;
+			}
+			printer::inst()->print_msg(LDEBUG,"allocate dataset/cache for numa %u", uint32_t(numaId));
+	#ifdef __linux__
+			randomx_dataset* dataset = randomx_alloc_dataset(static_cast<randomx_flags>(RANDOMX_FLAG_LARGE_PAGES | RANDOMX_FLAG_LARGE_PAGES_1G));
 			if (!dataset)
 			{
-				printer::inst()->print_msg(LDEBUG,"Warning: dataset allocation with 2 MiB pages failed");
-				dataset = randomx_alloc_dataset(RANDOMX_FLAG_DEFAULT);
-				printer::inst()->print_msg(LDEBUG,"dataset allocated without huge pages");
+				printer::inst()->print_msg(LDEBUG,"Warning: dataset allocation with 1 GiB pages failed");
+	#else
+				randomx_dataset* dataset = nullptr;
+	#endif
+				dataset = randomx_alloc_dataset(RANDOMX_FLAG_LARGE_PAGES);
+				if (!dataset)
+				{
+					printer::inst()->print_msg(LDEBUG,"Warning: dataset allocation with 2 MiB pages failed");
+					dataset = randomx_alloc_dataset(RANDOMX_FLAG_DEFAULT);
+					printer::inst()->print_msg(LDEBUG,"dataset allocated without huge pages");
+				}
+				else
+					printer::inst()->print_msg(LDEBUG,"dataset allocated with 2 MiB pages");
+	#ifdef __linux__
 			}
 			else
-				printer::inst()->print_msg(LDEBUG,"dataset allocated with 2 MiB pages");
-#ifdef __linux__
+				printer::inst()->print_msg(LDEBUG,"dataset allocated with 1 GiB pages");
+	#endif
+
+			m_rx_datasets[numaId] = dataset;
 		}
-		else
-			printer::inst()->print_msg(LDEBUG,"dataset allocated with 1 GiB pages");
-#endif
+		{
+			std::unique_lock<std::mutex> lck(m_rx_cache_lock);
+			if(numaId == 0 && m_rx_cache == nullptr)
+			{
+				m_rx_cache = randomx_alloc_cache(static_cast<randomx_flags>(RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES));
+				if (!m_rx_cache) {
+					m_rx_cache = randomx_alloc_cache(RANDOMX_FLAG_JIT);
+				}
+			}
+		}
+	}
 
-		m_rx_cache = randomx_alloc_cache(static_cast<randomx_flags>(RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES));
-		if (!m_rx_cache) {
-			m_rx_cache = randomx_alloc_cache(RANDOMX_FLAG_JIT);
+	void release(size_t numaId)
+	{
+		{
+			std::unique_lock<std::mutex> lck(dataset_locks[numaId]);
+			if(!m_rx_datasets[numaId])
+			{
+				printer::inst()->print_msg(LDEBUG,"dataset/cache for numa %u alreday released", uint32_t(numaId));
+				return;
+			}
+			printer::inst()->print_msg(LDEBUG,"release dataset/cache for numa %u", uint32_t(numaId));
+			randomx_release_dataset(m_rx_datasets[numaId]);
+			m_rx_datasets[numaId] = nullptr;
 		}
-		m_rx_dataset = dataset;
+		{
+			std::unique_lock<std::mutex> lck(m_rx_cache_lock);
+			if(numaId == 0 && m_rx_cache)
+			{
+				randomx_release_cache(m_rx_cache);
+				m_rx_cache = nullptr;
+			}
+		}
+	}
+
+private:
+	randomX_global_ctx() : m_rx_dataset_init_thread_counter(0u)
+	{
+		size_t numNumaNodes = getNumNumaNodes();
+		m_rx_datasets.resize(numNumaNodes, nullptr);
+		dataset_locks.reset(new std::mutex[numNumaNodes]);
 	}
 
-	std::mutex m_rx_dataset_lock;
+	std::mutex m_rx_cache_lock;
 	randomx_cache* m_rx_cache = nullptr;
-	randomx_dataset* m_rx_dataset = nullptr;
+	std::unique_ptr<std::mutex[]> dataset_locks;
+	std::vector<randomx_dataset*> m_rx_datasets;
 	std::array<uint8_t, 32> m_rx_seed_hash = {{0}};
 	std::atomic<uint32_t> m_rx_dataset_init_thread_counter;
 };
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_1.cpp b/xmrstak/backend/cpu/crypto/cryptonight_1.cpp
index b4b3a4e19..719faae53 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_1.cpp
+++ b/xmrstak/backend/cpu/crypto/cryptonight_1.cpp
@@ -3,9 +3,9 @@
 #include "randomx/randomx.h"
 
 
-void* getRandomXDataset()
+void* getRandomXDataset(const size_t numaId)
 {
-	return randomx_get_dataset_memory(randomX_global_ctx::inst().getDataset());
+	return randomx_get_dataset_memory(randomX_global_ctx::inst().getDataset(numaId));
 }
 
 
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_1.h b/xmrstak/backend/cpu/crypto/cryptonight_1.h
index 73e797155..d6c28c954 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_1.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_1.h
@@ -1,5 +1,5 @@
 #pragma once
 
-void* getRandomXDataset();
+void* getRandomXDataset(const size_t numaId);
 
 uint64_t getRandomXDatasetSize();
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
index 984b2c502..10b590bf0 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
+++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h
@@ -55,44 +55,6 @@ struct RandomX_hash
 namespace
 {
 
-template <typename T, typename U>
-static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t mask)
-{
-	const uint8_t* p = reinterpret_cast<const uint8_t*>(src);
-
-	// Workaround for Visual Studio placing trampoline in debug builds.
-#if defined(_MSC_VER)
-	if(p[0] == 0xE9)
-	{
-		p += *(int32_t*)(p + 1) + 5;
-	}
-#endif
-
-	size_t size = 0;
-	while(*(uint32_t*)(p + size) != 0xDEADC0DE)
-	{
-		++size;
-	}
-	size += sizeof(uint32_t);
-
-	memcpy((void*)dst, (const void*)src, size);
-
-	uint8_t* patched_data = reinterpret_cast<uint8_t*>(dst);
-	for(size_t i = 0; i + sizeof(uint32_t) <= size; ++i)
-	{
-		switch(*(uint32_t*)(patched_data + i))
-		{
-		case CN_ITER:
-			*(uint32_t*)(patched_data + i) = iterations;
-			break;
-
-		case CN_MASK:
-			*(uint32_t*)(patched_data + i) = mask;
-			break;
-		}
-	}
-}
-
 void* allocateExecutableMemory(size_t size)
 {
 
@@ -160,9 +122,9 @@ struct RandomX_generator
 			for(size_t i = 0; i < N; i++)
 			{
 				printer::inst()->print_msg(LDEBUG,"%s create vm", POW(ALGO).Name().c_str());
-				ctx[i]->m_rx_vm = randomx_create_vm(static_cast<randomx_flags>(flags), nullptr, randomX_global_ctx::inst().getDataset(), ctx[i]->long_state);
+				ctx[i]->m_rx_vm = randomx_create_vm(static_cast<randomx_flags>(flags), nullptr, randomX_global_ctx::inst().getDataset(ctx[i]->numa), ctx[i]->long_state);
 				if (!ctx[i]->m_rx_vm)
-					ctx[i]->m_rx_vm = randomx_create_vm(static_cast<randomx_flags>(flags - RANDOMX_FLAG_LARGE_PAGES), nullptr, randomX_global_ctx::inst().getDataset(), ctx[i]->long_state);
+					ctx[i]->m_rx_vm = randomx_create_vm(static_cast<randomx_flags>(flags - RANDOMX_FLAG_LARGE_PAGES), nullptr, randomX_global_ctx::inst().getDataset(ctx[i]->numa), ctx[i]->long_state);
 			}
 		}
 		else if(algorithm_switched)
diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
index 508dac047..d9175a638 100644
--- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
+++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp
@@ -191,7 +191,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al
 	size_t hashMemSize = 0;
 	for(const auto algo : neededAlgorithms)
 	{
-		hashMemSize = std::max(hashMemSize, algo.Mem());
+		hashMemSize = std::max(hashMemSize, algo.L3());
 	}
 
 	cryptonight_ctx* ptr = (cryptonight_ctx*)_mm_malloc(sizeof(cryptonight_ctx), 4096);
@@ -282,7 +282,7 @@ void cryptonight_free_ctx(cryptonight_ctx* ctx)
 	size_t hashMemSize = 0;
 	for(const auto algo : neededAlgorithms)
 	{
-		hashMemSize = std::max(hashMemSize, algo.Mem());
+		hashMemSize = std::max(hashMemSize, algo.L3());
 	}
 
 	if(ctx->ctx_info[0] != 0)
diff --git a/xmrstak/backend/cpu/crypto/randomx/asm/program_read_dataset_ryzen.inc b/xmrstak/backend/cpu/crypto/randomx/asm/program_read_dataset_ryzen.inc
new file mode 100644
index 000000000..6bb87c8f9
--- /dev/null
+++ b/xmrstak/backend/cpu/crypto/randomx/asm/program_read_dataset_ryzen.inc
@@ -0,0 +1,18 @@
+	mov rcx, rbp                       ;# ecx = ma
+	shr rcx, 32
+	and ecx, RANDOMX_DATASET_BASE_MASK
+	xor rbp, rax                       ;# modify "mx"
+	mov rax, qword ptr [rdi+rcx]
+	mov edx, ebp                       ;# edx = mx
+	and edx, RANDOMX_DATASET_BASE_MASK
+	prefetchnta byte ptr [rdi+rdx]
+	ror rbp, 32                        ;# swap "ma" and "mx"
+	xor r8,  rax
+	xor r9,  qword ptr [rdi+rcx+8]
+	xor r10, qword ptr [rdi+rcx+16]
+	xor r11, qword ptr [rdi+rcx+24]
+	xor r12, qword ptr [rdi+rcx+32]
+	xor r13, qword ptr [rdi+rcx+40]
+	xor r14, qword ptr [rdi+rcx+48]
+	xor r15, qword ptr [rdi+rcx+56]
+	
\ No newline at end of file
diff --git a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.cpp b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.cpp
index bfde7d002..ffc4b1ab8 100644
--- a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.cpp
+++ b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.cpp
@@ -37,6 +37,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "crypto/randomx/reciprocal.h"
 #include "crypto/randomx/virtual_memory.hpp"
 
+#include "../../../../misc/win_msr.hpp"
+
 #ifdef _MSC_VER
 #   include <intrin.h>
 #else
@@ -224,8 +226,6 @@ namespace randomx {
 		{0x0F, 0x1F, 0x44, 0x00, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
 	};
 
-	bool JitCompilerX86::BranchesWithin32B = false;
-
 	size_t JitCompilerX86::getCodeSize() {
 		return codePos < prologueSize ? 0 : codePos - prologueSize;
 	}
@@ -241,8 +241,14 @@ namespace randomx {
 #   endif
     }
 
+    std::atomic<uint64_t> JitCompilerX86::flags_set(0);
+	uint64_t JitCompilerX86::flags = 0;
     // CPU-specific tweaks
 	void JitCompilerX86::applyTweaks() {
+		
+		if(flags_set.fetch_add(1) != 0)
+			return;
+
 		int32_t info[4];
 		cpuid(0, info);
 
@@ -252,36 +258,45 @@ namespace randomx {
 		manufacturer[2] = info[2];
 		manufacturer[3] = 0;
 
-		if (strcmp((const char*)manufacturer, "GenuineIntel") == 0) {
-			struct
-			{
-				unsigned int stepping : 4;
-				unsigned int model : 4;
-				unsigned int family : 4;
-				unsigned int processor_type : 2;
-				unsigned int reserved1 : 2;
-				unsigned int ext_model : 4;
-				unsigned int ext_family : 8;
-				unsigned int reserved2 : 4;
-			} processor_info;
-
-			cpuid(1, info);
-			memcpy(&processor_info, info, sizeof(processor_info));
+		struct
+		{
+			unsigned int stepping : 4;
+			unsigned int model : 4;
+			unsigned int family : 4;
+			unsigned int processor_type : 2;
+			unsigned int reserved1 : 2;
+			unsigned int ext_model : 4;
+			unsigned int ext_family : 8;
+			unsigned int reserved2 : 4;
+		} processor_info;
+		
+		cpuid(1, info);
+		memcpy(&processor_info, info, sizeof(processor_info));
 
+		if (strcmp((const char*)manufacturer, "GenuineIntel") == 0) {
 			// Intel JCC erratum mitigation
 			if (processor_info.family == 6) {
 				const uint32_t model = processor_info.model | (processor_info.ext_model << 4);
 				const uint32_t stepping = processor_info.stepping;
 
 				// Affected CPU models and stepping numbers are taken from https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
-				BranchesWithin32B =
+				set_flag(BRANCHES_WITHIN_32B,
 					((model == 0x4E) && (stepping == 0x3)) ||
 					((model == 0x55) && (stepping == 0x4)) ||
 					((model == 0x5E) && (stepping == 0x3)) ||
 					((model == 0x8E) && (stepping >= 0x9) && (stepping <= 0xC)) ||
 					((model == 0x9E) && (stepping >= 0x9) && (stepping <= 0xD)) ||
 					((model == 0xA6) && (stepping == 0x0)) ||
-					((model == 0xAE) && (stepping == 0xA));
+					((model == 0xAE) && (stepping == 0xA)));
+			}
+			load_win_msrs({ { 0x1a4, 7 } });
+		}
+		
+		if (strcmp((const char*)manufacturer, "AuthenticAMD") == 0) {
+			if(processor_info.family == 0x17)
+			{
+				set_flag(AMD_RYZEN_FAMILY, true);
+				load_win_msrs({ { 0xc0011022, 0x510000 }, { 0xc001102b, 0x1808cc16}, { 0xc0011020, 0 } });
 			}
 		}
 	}
@@ -303,8 +318,20 @@ namespace randomx {
 
 	void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) {
 		generateProgramPrologue(prog, pcfg);
-		memcpy(code + codePos, RandomX_CurrentConfig.codeReadDatasetTweaked, readDatasetSize);
-		codePos += readDatasetSize;
+		
+		uint8_t* p;
+		uint32_t n;
+		if (check_flag(AMD_RYZEN_FAMILY)) {
+			p = RandomX_CurrentConfig.codeReadDatasetRyzenTweaked;
+			n = RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize;
+		}
+		else {
+			p = RandomX_CurrentConfig.codeReadDatasetTweaked;
+			n = RandomX_CurrentConfig.codeReadDatasetTweakedSize;
+		}
+		memcpy(code + codePos, p, n);
+		codePos += n;
+		
 		generateProgramEpilogue(prog, pcfg);
 	}
 
@@ -396,7 +423,7 @@ namespace randomx {
 		memcpy(code + codePos, codeLoopStore, loopStoreSize);
 		codePos += loopStoreSize;
 
-		if (BranchesWithin32B) {
+		if (check_flag(BRANCHES_WITHIN_32B)) {
 			const uint32_t branch_begin = static_cast<uint32_t>(codePos);
 			const uint32_t branch_end = static_cast<uint32_t>(branch_begin + 9);
 
@@ -989,6 +1016,8 @@ namespace randomx {
 		codePos = pos;
 	}
 
+	static const uint8_t AND_OR_MOV_LDMXCSR_RYZEN[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x3B, 0x44, 0x24, 0xFC, 0x74, 0x09, 0x89, 0x44, 0x24, 0xFC, 0x0F, 0xAE, 0x54, 0x24, 0xFC };
+
 	void JitCompilerX86::h_CFROUND(const Instruction& instr) {
 		uint8_t* const p = code;
 		int pos = codePos;
@@ -1000,7 +1029,13 @@ namespace randomx {
 			emit(ROL_RAX, p, pos);
 			emitByte(rotate, p, pos);
 		}
-		emit(AND_OR_MOV_LDMXCSR, p, pos);
+
+		if (check_flag(AMD_RYZEN_FAMILY)) {
+			emit(AND_OR_MOV_LDMXCSR_RYZEN, p, pos);
+		}
+		else {
+			emit(AND_OR_MOV_LDMXCSR, p, pos);
+		}
 
 		codePos = pos;
 	}
@@ -1012,7 +1047,7 @@ namespace randomx {
 		const int reg = instr.dst;
 		int32_t jmp_offset = registerUsage[reg] - (pos + 16);
 
-		if (BranchesWithin32B) {
+		if (check_flag(BRANCHES_WITHIN_32B)) {
 			const uint32_t branch_begin = static_cast<uint32_t>(pos + 7);
 			const uint32_t branch_end = static_cast<uint32_t>(branch_begin + ((jmp_offset >= -128) ? 9 : 13));
 
diff --git a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.hpp b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.hpp
index f1864018a..b47ff6ec5 100644
--- a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.hpp
+++ b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.hpp
@@ -31,6 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <cstdint>
 #include <cstring>
 #include <vector>
+#include <atomic>
 #include "crypto/randomx/common.hpp"
 
 namespace randomx {
@@ -71,7 +72,23 @@ namespace randomx {
 		uint8_t* code;
 		int32_t codePos;
 
-		static bool BranchesWithin32B;
+		static std::atomic<uint64_t> flags_set;
+		static constexpr uint64_t BRANCHES_WITHIN_32B = 1;
+		static constexpr uint64_t AMD_RYZEN_FAMILY = 2;
+		static uint64_t flags;
+
+		static inline bool check_flag(uint64_t f)
+		{
+			return (flags & f) != 0;
+		}
+		
+		static inline void set_flag(uint64_t f, bool v)
+		{
+			if(v)
+				flags |= f;
+			else
+				flags &= ~f;
+		}
 
 		static void applyTweaks();
 		void generateProgramPrologue(Program&, ProgramConfiguration&);
diff --git a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.S b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.S
index c20cd7433..50019b7e5 100644
--- a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.S
+++ b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.S
@@ -45,6 +45,7 @@
 .global DECL(randomx_program_loop_load)
 .global DECL(randomx_program_start)
 .global DECL(randomx_program_read_dataset)
+.global DECL(randomx_program_read_dataset_ryzen)
 .global DECL(randomx_program_read_dataset_sshash_init)
 .global DECL(randomx_program_read_dataset_sshash_fin)
 .global DECL(randomx_program_loop_store)
@@ -92,6 +93,7 @@ DECL(randomx_program_prologue_first_load):
 	and eax, RANDOMX_SCRATCHPAD_MASK
 	ror rdx, 32
 	and edx, RANDOMX_SCRATCHPAD_MASK
+	stmxcsr dword ptr [rsp-20]
 	jmp DECL(randomx_program_loop_begin)
 
 .balign 64
@@ -110,6 +112,9 @@ DECL(randomx_program_start):
 DECL(randomx_program_read_dataset):
 	#include "asm/program_read_dataset.inc"
 
+DECL(randomx_program_read_dataset_ryzen):
+	#include "asm/program_read_dataset_ryzen.inc"
+
 DECL(randomx_program_read_dataset_sshash_init):
 	#include "asm/program_read_dataset_sshash_init.inc"
 
diff --git a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.asm b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.asm
index 73fa503ad..189c464c5 100644
--- a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.asm
+++ b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.asm
@@ -36,6 +36,7 @@ PUBLIC randomx_program_loop_begin
 PUBLIC randomx_program_loop_load
 PUBLIC randomx_program_start
 PUBLIC randomx_program_read_dataset
+PUBLIC randomx_program_read_dataset_ryzen
 PUBLIC randomx_program_read_dataset_sshash_init
 PUBLIC randomx_program_read_dataset_sshash_fin
 PUBLIC randomx_dataset_init
@@ -80,6 +81,7 @@ randomx_program_prologue_first_load PROC
 	and eax, RANDOMX_SCRATCHPAD_MASK
 	ror rdx, 32
 	and edx, RANDOMX_SCRATCHPAD_MASK
+	stmxcsr dword ptr [rsp-20]
 	jmp randomx_program_loop_begin
 randomx_program_prologue_first_load ENDP
 
@@ -103,6 +105,10 @@ randomx_program_read_dataset PROC
 	include asm/program_read_dataset.inc
 randomx_program_read_dataset ENDP
 
+randomx_program_read_dataset_ryzen PROC
+	include asm/program_read_dataset_ryzen.inc
+randomx_program_read_dataset_ryzen ENDP
+
 randomx_program_read_dataset_sshash_init PROC
 	include asm/program_read_dataset_sshash_init.inc
 randomx_program_read_dataset_sshash_init ENDP
@@ -220,4 +226,4 @@ _RANDOMX_JITX86_STATIC ENDS
 
 ENDIF
 
-END
\ No newline at end of file
+END
diff --git a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.hpp b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.hpp
index 0a62c986e..b0a7c5acb 100644
--- a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.hpp
+++ b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.hpp
@@ -37,6 +37,7 @@ extern "C" {
 	void randomx_program_loop_load();
 	void randomx_program_start();
 	void randomx_program_read_dataset();
+	void randomx_program_read_dataset_ryzen();
 	void randomx_program_read_dataset_sshash_init();
 	void randomx_program_read_dataset_sshash_fin();
 	void randomx_program_loop_store();
diff --git a/xmrstak/backend/cpu/crypto/randomx/randomx.cpp b/xmrstak/backend/cpu/crypto/randomx/randomx.cpp
index 2937459c1..1c6b048d2 100644
--- a/xmrstak/backend/cpu/crypto/randomx/randomx.cpp
+++ b/xmrstak/backend/cpu/crypto/randomx/randomx.cpp
@@ -152,8 +152,15 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
 	}
 	{
 		const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset;
-		const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_init;
+		const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_ryzen;
 		memcpy(codeReadDatasetTweaked, a, b - a);
+		codeReadDatasetTweakedSize = b - a;
+	}
+	{
+		const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_ryzen;
+		const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_init;
+		memcpy(codeReadDatasetRyzenTweaked, a, b - a);
+		codeReadDatasetRyzenTweakedSize = b - a;
 	}
 	{
 		const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_sshash_init;
diff --git a/xmrstak/backend/cpu/crypto/randomx/randomx.h b/xmrstak/backend/cpu/crypto/randomx/randomx.h
index 6fece9a4f..4dbecaef7 100644
--- a/xmrstak/backend/cpu/crypto/randomx/randomx.h
+++ b/xmrstak/backend/cpu/crypto/randomx/randomx.h
@@ -118,7 +118,10 @@ struct RandomX_ConfigurationBase
 	rx_vec_i128 fillAes4Rx4_Key[8];
 
 	uint8_t codeShhPrefetchTweaked[20];
-	uint8_t codeReadDatasetTweaked[64];
+	uint8_t codeReadDatasetTweaked[72];
+	uint32_t codeReadDatasetTweakedSize;
+	uint8_t codeReadDatasetRyzenTweaked[72];
+	uint32_t codeReadDatasetRyzenTweakedSize;
 	uint8_t codeReadDatasetLightSshInitTweaked[68];
 	uint8_t codePrefetchScratchpadTweaked[32];
 
diff --git a/xmrstak/backend/cpu/hwlocHelper.cpp b/xmrstak/backend/cpu/hwlocHelper.cpp
new file mode 100644
index 000000000..9be5db597
--- /dev/null
+++ b/xmrstak/backend/cpu/hwlocHelper.cpp
@@ -0,0 +1,177 @@
+#include "xmrstak/misc/console.hpp"
+#include "hwlocHelper.hpp"
+
+#ifndef CONF_NO_HWLOC
+
+#include <hwloc.h>
+
+inline int
+xmrstak_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+#if HWLOC_API_VERSION >= 0x20000
+	return hwloc_set_membind(
+		topology,
+		nodeset,
+		policy,
+		flags| HWLOC_MEMBIND_BYNODESET);
+#else
+	return hwloc_set_membind_nodeset(
+		topology,
+		nodeset,
+		policy,
+		flags);
+#endif
+}
+
+hwloc_obj_t getPU(hwloc_topology_t topology, size_t puId)
+{
+	hwloc_obj_t result = nullptr;
+	int pu_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
+	uint32_t chunks = hwloc_get_nbobjs_by_depth(topology, pu_depth);
+
+	for(uint32_t i = 0; i < chunks;	i++)
+	{
+		hwloc_obj_t pu = hwloc_get_obj_by_depth(topology, pu_depth, i);
+		if(pu->os_index == puId)
+		{
+			result = pu;
+			break;
+		}
+	}
+
+	return result;
+}
+
+/** pin memory to NUMA node and thread to given core
+ *
+ * Set the default memory policy for the current thread to bind memory to the
+ * NUMA node.
+ *
+ * @param puId core id
+ */
+void hwlocBind(size_t puId)
+{
+	int depth;
+	hwloc_topology_t topology;
+
+	hwloc_topology_init(&topology);
+	hwloc_topology_load(topology);
+
+	hwloc_bitmap_t puBitMap = hwloc_bitmap_alloc();
+	hwloc_bitmap_set(puBitMap, puId);
+	if(0 > hwloc_set_cpubind(topology, puBitMap, HWLOC_CPUBIND_THREAD))
+		printer::inst()->print_msg(L0, "hwloc: pu bind to %u failed", uint32_t(puId));
+	hwloc_bitmap_free(puBitMap);
+
+	if(!hwloc_topology_get_support(topology)->membind->set_thisthread_membind)
+	{
+		printer::inst()->print_msg(L0, "hwloc: set_thisthread_membind not supported");
+		hwloc_topology_destroy(topology);
+		return;
+	}
+
+	depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
+
+	hwloc_obj_t puPtr = getPU(topology, puId);
+	if(puPtr != nullptr)
+	{
+		if(0 > xmrstak_set_membind_nodeset(
+				   topology,
+				   puPtr->nodeset,
+				   HWLOC_MEMBIND_BIND,
+				   HWLOC_MEMBIND_THREAD))
+		{
+			printer::inst()->print_msg(L0, "hwloc: can't bind memory");
+		}
+		else
+		{
+			printer::inst()->print_msg(L0, "hwloc: memory pinned");
+		}
+	}
+
+	hwloc_topology_destroy(topology);
+}
+
+
+size_t numdaId(size_t puId)
+{
+	size_t result = 0;
+
+	hwloc_topology_t topology;
+	hwloc_topology_init(&topology);
+	if(hwloc_topology_load(topology) < 0)
+		return result;
+
+	hwloc_obj_t puPtr = getPU(topology, puId);
+	if(puPtr != nullptr)
+	{
+		int numa_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+		uint32_t chunks = hwloc_get_nbobjs_by_depth(topology, numa_depth);
+		for(uint32_t i = 0; i < chunks; i++)
+		{
+			hwloc_obj_t numa = hwloc_get_obj_by_depth(topology, numa_depth, i);
+			if(hwloc_bitmap_isset(puPtr->nodeset, numa->os_index))
+			{
+				result = i;
+				printer::inst()->print_msg(LDEBUG,"PU %u is on numa %u", uint32_t(puId), i);
+				break;
+			}
+		}
+	}
+	else
+	{
+		printer::inst()->print_msg(LDEBUG,"PU %u not found", uint32_t(puId));
+	}
+
+	hwloc_topology_destroy(topology);
+	return result;
+}
+
+std::vector<hwloc_obj_t> getNumaNodes(hwloc_topology_t topology)
+{
+	int numa_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+	uint32_t chunks = hwloc_get_nbobjs_by_depth(topology, numa_depth);
+	printer::inst()->print_msg(LDEBUG,"%u numa node(s) found", chunks);
+	std::vector<hwloc_obj_t> result(chunks);
+
+	for(uint32_t i = 0; i < chunks; i++)
+		result[i] = hwloc_get_obj_by_depth(topology, numa_depth, i);
+
+	return result;
+}
+
+size_t getNumNumaNodes()
+{
+	size_t result = 1;
+
+	hwloc_topology_t topology;
+	hwloc_topology_init(&topology);
+	if(hwloc_topology_load(topology) < 0)
+		return result;
+
+	std::vector<hwloc_obj_t> num_nodes = getNumaNodes(topology);
+	result = num_nodes.size();
+	if(result == 0)
+		result = 1;
+
+	hwloc_topology_destroy(topology);
+	return result;
+}
+
+#else
+
+void hwlocBind(size_t)
+{
+}
+
+size_t getNumNumaNodes()
+{
+	return 1;
+}
+
+size_t numdaId(size_t puId)
+{
+	return 0;
+}
+
+#endif
diff --git a/xmrstak/backend/cpu/hwlocHelper.hpp b/xmrstak/backend/cpu/hwlocHelper.hpp
new file mode 100644
index 000000000..12cd29e65
--- /dev/null
+++ b/xmrstak/backend/cpu/hwlocHelper.hpp
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <vector>
+#include <cstddef>
+
+#ifndef CONF_NO_HWLOC
+#	include <hwloc.h>
+#endif
+
+/** pin memory to NUMA node
+ *
+ * Set the default memory policy for the current thread to bind memory to the
+ * NUMA node.
+ *
+ * @param puId core id
+ */
+void hwlocBind(size_t puId);
+
+/** get numa node id based on a thread id
+ *
+ * @return 0 if no numa node is found, else numa id (zero based)
+ */
+size_t numdaId(size_t puId);
+
+/** number of numa nodes
+ *
+ * @return if no numa node is found 1 will be returned
+ */
+size_t getNumNumaNodes();
+
+#ifndef CONF_NO_HWLOC
+	std::vector<hwloc_obj_t> getNumaNodes(hwloc_topology_t topology);
+#endif
\ No newline at end of file
diff --git a/xmrstak/backend/cpu/hwlocMemory.cpp b/xmrstak/backend/cpu/hwlocMemory.cpp
deleted file mode 100644
index 067f27975..000000000
--- a/xmrstak/backend/cpu/hwlocMemory.cpp
+++ /dev/null
@@ -1,82 +0,0 @@
-#include "xmrstak/backend/cpu/hwlocMemory.hpp"
-
-#ifndef CONF_NO_HWLOC
-
-#include "xmrstak/misc/console.hpp"
-
-#include <hwloc.h>
-
-static __hwloc_inline int
-xmrstak_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
-{
-#if HWLOC_API_VERSION >= 0x20000
-	return hwloc_set_membind(
-		topology,
-		nodeset,
-		policy,
-		flags| HWLOC_MEMBIND_BYNODESET);
-#else
-	return hwloc_set_membind_nodeset(
-		topology,
-		nodeset,
-		policy,
-		flags);
-#endif
-}
-
-/** pin memory to NUMA node
- *
- * Set the default memory policy for the current thread to bind memory to the
- * NUMA node.
- *
- * @param puId core id
- */
-void bindMemoryToNUMANode(size_t puId)
-{
-	int depth;
-	hwloc_topology_t topology;
-
-	hwloc_topology_init(&topology);
-	hwloc_topology_load(topology);
-
-	if(!hwloc_topology_get_support(topology)->membind->set_thisthread_membind)
-	{
-		printer::inst()->print_msg(L0, "hwloc: set_thisthread_membind not supported");
-		hwloc_topology_destroy(topology);
-		return;
-	}
-
-	depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
-
-	for(uint32_t i = 0;
-		i < hwloc_get_nbobjs_by_depth(topology, depth);
-		i++)
-	{
-		hwloc_obj_t pu = hwloc_get_obj_by_depth(topology, depth, i);
-		if(pu->os_index == puId)
-		{
-			if(0 > xmrstak_set_membind_nodeset(
-					   topology,
-					   pu->nodeset,
-					   HWLOC_MEMBIND_BIND,
-					   HWLOC_MEMBIND_THREAD))
-			{
-				printer::inst()->print_msg(L0, "hwloc: can't bind memory");
-			}
-			else
-			{
-				printer::inst()->print_msg(L0, "hwloc: memory pinned");
-				break;
-			}
-		}
-	}
-
-	hwloc_topology_destroy(topology);
-}
-#else
-
-void bindMemoryToNUMANode(size_t)
-{
-}
-
-#endif
diff --git a/xmrstak/backend/cpu/hwlocMemory.hpp b/xmrstak/backend/cpu/hwlocMemory.hpp
deleted file mode 100644
index 42fa3456f..000000000
--- a/xmrstak/backend/cpu/hwlocMemory.hpp
+++ /dev/null
@@ -1,12 +0,0 @@
-#pragma once
-
-#include <cstddef>
-
-/** pin memory to NUMA node
- *
- * Set the default memory policy for the current thread to bind memory to the
- * NUMA node.
- *
- * @param puId core id
- */
-void bindMemoryToNUMANode(size_t puId);
diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp
index 5773d472d..b1e5a0497 100644
--- a/xmrstak/backend/cpu/minethd.cpp
+++ b/xmrstak/backend/cpu/minethd.cpp
@@ -35,7 +35,7 @@
 #include "xmrstak/jconf.hpp"
 #include "xmrstak/misc/executor.hpp"
 
-#include "hwlocMemory.hpp"
+#include "xmrstak/backend/cpu/hwlocHelper.hpp"
 #include "xmrstak/backend/miner_work.hpp"
 
 #ifndef CONF_NO_HWLOC
@@ -147,9 +147,11 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, int64_t affinity)
 
 	order_guard.wait();
 
+#if defined(CONF_NO_HWLOC) || defined(_WIN32)
 	if(affinity >= 0) //-1 means no affinity
 		if(!thd_setaffinity(oWorkThd.native_handle(), affinity))
 			printer::inst()->print_msg(L1, "WARNING setting affinity failed.");
+#endif
 }
 
 cryptonight_ctx* minethd::minethd_alloc_ctx()
@@ -276,7 +278,9 @@ bool minethd::self_test()
 				cryptonight_free_ctx(ctx[j]);
 			return false;
 		}
+		ctx[i]->numa = 0;
 	}
+	randomX_global_ctx::inst().init(0);
 
 	bool bResult = true;
 
@@ -347,7 +351,10 @@ bool minethd::self_test()
 	}
 
 	for(int i = 0; i < MAX_N; i++)
+	{
 		cryptonight_free_ctx(ctx[i]);
+	}
+	randomX_global_ctx::inst().release(0);
 
 	return bResult;
 }
@@ -545,8 +552,10 @@ void minethd::prep_multiway_work(uint8_t* bWorkBlob, uint32_t** piNonce)
 template <uint32_t N>
 void minethd::multiway_work_main()
 {
+	// keep init phase in some order
+	std::this_thread::sleep_for(std::chrono::milliseconds(2 * affinity));
 	if(affinity >= 0) //-1 means no affinity
-		bindMemoryToNUMANode(affinity);
+		hwlocBind(affinity);
 
 	order_fix.set_value();
 	std::unique_lock<std::mutex> lck(thd_aff_set);
@@ -573,10 +582,13 @@ void minethd::multiway_work_main()
 				cryptonight_free_ctx(ctx[j]);
 			win_exit(1);
 		}
+		ctx[i]->numa = affinity < 0 ? 0 : numdaId(affinity);
 		piHashVal[i] = (uint64_t*)(bHashOut + 32 * i + 24);
 		piNonce[i] = (i == 0) ? (uint32_t*)(bWorkBlob + 39) : nullptr;
 	}
 
+	randomX_global_ctx::inst().init(ctx[0]->numa);
+
 	if(!oWork.bStall)
 		prep_multiway_work<N>(bWorkBlob, piNonce);
 
diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp
index 0559601aa..02bef121c 100644
--- a/xmrstak/backend/cryptonight.hpp
+++ b/xmrstak/backend/cryptonight.hpp
@@ -50,30 +50,13 @@ struct xmrstak_algo
 		base_algo(name_id)
 	{
 	}
-	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm) :
-		algo_name(name_id),
-		base_algo(algorithm)
-	{
-	}
-	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration) :
-		algo_name(name_id),
-		base_algo(algorithm),
-		iter(iteration)
-	{
-	}
-	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory) :
-		algo_name(name_id),
-		base_algo(algorithm),
-		iter(iteration),
-		mem(memory)
-	{
-	}
-	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory, uint32_t mem_mask) :
+
+	xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, size_t l3, size_t l2, size_t l1) :
 		algo_name(name_id),
 		base_algo(algorithm),
-		iter(iteration),
-		mem(memory),
-		mask(mem_mask)
+		m_l3(l3),
+		m_l2(l2),
+		m_l1(l1)
 	{
 	}
 
@@ -83,7 +66,7 @@ struct xmrstak_algo
 	 */
 	bool operator==(const xmrstak_algo& other) const
 	{
-		return other.Id() == Id() && other.Mem() == Mem() && other.Iter() == Iter() && other.Mask() == Mask();
+		return other.Id() == Id() && other.L3() == L3() && other.L2() == L2() && other.L1() == L1();
 	}
 
 	bool operator==(const xmrstak_algo_id& id) const
@@ -106,17 +89,19 @@ struct xmrstak_algo
 		return algo_name;
 	}
 
-	size_t Mem() const
+	size_t L3() const
 	{
-		if(base_algo == invalid_algo)
-			return 0;
-		else
-			return mem;
+		return m_l3;
 	}
 
-	uint32_t Iter() const
+	size_t L2() const
 	{
-		return iter;
+		return m_l2;
+	}
+
+	size_t L1() const
+	{
+		return m_l1;
 	}
 
 	/** Name of the algorithm
@@ -137,36 +122,30 @@ struct xmrstak_algo
 		return get_algo_name(base_algo);
 	}
 
-	uint32_t Mask() const
-	{
-		// default is a 16 byte aligne mask
-		if(mask == 0)
-			return ((mem - 1u) / 16) * 16;
-		else
-			return mask;
-	}
+
 
 	xmrstak_algo_id algo_name = invalid_algo;
 	xmrstak_algo_id base_algo = invalid_algo;
-	uint32_t iter = 0u;
-	size_t mem = 0u;
-	uint32_t mask = 0u;
+	size_t m_l3 = 1u; // avoid diffision by zero
+	size_t m_l2 = 1u;
+	size_t m_l1 = 1u;
 };
 
 // default cryptonight
-constexpr size_t CN_MEMORY = 2 * 1024 * 1024;
-constexpr uint32_t CN_ITER = 0x80000;
-constexpr uint32_t CN_MASK = ((CN_MEMORY - 1) / 16) * 16;
+constexpr size_t _2MiB = 2 * 1024 * 1024;
+constexpr size_t _256KiB = 256 * 1024;
+constexpr size_t _16KiB = 16 * 1024;
 
 constexpr uint32_t RX_ARQMA_ITER = 0x10000;
 
 inline xmrstak_algo POW(xmrstak_algo_id algo_id)
 {
-	static std::array<xmrstak_algo, 5> pow = {{{invalid_algo, invalid_algo},
-		{randomX, randomX, CN_ITER, CN_MEMORY},
-		{randomX_loki, randomX_loki, CN_ITER, CN_MEMORY},
-		{randomX_wow, randomX_wow, CN_ITER, CN_MEMORY/2},
-		{randomX_arqma, randomX_arqma, RX_ARQMA_ITER, CN_MEMORY/8}
+	static std::array<xmrstak_algo, 5> pow = {{
+		{invalid_algo},
+		{randomX, randomX, _2MiB, _256KiB, _16KiB},
+		{randomX_loki, randomX_loki, _2MiB, _256KiB, _16KiB},
+		{randomX_wow, randomX_wow, _2MiB/2, _256KiB/2, _16KiB},
+		{randomX_arqma, randomX_arqma, _2MiB/8, _256KiB/2, _16KiB}
 	}};
 
 	return pow[algo_id];
diff --git a/xmrstak/backend/nvidia/RandomX/randomx.cu b/xmrstak/backend/nvidia/RandomX/randomx.cu
index 37a1f5fe9..601314529 100644
--- a/xmrstak/backend/nvidia/RandomX/randomx.cu
+++ b/xmrstak/backend/nvidia/RandomX/randomx.cu
@@ -33,7 +33,7 @@ void randomx_prepare(nvid_ctx *ctx, const uint8_t* seed_hash, const xmrstak_algo
         CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_rx_dataset, dataset_size));
     }
     if (!ctx->d_long_state) {
-        ctx->d_scratchpads_size = batch_size * (miner_algo.Mem() + 64llu);
+        ctx->d_scratchpads_size = batch_size * (miner_algo.L3() + 64llu);
         CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_long_state, ctx->d_scratchpads_size));
     }
     if (!ctx->d_rx_hashes) {
@@ -52,6 +52,6 @@ void randomx_prepare(nvid_ctx *ctx, const uint8_t* seed_hash, const xmrstak_algo
 	///we do not allow switching between different randomx algorithms
     if((memcmp(ctx->rx_dataset_seedhash, seed_hash, sizeof(ctx->rx_dataset_seedhash)) != 0)) {
         memcpy(ctx->rx_dataset_seedhash, seed_hash, sizeof(ctx->rx_dataset_seedhash));
-        CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_rx_dataset, getRandomXDataset(), dataset_size, cudaMemcpyHostToDevice));
+        CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_rx_dataset, getRandomXDataset(0), dataset_size, cudaMemcpyHostToDevice));
     }
 }
diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp
index f585c9933..f4a9844c8 100644
--- a/xmrstak/backend/nvidia/minethd.cpp
+++ b/xmrstak/backend/nvidia/minethd.cpp
@@ -25,7 +25,7 @@
 #include "autoAdjust.hpp"
 #include "xmrstak/backend/cpu/crypto/cryptonight.h"
 #include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h"
-#include "xmrstak/backend/cpu/hwlocMemory.hpp"
+#include "xmrstak/backend/cpu/hwlocHelper.hpp"
 #include "xmrstak/backend/cpu/minethd.hpp"
 #include "xmrstak/backend/cryptonight.hpp"
 #include "xmrstak/jconf.hpp"
@@ -96,9 +96,11 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg)
 void minethd::start_mining()
 {
 	thread_work_promise.set_value();
+#if defined(CONF_NO_HWLOC) || defined(_WIN32)
 	if(this->affinity >= 0) //-1 means no affinity
 		if(!cpu::minethd::thd_setaffinity(oWorkThd.native_handle(), affinity))
 			printer::inst()->print_msg(L1, "WARNING setting affinity failed.");
+#endif
 }
 
 bool minethd::self_test()
@@ -181,7 +183,7 @@ std::vector<iBackend*>* minethd::thread_starter(uint32_t threadOffset, miner_wor
 void minethd::work_main()
 {
 	if(affinity >= 0) //-1 means no affinity
-		bindMemoryToNUMANode(affinity);
+		hwlocBind(affinity);
 
 	if(cuda_get_deviceinfo(&ctx) != 0 || cryptonight_extra_cpu_init(&ctx) != 1)
 	{
@@ -198,6 +200,8 @@ void minethd::work_main()
 
 	cryptonight_ctx* cpu_ctx;
 	cpu_ctx = cpu::minethd::minethd_alloc_ctx();
+	cpu_ctx->numa = affinity < 0 ? 0 : numdaId(affinity);
+	randomX_global_ctx::inst().init(cpu_ctx->numa);
 
 	// start with root algorithm and switch later if fork version is reached
 	auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription().GetMiningAlgoRoot();
@@ -272,7 +276,7 @@ void minethd::work_main()
 
 			if(ctx.d_scratchpads_size)
 			{
-				const uint32_t num_scratchpads = ctx.d_scratchpads_size / miner_algo.Mem();
+				const uint32_t num_scratchpads = ctx.d_scratchpads_size / miner_algo.L3();
 				if (h_per_round > num_scratchpads)
 				{
 					h_per_round = num_scratchpads;
diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
index b38b39676..b8b10339d 100644
--- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
+++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu
@@ -74,7 +74,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx)
 	size_t hashMemSize = 0;
 	for(const auto algo : neededAlgorithms)
 	{
-		hashMemSize = std::max(hashMemSize, algo.Mem());
+		hashMemSize = std::max(hashMemSize, algo.L3());
 	}
 
 	size_t wsize = ctx->device_blocks * ctx->device_threads;
@@ -319,7 +319,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx)
 		size_t hashMemSize = 0;
 		for(const auto algo : neededAlgorithms)
 		{
-			hashMemSize = std::max(hashMemSize, algo.Mem());
+			hashMemSize = std::max(hashMemSize, algo.L3());
 		}
 
 		const size_t dataset_size = getRandomXDatasetSize();
diff --git a/xmrstak/misc/win_msr.cpp b/xmrstak/misc/win_msr.cpp
new file mode 100644
index 000000000..6a6dd4239
--- /dev/null
+++ b/xmrstak/misc/win_msr.cpp
@@ -0,0 +1,196 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik              <jgarzik@pobox.com>
+ * Copyright 2012-2014 pooler                   <pooler@litecoinpool.org>
+ * Copyright 2014      Lucas Jones              <https://github.com/lucasjones>
+ * Copyright 2014-2016 Wolf9466                 <https://github.com/OhGodAPet>
+ * Copyright 2016      Jay D Dee                <jayddee246@gmail.com>
+ * Copyright 2018      Lee Clagett              <https://github.com/vtnerd>
+ * Copyright 2018-2019 tevador                  <tevador@gmail.com>
+ * Copyright 2018-2019 SChernykh                <https://github.com/SChernykh>
+ * Copyright 2000      Transmeta Corporation    <https://github.com/intel/msr-tools>
+ * Copyright 2004-2008 H. Peter Anvin           <https://github.com/intel/msr-tools>
+ * Copyright 2016-2019 XMRig                    <https://github.com/xmrig>, <support@xmrig.com>
+ * Copyright 2017-2019 XMR-Stak                 <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
+ *
+ *   This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ *   This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ *   You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef _WIN32
+
+#include "xmrstak/misc/console.hpp"
+#include "win_msr.hpp"
+
+#include <windows.h>
+#include <string>
+#include <thread>
+
+#define SERVICE_NAME L"WinRing0_1_2_0"
+
+static SC_HANDLE hManager;
+static SC_HANDLE hService;
+
+static bool uninstall_driver()
+{
+    bool result = true;
+    DWORD err;
+    SERVICE_STATUS serviceStatus;
+    if (!ControlService(hService, SERVICE_CONTROL_STOP, &serviceStatus)) {
+        err = GetLastError();
+		printer::inst()->print_msg(L0, "Failed to stop WinRing0 driver, error %u", err);
+        result = false;
+    }
+    if (!DeleteService(hService)) {
+        err = GetLastError();
+		printer::inst()->print_msg(L0, "Failed to remove WinRing0 driver, error %u", err);
+        result = false;
+    }
+    return result;
+}
+
+static HANDLE install_driver()
+{
+    DWORD err = 0;
+
+    hManager = OpenSCManager(nullptr, nullptr, SC_MANAGER_ALL_ACCESS);
+    if (!hManager) {
+        err = GetLastError();
+		printer::inst()->print_msg(L0, "Failed to open service control manager, error %u", err);
+        return 0;
+    }
+
+    std::vector<wchar_t> dir;
+    dir.resize(MAX_PATH);
+    do {
+        dir.resize(dir.size() * 2);
+        DWORD len = GetModuleFileNameW(NULL, dir.data(), dir.size());
+        err = GetLastError();
+    } while (err == ERROR_INSUFFICIENT_BUFFER);
+
+    if (err != ERROR_SUCCESS) {
+		printer::inst()->print_msg(L0, "Failed to get path to driver, error %u", err);
+        return 0;
+    }
+
+    for (auto it = dir.end(); it != dir.begin(); --it) {
+        if ((*it == L'\\') || (*it == L'/')) {
+            ++it;
+            *it = L'\0';
+            break;
+        }
+    }
+
+    std::wstring driverPath = dir.data();
+    driverPath += L"WinRing0x64.sys";
+
+    hService = OpenServiceW(hManager, SERVICE_NAME, SERVICE_ALL_ACCESS);
+    if (hService) {
+        if (!uninstall_driver()) {
+            return 0;
+        }
+        CloseServiceHandle(hService);
+        hService = 0;
+    }
+    else {
+        err = GetLastError();
+        if (err != ERROR_SERVICE_DOES_NOT_EXIST) {
+			printer::inst()->print_msg(L0, "Failed to open WinRing0 driver, error %u", err);
+            return 0;
+        }
+    }
+
+    hService = CreateServiceW(hManager, SERVICE_NAME, SERVICE_NAME, SERVICE_ALL_ACCESS, SERVICE_KERNEL_DRIVER, SERVICE_DEMAND_START, SERVICE_ERROR_NORMAL, driverPath.c_str(), nullptr, nullptr, nullptr, nullptr, nullptr);
+    if (!hService) {
+		printer::inst()->print_msg(L0, "Failed to install WinRing0 driver, error %u", err);
+    }
+
+    if (!StartService(hService, 0, nullptr)) {
+        err = GetLastError();
+        if (err != ERROR_SERVICE_ALREADY_RUNNING) {
+			printer::inst()->print_msg(L0, "Failed to start WinRing0 driver, error %u", err);
+            return 0;
+        }
+    }
+
+    HANDLE hDriver = CreateFileW(L"\\\\.\\" SERVICE_NAME, GENERIC_READ | GENERIC_WRITE, 0, nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr);
+    if (!hDriver) {
+        err = GetLastError();
+ 		printer::inst()->print_msg(L0, "Failed to connect to WinRing0 driver, error %u", err);
+        return 0;
+    }
+
+    return hDriver;
+}
+
+
+#define IOCTL_WRITE_MSR CTL_CODE(40000, 0x822, METHOD_BUFFERED, FILE_ANY_ACCESS)
+
+static bool wrmsr(HANDLE hDriver, uint32_t reg, uint64_t value) {
+    struct {
+        uint32_t reg;
+        uint32_t value[2];
+    } input;
+    static_assert(sizeof(input) == 12, "Invalid struct size for WinRing0 driver");
+
+    input.reg = reg;
+    *((uint64_t*)input.value) = value;
+
+    DWORD output;
+    DWORD k;
+    if (!DeviceIoControl(hDriver, IOCTL_WRITE_MSR, &input, sizeof(input), &output, sizeof(output), &k, nullptr)) {
+        const DWORD err = GetLastError();
+		printer::inst()->print_msg(L0, "Setting MSR %x to %llx failed.", reg, int_port(value));
+        return false;
+    }
+
+    return true;
+}
+
+void load_win_msrs(const std::vector<msr_reg>& regs)
+{
+	printer::inst()->print_msg(L0, "MSR mod: loading WinRing0 driver");
+
+    HANDLE hDriver = install_driver();
+    if (!hDriver) {
+        if (hService) {
+            uninstall_driver();
+            CloseServiceHandle(hService);
+        }
+        if (hManager) {
+            CloseServiceHandle(hManager);
+        }
+        return;
+    }
+
+    printer::inst()->print_msg(L0, "MSR mod: setting MSR register values");
+
+	std::thread wrmsr_thread([hDriver, &regs]() {
+        for (uint64_t i = 0, n = std::thread::hardware_concurrency(); i < n; ++i) {
+			SetThreadAffinityMask(GetCurrentThread(), 1ULL << i);
+			for(const msr_reg& r : regs)
+				wrmsr(hDriver, r.addr, r.val);
+        }
+    });
+    wrmsr_thread.join();
+
+    CloseHandle(hDriver);
+
+    uninstall_driver();
+
+    CloseServiceHandle(hService);
+    CloseServiceHandle(hManager);
+
+	printer::inst()->print_msg(L0, "MSR mod: all done, WinRing0 driver unloaded");
+}
+
+#endif
diff --git a/xmrstak/misc/win_msr.hpp b/xmrstak/misc/win_msr.hpp
new file mode 100644
index 000000000..520b7a454
--- /dev/null
+++ b/xmrstak/misc/win_msr.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <inttypes.h>
+#include <vector>
+
+struct msr_reg
+{
+	uint32_t addr;
+	uint64_t val;
+};
+
+#ifdef _WIN32
+void load_win_msrs(const std::vector<msr_reg>& regs);
+#else
+void load_win_msrs(const std::vector<msr_reg>& regs) {}
+#endif
+
diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp
index 9ac49d889..e7267e855 100644
--- a/xmrstak/net/jpsock.cpp
+++ b/xmrstak/net/jpsock.cpp
@@ -685,9 +685,7 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes
 	/*Extensions*/
 	char sAlgo[64] = {0};
 	char sBaseAlgo[64] = {0};
-	char sIterations[32] = {0};
 	char sMemory[32] = {0};
-	char sMemAlignBytes[32] = {0};
 	char sBackend[64] = {0};
 	char sHashcount[128] = {0};
 
@@ -702,9 +700,7 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes
 		snprintf(sAlgo, sizeof(sAlgo), ",\"algo\":\"%s\"", algo.Name().c_str());
 		// the real algorithm with three degrees of freedom
 		snprintf(sBaseAlgo, sizeof(sBaseAlgo), ",\"base_algo\":\"%s\"", algo.BaseName().c_str());
-		snprintf(sIterations, sizeof(sIterations), ",\"iterations\":\"0x%08x\"", algo.Iter());
-		snprintf(sMemory, sizeof(sMemory), ",\"scratchpad\":\"0x%08x\"", (uint32_t)algo.Mem());
-		snprintf(sMemAlignBytes, sizeof(sMemAlignBytes), ",\"mask\":\"0x%08x\"", algo.Mask());
+		snprintf(sMemory, sizeof(sMemory), ",\"scratchpad\":\"0x%08x\"", (uint32_t)algo.L3());
 	}
 
 	bin2hex((unsigned char*)&iNonce, 4, sNonce);
@@ -713,8 +709,8 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes
 	bin2hex(bResult, 32, sResult);
 	sResult[64] = '\0';
 
-	snprintf(cmd_buffer, sizeof(cmd_buffer), "{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"%s%s%s%s%s%s%s},\"id\":1}\n",
-		sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo, sBaseAlgo, sIterations, sMemory, sMemAlignBytes);
+	snprintf(cmd_buffer, sizeof(cmd_buffer), "{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"%s%s%s%s%s},\"id\":1}\n",
+		sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo, sBaseAlgo, sMemory);
 
 	uint64_t messageId = 0;
 	opq_json_val oResult(nullptr);
diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp
index 761ebed58..dec78d1b3 100644
--- a/xmrstak/version.cpp
+++ b/xmrstak/version.cpp
@@ -20,7 +20,7 @@
 #endif
 
 #define XMR_STAK_NAME "xmr-stak-rx"
-#define XMR_STAK_VERSION "1.0.3-rx"
+#define XMR_STAK_VERSION "1.0.4-rx"
 
 #if defined(_WIN32)
 #define OS_TYPE "win"