diff --git a/CMakeLists.txt b/CMakeLists.txt index f1fb146e5..79a0fab34 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -542,10 +542,19 @@ target_link_libraries(xmr-stak-rx ${MHTD} ${LIBS} xmr-stak-rx-backend) # Install ################################################################################ +# install booster script +if(NOT WIN32) + install(DIRECTORY "${CMAKE_SOURCE_DIR}/scripts/" DESTINATION ${CMAKE_INSTALL_PREFIX}/${LIBRARY_OUTPUT_PATH} + FILES_MATCHING PATTERN "*.sh" + PERMISSIONS OWNER_EXECUTE OWNER_READ OWNER_WRITE GROUP_READ GROUP_EXECUTE + PATTERN .git EXCLUDE + ) +endif() + # do not install the binary if the project and install are equal if( NOT CMAKE_INSTALL_PREFIX STREQUAL PROJECT_BINARY_DIR ) install(TARGETS xmr-stak-rx - RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/${EXECUTABLE_OUTPUT_PATH}") + RUNTIME DESTINATION "${CMAKE_INSTALL_PREFIX}/${EXECUTABLE_OUTPUT_PATH}") if(CUDA_FOUND) if(WIN32) install(TARGETS xmrstakrx_cuda_backend diff --git a/doc/tuning.md b/doc/tuning.md index 19427b4b2..d7151a61f 100644 --- a/doc/tuning.md +++ b/doc/tuning.md @@ -2,6 +2,7 @@ ## Content Overview * [Fast Startup](#fast-startup) +* [Booster Script](#booster-script) * [Huge Page Support](#huge-page-support) * [Benchmark](#benchmark) * [Windows](#windows) @@ -24,9 +25,19 @@ ## Fast Startup You can disable the miner self test performed on each miner start by adding the command line option `--noTest` +## Booster Script + +The linux booster script manipulates the CPU caching behavior and activates huge pages. + +Call `sudo ./randomx_booster.sh`. + +The booster script will try to install all dependencies if need. +If the script can not install the dependencies (e.g. unknown systems) please install the tools `wrmsr` and `numactl`. + ## Huge Page Support In Linux you can enable 2 MiB huge pages with the following command. +In linux you can use our [booster script](#booster-script) to active huge pages ``` sudo sysctl -w vm.nr_hugepages=1300 diff --git a/scripts/randomx_booster.sh b/scripts/randomx_booster.sh new file mode 100755 index 000000000..f36f38c19 --- /dev/null +++ b/scripts/randomx_booster.sh @@ -0,0 +1,111 @@ +#!/usr/bin/env bash +# based on xmrig's randomx_boost.sh script +# lifted by psychocrypt + +function help() +{ + echo "$(basename $0) modifies caching behaviors of your CPU" + echo "and activates huge pages." + echo "Reboot your system to revert the changes." + echo "" + echo "must be called with administrative privileges e.g. 'sudo $(basename $0)'" +} + +if [ $# -ge 1 ] ; then + help + exit 1 +fi + +hasAptGet=$(which apt-get >/dev/null && { echo 1; } || { echo 0; }) +hasApt=$(which apt >/dev/null && { echo 1; } || { echo 0; }) +hasYum=$(which yum >/dev/null && { echo 1; } || { echo 0; }) + +tools=$(which wrmsr >/dev/null || { echo "msr-tools "; })$(which numactl >/dev/null || { echo " numactl"; }) + +if [ -n "$tools" ] ; then + echo "tool '$tools' not found, $(basename $0) is trying to install the dependency" + if [ $hasAptGet -eq 1 ] ; then + comm="apt-get --no-install-recommends --yes install $tools" + echo "execute: $comm" + $comm + elif [ $hasApt -eq 1 ] ; then + comm="apt-get --no-install-recommends --yes install $tools" + echo "execute: $comm" + $comm + elif [ $hasYum -eq 1 ] ; then + comm="yum install -y $tools" + echo "execute: $comm" + $comm + else + echo "package manager unknown, please install '$tools' by hand" >&2 + exit 1 + fi +fi + +hasWrmsr=$(which wrmsr >/dev/null && { echo 1; } || { echo 0; }) +if [ $hasWrmsr -eq 0 ] ; then + echo "dependency 'wrmsr' not found, tool failed" >&2 +exit 1 +fi + +hasNumactl=$(which numactl >/dev/null && { echo 1; } || { echo 0; }) +if [ $hasNumactl -eq 0 ] ; then + echo "dependency 'numactl' not found, tool failed" >&2 + exit 1 +fi + +echo "" +modprobe msr + +if cat /proc/cpuinfo | grep -q "AMD Ryzen" ; then + echo "Detected Ryzen" + wrmsr -a 0xc0011022 0x510000 + wrmsr -a 0xc001102b 0x1808cc16 + wrmsr -a 0xc0011020 0 + echo "MSR register values for Ryzen applied" + echo "WARNING: MSR register changes can result into stability issues!" + echo "Reboot your system to revert the changes." +elif cat /proc/cpuinfo | grep -q "Intel" ; then + echo "Detected Intel" + wrmsr -a 0x1a4 7 + echo "MSR register values for Intel applied" + echo "WARNING: MSR register changes can result into stability issues!" + echo "Reboot your system to revert the changes." +else + echo "No supported CPU detected" +fi + +echo "" + +### begin enable huge pages +required_num_huge_pages=1280 +num_huge_pages=$(cat /proc/meminfo | grep "HugePages_Free" | sed 's/ \{2,\}/ /g' | cut -d" " -f2) + +if [ $num_huge_pages -lt $required_num_huge_pages ] ; then + echo "active 2 MiB pages" + echo "execute: sysctl -w vm.nr_hugepages=$required_num_huge_pages" + sysctl -w vm.nr_hugepages="$required_num_huge_pages" +fi +# verify number of huge pages +num_huge_pages=$(cat /proc/meminfo | grep "HugePages_Free" | sed 's/ \{2,\}/ /g' | cut -d" " -f2) +num_memsets=$((num_huge_pages/required_num_huge_pages)) + +if [ $num_memsets -eq 0 ] ; then + echo "Error: not enough 2 MiB pages $num_huge_pages/$required_num_huge_pages" >&2 +fi + +# apply gigabyte pages last because 2MiB pages will give more performance +numNodes=$(numactl --hardware | grep available | cut -d" " -f2) +freeGigPages=$(cat /sys/kernel/mm/hugepages/hugepages-1048576kB/free_hugepages) +neededGigPages=$((numNodes * 3)) + +if [ $freeGigPages -lt $neededGigPages ] ; then + echo "" + echo "activate 1 GiB pages" + comm="echo $neededGigPages > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages" + echo "execute: $comm" + echo "$neededGigPages" > /sys/kernel/mm/hugepages/hugepages-1048576kB/nr_hugepages +fi +### end enable huge pages + +exit 0 diff --git a/xmrstak/backend/amd/amd_gpu/gpu.cpp b/xmrstak/backend/amd/amd_gpu/gpu.cpp index 7d9484999..c15b1cb4d 100644 --- a/xmrstak/backend/amd/amd_gpu/gpu.cpp +++ b/xmrstak/backend/amd/amd_gpu/gpu.cpp @@ -164,7 +164,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ size_t scratchPadSize = 0; for(const auto algo : neededAlgorithms) { - scratchPadSize = std::max(scratchPadSize, algo.Mem()); + scratchPadSize = std::max(scratchPadSize, algo.L3()); } size_t g_thd = ctx->rawIntensity; @@ -182,7 +182,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ ctx->rx_dataset[ctx->deviceIdx] = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY, dataset_size, nullptr, &ret); } else { - void* dataset = getRandomXDataset(); + void* dataset = getRandomXDataset(0); ctx->rx_dataset[ctx->deviceIdx] = clCreateBuffer(opencl_ctx, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, dataset_size, dataset, &ret); } @@ -193,7 +193,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ } } - ctx->rx_scratchpads = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, (user_algo.Mem() + 64) * g_thd, nullptr, &ret); + ctx->rx_scratchpads = clCreateBuffer(opencl_ctx, CL_MEM_READ_WRITE, (user_algo.L3() + 64) * g_thd, nullptr, &ret); if(ret != CL_SUCCESS) { printer::inst()->print_msg(L1, "Error %s when calling clCreateBuffer to create RandomX scratchpads.", err_to_str(ret)); @@ -294,9 +294,7 @@ size_t InitOpenCLGpu(cl_context opencl_ctx, GpuContext* ctx, const char* source_ for(const auto miner_algo : neededAlgorithms) { // scratchpad size for the selected mining algorithm - size_t hashMemSize = miner_algo.Mem(); - int threadMemMask = miner_algo.Mask(); - int hashIterations = miner_algo.Iter(); + size_t hashMemSize = miner_algo.L3(); std::string options; options += " -DALGO=" + std::to_string(miner_algo.Id()); @@ -1364,7 +1362,7 @@ uint64_t interleaveAdjustDelay(GpuContext* ctx, const bool enableAutoAdjustment) size_t RXSetJob(GpuContext *ctx, uint8_t *input, size_t input_len, uint64_t target, const uint8_t* seed_hash, const xmrstak_algo& miner_algo) { cl_int ret; - void* dataset = getRandomXDataset(); + void* dataset = getRandomXDataset(0); const size_t dataset_size = getRandomXDatasetSize(); if((memcmp(ctx->rx_dataset_seedhash, seed_hash, sizeof(ctx->rx_dataset_seedhash)) != 0)) diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index 6b9c95462..5167815cc 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -84,7 +84,7 @@ class autoAdjust size_t hashMemSize = 0; for(const auto algo : neededAlgorithms) { - hashMemSize = std::max(hashMemSize, algo.Mem()); + hashMemSize = std::max(hashMemSize, algo.L3()); } std::string conf; @@ -171,9 +171,10 @@ class autoAdjust ctx.gcnAsm = false; - if(hashMemSize < CN_MEMORY) + size_t _2MiB = 2llu * 1024 * 1024; + if(hashMemSize < _2MiB) { - size_t factor = CN_MEMORY / hashMemSize; + size_t factor = _2MiB / hashMemSize; // increase all intensity relative to the original scratchpad size maxThreads *= factor; } diff --git a/xmrstak/backend/amd/minethd.cpp b/xmrstak/backend/amd/minethd.cpp index 3631e3e1d..08d80ca42 100644 --- a/xmrstak/backend/amd/minethd.cpp +++ b/xmrstak/backend/amd/minethd.cpp @@ -27,7 +27,7 @@ #include "xmrstak/backend/cpu/crypto/cryptonight.h" #include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h" -#include "xmrstak/backend/cpu/hwlocMemory.hpp" +#include "xmrstak/backend/cpu/hwlocHelper.hpp" #include "xmrstak/backend/cpu/minethd.hpp" #include "xmrstak/jconf.hpp" #include "xmrstak/misc/configEditor.hpp" @@ -68,9 +68,11 @@ minethd::minethd(miner_work& pWork, size_t iNo, GpuContext* ctx, const jconf::th order_guard.wait(); +#if defined(CONF_NO_HWLOC) || defined(_WIN32) if(affinity >= 0) //-1 means no affinity if(!cpu::minethd::thd_setaffinity(oWorkThd.native_handle(), affinity)) printer::inst()->print_msg(L1, "WARNING setting affinity failed."); +#endif } extern "C" @@ -164,7 +166,7 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor void minethd::work_main() { if(affinity >= 0) //-1 means no affinity - bindMemoryToNUMANode(affinity); + hwlocBind(affinity); order_fix.set_value(); std::unique_lock lck(thd_aff_set); @@ -173,6 +175,8 @@ void minethd::work_main() cryptonight_ctx* cpu_ctx; cpu_ctx = cpu::minethd::minethd_alloc_ctx(); + cpu_ctx->numa = affinity < 0 ? 0 : numdaId(affinity); + randomX_global_ctx::inst().init(cpu_ctx->numa); if(cpu_ctx == nullptr) { diff --git a/xmrstak/backend/cpu/autoAdjust.hpp b/xmrstak/backend/cpu/autoAdjust.hpp index caa3530ac..1602e631f 100644 --- a/xmrstak/backend/cpu/autoAdjust.hpp +++ b/xmrstak/backend/cpu/autoAdjust.hpp @@ -31,7 +31,7 @@ class autoAdjust size_t hashMemSize = 0; for(const auto algo : neededAlgorithms) { - hashMemSize = std::max(hashMemSize, algo.Mem()); + hashMemSize = std::max(hashMemSize, algo.L3()); } const size_t hashMemSizeKB = hashMemSize / 1024u; diff --git a/xmrstak/backend/cpu/autoAdjustHwloc.hpp b/xmrstak/backend/cpu/autoAdjustHwloc.hpp index f06244c8a..61dacad54 100644 --- a/xmrstak/backend/cpu/autoAdjustHwloc.hpp +++ b/xmrstak/backend/cpu/autoAdjustHwloc.hpp @@ -4,6 +4,7 @@ #include "xmrstak/misc/configEditor.hpp" #include "xmrstak/misc/console.hpp" #include "xmrstak/params.hpp" +#include "xmrstak/backend/cpu/hwlocHelper.hpp" #ifdef _WIN32 #include @@ -15,6 +16,7 @@ #include #include +#include namespace xmrstak { @@ -30,9 +32,9 @@ class autoAdjustHwloc for(const auto algo : neededAlgorithms) { - hashMemSize = std::max(hashMemSize, algo.Mem()); + l3MemRequire = std::max(l3MemRequire, algo.L3()); + l2MemRequire = std::max(l2MemRequire, algo.L2()); } - halfHashMemSize = hashMemSize / 2u; } bool printConfig() @@ -40,7 +42,8 @@ class autoAdjustHwloc hwloc_topology_t topology; hwloc_topology_init(&topology); - hwloc_topology_load(topology); + if(hwloc_topology_load(topology) < 0) + return false; std::string conf; configEditor configTpl{}; @@ -54,25 +57,24 @@ class autoAdjustHwloc bool is_successful = true; try { - std::vector tlcs; - tlcs.reserve(16); - results.reserve(16); + std::vector tlcs; findChildrenCaches(hwloc_get_root_obj(topology), [&tlcs](hwloc_obj_t found) { tlcs.emplace_back(found); }); if(tlcs.size() == 0) throw(std::runtime_error("The CPU doesn't seem to have a cache.")); - + printer::inst()->print_msg(LDEBUG,"process %u cache elements", uint32_t(tlcs.size())); for(hwloc_obj_t obj : tlcs) processTopLevelCache(obj); - for(uint32_t id : results) + + for(const auto& thd : threads) { conf += std::string(" { \"low_power_mode\" : "); - conf += std::string((id & 0x8000000) != 0 ? "true" : "false"); + conf += std::to_string(thd.num_hashes); conf += std::string(", \"affine_to_cpu\" : "); - conf += std::to_string(id & 0x7FFFFFF); + conf += std::to_string(thd.core_id); conf += std::string(" },\n"); } } @@ -92,10 +94,20 @@ class autoAdjustHwloc } private: - size_t hashMemSize = 0; - size_t halfHashMemSize = 0; + size_t l3MemRequire = 0; + size_t l2MemRequire = 0; + + struct Thread + { + Thread(const uint32_t c_id, const uint32_t n_hash) : + core_id(c_id), num_hashes(n_hash) + {} + + uint32_t core_id = 0; + uint32_t num_hashes = 1; + }; - std::vector results; + std::vector threads; template inline void findChildrenByType(hwloc_obj_t obj, hwloc_obj_type_t type, func lambda) @@ -143,16 +155,16 @@ class autoAdjustHwloc if(obj->attr == nullptr) throw(std::runtime_error("Cache object hasn't got attributes.")); - size_t PUs = 0; - findChildrenByType(obj, HWLOC_OBJ_PU, [&PUs](hwloc_obj_t found) { PUs++; }); + size_t numPUs = 0; + findChildrenByType(obj, HWLOC_OBJ_PU, [&numPUs](hwloc_obj_t found) { numPUs++; }); //Strange case, but we will handle it silently, surely there must be one PU somewhere? - if(PUs == 0) + if(numPUs == 0) return; if(obj->attr->cache.size == 0) { - //We will always have one child if PUs > 0 + //We will always have one child if numPUs > 0 if(!isCacheObject(obj->children[0])) throw(std::runtime_error("The CPU doesn't seem to have a cache.")); @@ -162,27 +174,58 @@ class autoAdjustHwloc return; } - size_t cacheSize = obj->attr->cache.size; - if(isCacheExclusive(obj)) + size_t l3CacheSize = obj->attr->cache.size; + size_t numL2Caches = obj->arity; + bool isExclusive = isCacheExclusive(obj); + size_t l2CacheSize = 0u; + if(obj->attr->cache.depth == 3) { - for(size_t i = 0; i < obj->arity; i++) + for(size_t i = 0; i < numL2Caches; i++) { hwloc_obj_t l2obj = obj->children[i]; - //If L2 is exclusive and greater or equal to 2MB add room for one more hash - if(isCacheObject(l2obj) && l2obj->attr != nullptr && l2obj->attr->cache.size >= hashMemSize) - cacheSize += hashMemSize; + if(isCacheObject(l2obj) && l2obj->attr) + { + //If L3 is exclusive and greater or equal to 2MB add room for one more hash + if(isExclusive && l2obj->attr->cache.size >= l3MemRequire) + l3CacheSize += l3MemRequire; + else + l2CacheSize += l2obj->attr->cache.size; + } } } + size_t l2CacheSizePerHash = l2CacheSize / numL2Caches; + printer::inst()->print_msg(LDEBUG,"%u L3 cache, required per hash %u", uint32_t(l3CacheSize), uint32_t(l3MemRequire)); + printer::inst()->print_msg(LDEBUG,"%u L2 cache, required per hash %u", uint32_t(l2CacheSize), uint32_t(l2MemRequire)); + + size_t l3CacheHashes = std::max(l3CacheSize / l3MemRequire, size_t(1u)); + size_t l2CacheHashes = std::max(l2CacheSizePerHash / l2MemRequire, size_t(1u)) * numL2Caches; + + // we have no lvl2 cache or our top lvl cache is L2 + if(l2CacheSize == 0u) + l2CacheHashes = l3CacheHashes; + std::vector cores; cores.reserve(16); findChildrenByType(obj, HWLOC_OBJ_CORE, [&cores](hwloc_obj_t found) { cores.emplace_back(found); }); - size_t cacheHashes = (cacheSize + halfHashMemSize) / hashMemSize; + printer::inst()->print_msg(LDEBUG,"%u L3 hash limit", uint32_t(l3CacheHashes)); + printer::inst()->print_msg(LDEBUG,"%u L2 hash limit", uint32_t(l2CacheHashes)); + printer::inst()->print_msg(LDEBUG,"%u PU(s) available", uint32_t(numPUs)); + size_t numHashCacheLimited = std::min(l2CacheHashes, l3CacheHashes); + // do not use more PUs than available + size_t usePus = std::min(numHashCacheLimited, numPUs); + + // currently do not use multi hash per PU (all tests has shown it is slower) + //size_t numHashesPerPu = std::max(numHashCacheLimited / numPUs, size_t(1u)); + size_t numHashesPerPu = 1u; + + printer::inst()->print_msg(LDEBUG,"use %u PU(s)", uint32_t(usePus)); + printer::inst()->print_msg(LDEBUG,"use %u hashe(s) per pu", uint32_t(numHashesPerPu)); //Firstly allocate PU 0 of every CORE, then PU 1 etc. size_t pu_id = 0; - while(cacheHashes > 0 && PUs > 0) + while(usePus > 0) { bool allocated_pu = false; for(hwloc_obj_t core : cores) @@ -192,19 +235,12 @@ class autoAdjustHwloc size_t os_id = core->children[pu_id]->os_index; - if(cacheHashes > PUs) - { - cacheHashes -= 2; - os_id |= 0x8000000; //double hash marker bit - } - else - cacheHashes--; - PUs--; - allocated_pu = true; - results.emplace_back(os_id); + threads.emplace_back(Thread(os_id, numHashesPerPu)); + + usePus--; - if(cacheHashes == 0) + if(usePus == 0) break; } diff --git a/xmrstak/backend/cpu/crypto/common/VirtualMemory_unix.cpp b/xmrstak/backend/cpu/crypto/common/VirtualMemory_unix.cpp index 68e970839..e53aced5e 100644 --- a/xmrstak/backend/cpu/crypto/common/VirtualMemory_unix.cpp +++ b/xmrstak/backend/cpu/crypto/common/VirtualMemory_unix.cpp @@ -37,6 +37,11 @@ # include #endif +#if defined(__linux__) && !defined(MAP_HUGE_SHIFT) +# include +#endif + +#include "xmrstak/misc/console.hpp" int xmrstak::VirtualMemory::m_globalFlags = 0; @@ -109,8 +114,7 @@ void *xmrstak::VirtualMemory::allocateLargePagesMemory(size_t size, size_t page_ page_size_flags |= MAP_HUGE_2MB; else if(page_size == 1024u) page_size_flags |= MAP_HUGE_1GB; - #define MAP_HUGE_2MB (21 << MAP_HUGE_SHIFT) - #define MAP_HUGE_1GB (30 << MAP_HUGE_SHIFT) + void *mem = mmap(0, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE | page_size_flags, 0, 0); # endif @@ -128,7 +132,16 @@ void xmrstak::VirtualMemory::flushInstructionCache(void *p, size_t size) void xmrstak::VirtualMemory::freeLargePagesMemory(void *p, size_t size) { - munmap(p, size); + if(munmap(p, size) != 0) + { + printer::inst()->print_msg(LDEBUG,"munmap failed %llu", (uint64_t)size); + size_t page3gib = 3llu*1024*1024*1024; + printer::inst()->print_msg(LDEBUG,"try to unmap ", page3gib); + if(munmap(p, page3gib) != 0) + { + printer::inst()->print_msg(LDEBUG,"munmap failed %llu", (uint64_t)page3gib); + } + } } diff --git a/xmrstak/backend/cpu/crypto/cryptonight.h b/xmrstak/backend/cpu/crypto/cryptonight.h index 8b9e9e39f..0b5ec04e2 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight.h +++ b/xmrstak/backend/cpu/crypto/cryptonight.h @@ -13,10 +13,16 @@ #if defined _MSC_VER #define ABI_ATTRIBUTE +#include +#include +#include #else #define ABI_ATTRIBUTE __attribute__((ms_abi)) #endif +#include "xmrstak/backend/cpu/hwlocHelper.hpp" +#include "cryptonight_1.h" + struct cryptonight_ctx; typedef void (*cn_mainloop_fun)(cryptonight_ctx* ctx); @@ -34,6 +40,7 @@ struct cryptonight_ctx cn_hash_fun hash_fn = nullptr; uint8_t* fun_data = nullptr; xmrstak_algo last_algo = invalid_algo; + uint32_t numa = 0; randomx_vm* m_rx_vm = nullptr; }; @@ -53,9 +60,9 @@ struct randomX_global_ctx return *env.pGlobalCtx; } - randomx_dataset* getDataset() + randomx_dataset* getDataset(size_t numaId) { - return m_rx_dataset; + return m_rx_datasets[numaId]; } void updateDataset(const std::array& seed_hash, const uint32_t num_threads) @@ -75,7 +82,7 @@ struct randomX_global_ctx // One of the threads updates cache { - std::lock_guard g(m_rx_dataset_lock); + std::lock_guard g(m_rx_cache_lock); if(m_rx_seed_hash != seed_hash) { m_rx_seed_hash = seed_hash; @@ -87,7 +94,15 @@ struct randomX_global_ctx const uint32_t a = (randomx_dataset_item_count() * static_cast(thread_id)) / num_threads; const uint32_t b = (randomx_dataset_item_count() * (static_cast(thread_id) + 1u)) / num_threads; printer::inst()->print_msg(LDEBUG,"Thread %u start updating RandomX dataset %u %u", thread_id, a, b); - randomx_init_dataset(m_rx_dataset, m_rx_cache, a, b - a); + size_t numElements = b - a; + randomx_init_dataset(m_rx_datasets[0], m_rx_cache, a, numElements); + for(size_t i = 1; i < m_rx_datasets.size(); ++i) + { + if(m_rx_datasets[i] != nullptr) + { + memcpy((uint8_t*)getRandomXDataset(i) + a * 64u, (uint8_t*)getRandomXDataset(0) + a * 64u, numElements * 64u); + } + } printer::inst()->print_msg(LDEBUG,"Thread %u finished updating RandomX dataset", thread_id); @@ -98,42 +113,88 @@ struct randomX_global_ctx } while (m_rx_dataset_init_thread_counter.load() != 0); } -private: - randomX_global_ctx() : m_rx_dataset_init_thread_counter(0u) + void init(size_t numaId) { -#ifdef __linux__ - randomx_dataset* dataset = randomx_alloc_dataset(static_cast(RANDOMX_FLAG_LARGE_PAGES | RANDOMX_FLAG_LARGE_PAGES_1G)); - if (!dataset) { - printer::inst()->print_msg(LDEBUG,"Warning: dataset allocation with 1 GiB pages failed"); -#else - randomx_dataset* dataset = nullptr; -#endif - dataset = randomx_alloc_dataset(RANDOMX_FLAG_LARGE_PAGES); + std::unique_lock lck(dataset_locks[numaId]); + if(m_rx_datasets[numaId]) + { + printer::inst()->print_msg(LDEBUG,"dataset/cache already created for numa %u", uint32_t(numaId)); + return; + } + printer::inst()->print_msg(LDEBUG,"allocate dataset/cache for numa %u", uint32_t(numaId)); + #ifdef __linux__ + randomx_dataset* dataset = randomx_alloc_dataset(static_cast(RANDOMX_FLAG_LARGE_PAGES | RANDOMX_FLAG_LARGE_PAGES_1G)); if (!dataset) { - printer::inst()->print_msg(LDEBUG,"Warning: dataset allocation with 2 MiB pages failed"); - dataset = randomx_alloc_dataset(RANDOMX_FLAG_DEFAULT); - printer::inst()->print_msg(LDEBUG,"dataset allocated without huge pages"); + printer::inst()->print_msg(LDEBUG,"Warning: dataset allocation with 1 GiB pages failed"); + #else + randomx_dataset* dataset = nullptr; + #endif + dataset = randomx_alloc_dataset(RANDOMX_FLAG_LARGE_PAGES); + if (!dataset) + { + printer::inst()->print_msg(LDEBUG,"Warning: dataset allocation with 2 MiB pages failed"); + dataset = randomx_alloc_dataset(RANDOMX_FLAG_DEFAULT); + printer::inst()->print_msg(LDEBUG,"dataset allocated without huge pages"); + } + else + printer::inst()->print_msg(LDEBUG,"dataset allocated with 2 MiB pages"); + #ifdef __linux__ } else - printer::inst()->print_msg(LDEBUG,"dataset allocated with 2 MiB pages"); -#ifdef __linux__ + printer::inst()->print_msg(LDEBUG,"dataset allocated with 1 GiB pages"); + #endif + + m_rx_datasets[numaId] = dataset; } - else - printer::inst()->print_msg(LDEBUG,"dataset allocated with 1 GiB pages"); -#endif + { + std::unique_lock lck(m_rx_cache_lock); + if(numaId == 0 && m_rx_cache == nullptr) + { + m_rx_cache = randomx_alloc_cache(static_cast(RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES)); + if (!m_rx_cache) { + m_rx_cache = randomx_alloc_cache(RANDOMX_FLAG_JIT); + } + } + } + } - m_rx_cache = randomx_alloc_cache(static_cast(RANDOMX_FLAG_JIT | RANDOMX_FLAG_LARGE_PAGES)); - if (!m_rx_cache) { - m_rx_cache = randomx_alloc_cache(RANDOMX_FLAG_JIT); + void release(size_t numaId) + { + { + std::unique_lock lck(dataset_locks[numaId]); + if(!m_rx_datasets[numaId]) + { + printer::inst()->print_msg(LDEBUG,"dataset/cache for numa %u alreday released", uint32_t(numaId)); + return; + } + printer::inst()->print_msg(LDEBUG,"release dataset/cache for numa %u", uint32_t(numaId)); + randomx_release_dataset(m_rx_datasets[numaId]); + m_rx_datasets[numaId] = nullptr; } - m_rx_dataset = dataset; + { + std::unique_lock lck(m_rx_cache_lock); + if(numaId == 0 && m_rx_cache) + { + randomx_release_cache(m_rx_cache); + m_rx_cache = nullptr; + } + } + } + +private: + randomX_global_ctx() : m_rx_dataset_init_thread_counter(0u) + { + size_t numNumaNodes = getNumNumaNodes(); + m_rx_datasets.resize(numNumaNodes, nullptr); + dataset_locks.reset(new std::mutex[numNumaNodes]); } - std::mutex m_rx_dataset_lock; + std::mutex m_rx_cache_lock; randomx_cache* m_rx_cache = nullptr; - randomx_dataset* m_rx_dataset = nullptr; + std::unique_ptr dataset_locks; + std::vector m_rx_datasets; std::array m_rx_seed_hash = {{0}}; std::atomic m_rx_dataset_init_thread_counter; }; diff --git a/xmrstak/backend/cpu/crypto/cryptonight_1.cpp b/xmrstak/backend/cpu/crypto/cryptonight_1.cpp index b4b3a4e19..719faae53 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_1.cpp +++ b/xmrstak/backend/cpu/crypto/cryptonight_1.cpp @@ -3,9 +3,9 @@ #include "randomx/randomx.h" -void* getRandomXDataset() +void* getRandomXDataset(const size_t numaId) { - return randomx_get_dataset_memory(randomX_global_ctx::inst().getDataset()); + return randomx_get_dataset_memory(randomX_global_ctx::inst().getDataset(numaId)); } diff --git a/xmrstak/backend/cpu/crypto/cryptonight_1.h b/xmrstak/backend/cpu/crypto/cryptonight_1.h index 73e797155..d6c28c954 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_1.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_1.h @@ -1,5 +1,5 @@ #pragma once -void* getRandomXDataset(); +void* getRandomXDataset(const size_t numaId); uint64_t getRandomXDatasetSize(); diff --git a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h index 984b2c502..10b590bf0 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_aesni.h +++ b/xmrstak/backend/cpu/crypto/cryptonight_aesni.h @@ -55,44 +55,6 @@ struct RandomX_hash namespace { -template -static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t mask) -{ - const uint8_t* p = reinterpret_cast(src); - - // Workaround for Visual Studio placing trampoline in debug builds. -#if defined(_MSC_VER) - if(p[0] == 0xE9) - { - p += *(int32_t*)(p + 1) + 5; - } -#endif - - size_t size = 0; - while(*(uint32_t*)(p + size) != 0xDEADC0DE) - { - ++size; - } - size += sizeof(uint32_t); - - memcpy((void*)dst, (const void*)src, size); - - uint8_t* patched_data = reinterpret_cast(dst); - for(size_t i = 0; i + sizeof(uint32_t) <= size; ++i) - { - switch(*(uint32_t*)(patched_data + i)) - { - case CN_ITER: - *(uint32_t*)(patched_data + i) = iterations; - break; - - case CN_MASK: - *(uint32_t*)(patched_data + i) = mask; - break; - } - } -} - void* allocateExecutableMemory(size_t size) { @@ -160,9 +122,9 @@ struct RandomX_generator for(size_t i = 0; i < N; i++) { printer::inst()->print_msg(LDEBUG,"%s create vm", POW(ALGO).Name().c_str()); - ctx[i]->m_rx_vm = randomx_create_vm(static_cast(flags), nullptr, randomX_global_ctx::inst().getDataset(), ctx[i]->long_state); + ctx[i]->m_rx_vm = randomx_create_vm(static_cast(flags), nullptr, randomX_global_ctx::inst().getDataset(ctx[i]->numa), ctx[i]->long_state); if (!ctx[i]->m_rx_vm) - ctx[i]->m_rx_vm = randomx_create_vm(static_cast(flags - RANDOMX_FLAG_LARGE_PAGES), nullptr, randomX_global_ctx::inst().getDataset(), ctx[i]->long_state); + ctx[i]->m_rx_vm = randomx_create_vm(static_cast(flags - RANDOMX_FLAG_LARGE_PAGES), nullptr, randomX_global_ctx::inst().getDataset(ctx[i]->numa), ctx[i]->long_state); } } else if(algorithm_switched) diff --git a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp index 508dac047..d9175a638 100644 --- a/xmrstak/backend/cpu/crypto/cryptonight_common.cpp +++ b/xmrstak/backend/cpu/crypto/cryptonight_common.cpp @@ -191,7 +191,7 @@ cryptonight_ctx* cryptonight_alloc_ctx(size_t use_fast_mem, size_t use_mlock, al size_t hashMemSize = 0; for(const auto algo : neededAlgorithms) { - hashMemSize = std::max(hashMemSize, algo.Mem()); + hashMemSize = std::max(hashMemSize, algo.L3()); } cryptonight_ctx* ptr = (cryptonight_ctx*)_mm_malloc(sizeof(cryptonight_ctx), 4096); @@ -282,7 +282,7 @@ void cryptonight_free_ctx(cryptonight_ctx* ctx) size_t hashMemSize = 0; for(const auto algo : neededAlgorithms) { - hashMemSize = std::max(hashMemSize, algo.Mem()); + hashMemSize = std::max(hashMemSize, algo.L3()); } if(ctx->ctx_info[0] != 0) diff --git a/xmrstak/backend/cpu/crypto/randomx/asm/program_read_dataset_ryzen.inc b/xmrstak/backend/cpu/crypto/randomx/asm/program_read_dataset_ryzen.inc new file mode 100644 index 000000000..6bb87c8f9 --- /dev/null +++ b/xmrstak/backend/cpu/crypto/randomx/asm/program_read_dataset_ryzen.inc @@ -0,0 +1,18 @@ + mov rcx, rbp ;# ecx = ma + shr rcx, 32 + and ecx, RANDOMX_DATASET_BASE_MASK + xor rbp, rax ;# modify "mx" + mov rax, qword ptr [rdi+rcx] + mov edx, ebp ;# edx = mx + and edx, RANDOMX_DATASET_BASE_MASK + prefetchnta byte ptr [rdi+rdx] + ror rbp, 32 ;# swap "ma" and "mx" + xor r8, rax + xor r9, qword ptr [rdi+rcx+8] + xor r10, qword ptr [rdi+rcx+16] + xor r11, qword ptr [rdi+rcx+24] + xor r12, qword ptr [rdi+rcx+32] + xor r13, qword ptr [rdi+rcx+40] + xor r14, qword ptr [rdi+rcx+48] + xor r15, qword ptr [rdi+rcx+56] + \ No newline at end of file diff --git a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.cpp b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.cpp index bfde7d002..ffc4b1ab8 100644 --- a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.cpp +++ b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.cpp @@ -37,6 +37,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/reciprocal.h" #include "crypto/randomx/virtual_memory.hpp" +#include "../../../../misc/win_msr.hpp" + #ifdef _MSC_VER # include #else @@ -224,8 +226,6 @@ namespace randomx { {0x0F, 0x1F, 0x44, 0x00, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, }; - bool JitCompilerX86::BranchesWithin32B = false; - size_t JitCompilerX86::getCodeSize() { return codePos < prologueSize ? 0 : codePos - prologueSize; } @@ -241,8 +241,14 @@ namespace randomx { # endif } + std::atomic JitCompilerX86::flags_set(0); + uint64_t JitCompilerX86::flags = 0; // CPU-specific tweaks void JitCompilerX86::applyTweaks() { + + if(flags_set.fetch_add(1) != 0) + return; + int32_t info[4]; cpuid(0, info); @@ -252,36 +258,45 @@ namespace randomx { manufacturer[2] = info[2]; manufacturer[3] = 0; - if (strcmp((const char*)manufacturer, "GenuineIntel") == 0) { - struct - { - unsigned int stepping : 4; - unsigned int model : 4; - unsigned int family : 4; - unsigned int processor_type : 2; - unsigned int reserved1 : 2; - unsigned int ext_model : 4; - unsigned int ext_family : 8; - unsigned int reserved2 : 4; - } processor_info; - - cpuid(1, info); - memcpy(&processor_info, info, sizeof(processor_info)); + struct + { + unsigned int stepping : 4; + unsigned int model : 4; + unsigned int family : 4; + unsigned int processor_type : 2; + unsigned int reserved1 : 2; + unsigned int ext_model : 4; + unsigned int ext_family : 8; + unsigned int reserved2 : 4; + } processor_info; + + cpuid(1, info); + memcpy(&processor_info, info, sizeof(processor_info)); + if (strcmp((const char*)manufacturer, "GenuineIntel") == 0) { // Intel JCC erratum mitigation if (processor_info.family == 6) { const uint32_t model = processor_info.model | (processor_info.ext_model << 4); const uint32_t stepping = processor_info.stepping; // Affected CPU models and stepping numbers are taken from https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf - BranchesWithin32B = + set_flag(BRANCHES_WITHIN_32B, ((model == 0x4E) && (stepping == 0x3)) || ((model == 0x55) && (stepping == 0x4)) || ((model == 0x5E) && (stepping == 0x3)) || ((model == 0x8E) && (stepping >= 0x9) && (stepping <= 0xC)) || ((model == 0x9E) && (stepping >= 0x9) && (stepping <= 0xD)) || ((model == 0xA6) && (stepping == 0x0)) || - ((model == 0xAE) && (stepping == 0xA)); + ((model == 0xAE) && (stepping == 0xA))); + } + load_win_msrs({ { 0x1a4, 7 } }); + } + + if (strcmp((const char*)manufacturer, "AuthenticAMD") == 0) { + if(processor_info.family == 0x17) + { + set_flag(AMD_RYZEN_FAMILY, true); + load_win_msrs({ { 0xc0011022, 0x510000 }, { 0xc001102b, 0x1808cc16}, { 0xc0011020, 0 } }); } } } @@ -303,8 +318,20 @@ namespace randomx { void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) { generateProgramPrologue(prog, pcfg); - memcpy(code + codePos, RandomX_CurrentConfig.codeReadDatasetTweaked, readDatasetSize); - codePos += readDatasetSize; + + uint8_t* p; + uint32_t n; + if (check_flag(AMD_RYZEN_FAMILY)) { + p = RandomX_CurrentConfig.codeReadDatasetRyzenTweaked; + n = RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize; + } + else { + p = RandomX_CurrentConfig.codeReadDatasetTweaked; + n = RandomX_CurrentConfig.codeReadDatasetTweakedSize; + } + memcpy(code + codePos, p, n); + codePos += n; + generateProgramEpilogue(prog, pcfg); } @@ -396,7 +423,7 @@ namespace randomx { memcpy(code + codePos, codeLoopStore, loopStoreSize); codePos += loopStoreSize; - if (BranchesWithin32B) { + if (check_flag(BRANCHES_WITHIN_32B)) { const uint32_t branch_begin = static_cast(codePos); const uint32_t branch_end = static_cast(branch_begin + 9); @@ -989,6 +1016,8 @@ namespace randomx { codePos = pos; } + static const uint8_t AND_OR_MOV_LDMXCSR_RYZEN[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x3B, 0x44, 0x24, 0xFC, 0x74, 0x09, 0x89, 0x44, 0x24, 0xFC, 0x0F, 0xAE, 0x54, 0x24, 0xFC }; + void JitCompilerX86::h_CFROUND(const Instruction& instr) { uint8_t* const p = code; int pos = codePos; @@ -1000,7 +1029,13 @@ namespace randomx { emit(ROL_RAX, p, pos); emitByte(rotate, p, pos); } - emit(AND_OR_MOV_LDMXCSR, p, pos); + + if (check_flag(AMD_RYZEN_FAMILY)) { + emit(AND_OR_MOV_LDMXCSR_RYZEN, p, pos); + } + else { + emit(AND_OR_MOV_LDMXCSR, p, pos); + } codePos = pos; } @@ -1012,7 +1047,7 @@ namespace randomx { const int reg = instr.dst; int32_t jmp_offset = registerUsage[reg] - (pos + 16); - if (BranchesWithin32B) { + if (check_flag(BRANCHES_WITHIN_32B)) { const uint32_t branch_begin = static_cast(pos + 7); const uint32_t branch_end = static_cast(branch_begin + ((jmp_offset >= -128) ? 9 : 13)); diff --git a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.hpp b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.hpp index f1864018a..b47ff6ec5 100644 --- a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.hpp +++ b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86.hpp @@ -31,6 +31,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "crypto/randomx/common.hpp" namespace randomx { @@ -71,7 +72,23 @@ namespace randomx { uint8_t* code; int32_t codePos; - static bool BranchesWithin32B; + static std::atomic flags_set; + static constexpr uint64_t BRANCHES_WITHIN_32B = 1; + static constexpr uint64_t AMD_RYZEN_FAMILY = 2; + static uint64_t flags; + + static inline bool check_flag(uint64_t f) + { + return (flags & f) != 0; + } + + static inline void set_flag(uint64_t f, bool v) + { + if(v) + flags |= f; + else + flags &= ~f; + } static void applyTweaks(); void generateProgramPrologue(Program&, ProgramConfiguration&); diff --git a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.S b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.S index c20cd7433..50019b7e5 100644 --- a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.S +++ b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.S @@ -45,6 +45,7 @@ .global DECL(randomx_program_loop_load) .global DECL(randomx_program_start) .global DECL(randomx_program_read_dataset) +.global DECL(randomx_program_read_dataset_ryzen) .global DECL(randomx_program_read_dataset_sshash_init) .global DECL(randomx_program_read_dataset_sshash_fin) .global DECL(randomx_program_loop_store) @@ -92,6 +93,7 @@ DECL(randomx_program_prologue_first_load): and eax, RANDOMX_SCRATCHPAD_MASK ror rdx, 32 and edx, RANDOMX_SCRATCHPAD_MASK + stmxcsr dword ptr [rsp-20] jmp DECL(randomx_program_loop_begin) .balign 64 @@ -110,6 +112,9 @@ DECL(randomx_program_start): DECL(randomx_program_read_dataset): #include "asm/program_read_dataset.inc" +DECL(randomx_program_read_dataset_ryzen): + #include "asm/program_read_dataset_ryzen.inc" + DECL(randomx_program_read_dataset_sshash_init): #include "asm/program_read_dataset_sshash_init.inc" diff --git a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.asm b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.asm index 73fa503ad..189c464c5 100644 --- a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.asm +++ b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.asm @@ -36,6 +36,7 @@ PUBLIC randomx_program_loop_begin PUBLIC randomx_program_loop_load PUBLIC randomx_program_start PUBLIC randomx_program_read_dataset +PUBLIC randomx_program_read_dataset_ryzen PUBLIC randomx_program_read_dataset_sshash_init PUBLIC randomx_program_read_dataset_sshash_fin PUBLIC randomx_dataset_init @@ -80,6 +81,7 @@ randomx_program_prologue_first_load PROC and eax, RANDOMX_SCRATCHPAD_MASK ror rdx, 32 and edx, RANDOMX_SCRATCHPAD_MASK + stmxcsr dword ptr [rsp-20] jmp randomx_program_loop_begin randomx_program_prologue_first_load ENDP @@ -103,6 +105,10 @@ randomx_program_read_dataset PROC include asm/program_read_dataset.inc randomx_program_read_dataset ENDP +randomx_program_read_dataset_ryzen PROC + include asm/program_read_dataset_ryzen.inc +randomx_program_read_dataset_ryzen ENDP + randomx_program_read_dataset_sshash_init PROC include asm/program_read_dataset_sshash_init.inc randomx_program_read_dataset_sshash_init ENDP @@ -220,4 +226,4 @@ _RANDOMX_JITX86_STATIC ENDS ENDIF -END \ No newline at end of file +END diff --git a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.hpp b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.hpp index 0a62c986e..b0a7c5acb 100644 --- a/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.hpp +++ b/xmrstak/backend/cpu/crypto/randomx/jit_compiler_x86_static.hpp @@ -37,6 +37,7 @@ extern "C" { void randomx_program_loop_load(); void randomx_program_start(); void randomx_program_read_dataset(); + void randomx_program_read_dataset_ryzen(); void randomx_program_read_dataset_sshash_init(); void randomx_program_read_dataset_sshash_fin(); void randomx_program_loop_store(); diff --git a/xmrstak/backend/cpu/crypto/randomx/randomx.cpp b/xmrstak/backend/cpu/crypto/randomx/randomx.cpp index 2937459c1..1c6b048d2 100644 --- a/xmrstak/backend/cpu/crypto/randomx/randomx.cpp +++ b/xmrstak/backend/cpu/crypto/randomx/randomx.cpp @@ -152,8 +152,15 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase() } { const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset; - const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_init; + const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_ryzen; memcpy(codeReadDatasetTweaked, a, b - a); + codeReadDatasetTweakedSize = b - a; + } + { + const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_ryzen; + const uint8_t* b = (const uint8_t*)&randomx_program_read_dataset_sshash_init; + memcpy(codeReadDatasetRyzenTweaked, a, b - a); + codeReadDatasetRyzenTweakedSize = b - a; } { const uint8_t* a = (const uint8_t*)&randomx_program_read_dataset_sshash_init; diff --git a/xmrstak/backend/cpu/crypto/randomx/randomx.h b/xmrstak/backend/cpu/crypto/randomx/randomx.h index 6fece9a4f..4dbecaef7 100644 --- a/xmrstak/backend/cpu/crypto/randomx/randomx.h +++ b/xmrstak/backend/cpu/crypto/randomx/randomx.h @@ -118,7 +118,10 @@ struct RandomX_ConfigurationBase rx_vec_i128 fillAes4Rx4_Key[8]; uint8_t codeShhPrefetchTweaked[20]; - uint8_t codeReadDatasetTweaked[64]; + uint8_t codeReadDatasetTweaked[72]; + uint32_t codeReadDatasetTweakedSize; + uint8_t codeReadDatasetRyzenTweaked[72]; + uint32_t codeReadDatasetRyzenTweakedSize; uint8_t codeReadDatasetLightSshInitTweaked[68]; uint8_t codePrefetchScratchpadTweaked[32]; diff --git a/xmrstak/backend/cpu/hwlocHelper.cpp b/xmrstak/backend/cpu/hwlocHelper.cpp new file mode 100644 index 000000000..9be5db597 --- /dev/null +++ b/xmrstak/backend/cpu/hwlocHelper.cpp @@ -0,0 +1,177 @@ +#include "xmrstak/misc/console.hpp" +#include "hwlocHelper.hpp" + +#ifndef CONF_NO_HWLOC + +#include + +inline int +xmrstak_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) +{ +#if HWLOC_API_VERSION >= 0x20000 + return hwloc_set_membind( + topology, + nodeset, + policy, + flags| HWLOC_MEMBIND_BYNODESET); +#else + return hwloc_set_membind_nodeset( + topology, + nodeset, + policy, + flags); +#endif +} + +hwloc_obj_t getPU(hwloc_topology_t topology, size_t puId) +{ + hwloc_obj_t result = nullptr; + int pu_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU); + uint32_t chunks = hwloc_get_nbobjs_by_depth(topology, pu_depth); + + for(uint32_t i = 0; i < chunks; i++) + { + hwloc_obj_t pu = hwloc_get_obj_by_depth(topology, pu_depth, i); + if(pu->os_index == puId) + { + result = pu; + break; + } + } + + return result; +} + +/** pin memory to NUMA node and thread to given core + * + * Set the default memory policy for the current thread to bind memory to the + * NUMA node. + * + * @param puId core id + */ +void hwlocBind(size_t puId) +{ + int depth; + hwloc_topology_t topology; + + hwloc_topology_init(&topology); + hwloc_topology_load(topology); + + hwloc_bitmap_t puBitMap = hwloc_bitmap_alloc(); + hwloc_bitmap_set(puBitMap, puId); + if(0 > hwloc_set_cpubind(topology, puBitMap, HWLOC_CPUBIND_THREAD)) + printer::inst()->print_msg(L0, "hwloc: pu bind to %u failed", uint32_t(puId)); + hwloc_bitmap_free(puBitMap); + + if(!hwloc_topology_get_support(topology)->membind->set_thisthread_membind) + { + printer::inst()->print_msg(L0, "hwloc: set_thisthread_membind not supported"); + hwloc_topology_destroy(topology); + return; + } + + depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU); + + hwloc_obj_t puPtr = getPU(topology, puId); + if(puPtr != nullptr) + { + if(0 > xmrstak_set_membind_nodeset( + topology, + puPtr->nodeset, + HWLOC_MEMBIND_BIND, + HWLOC_MEMBIND_THREAD)) + { + printer::inst()->print_msg(L0, "hwloc: can't bind memory"); + } + else + { + printer::inst()->print_msg(L0, "hwloc: memory pinned"); + } + } + + hwloc_topology_destroy(topology); +} + + +size_t numdaId(size_t puId) +{ + size_t result = 0; + + hwloc_topology_t topology; + hwloc_topology_init(&topology); + if(hwloc_topology_load(topology) < 0) + return result; + + hwloc_obj_t puPtr = getPU(topology, puId); + if(puPtr != nullptr) + { + int numa_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE); + uint32_t chunks = hwloc_get_nbobjs_by_depth(topology, numa_depth); + for(uint32_t i = 0; i < chunks; i++) + { + hwloc_obj_t numa = hwloc_get_obj_by_depth(topology, numa_depth, i); + if(hwloc_bitmap_isset(puPtr->nodeset, numa->os_index)) + { + result = i; + printer::inst()->print_msg(LDEBUG,"PU %u is on numa %u", uint32_t(puId), i); + break; + } + } + } + else + { + printer::inst()->print_msg(LDEBUG,"PU %u not found", uint32_t(puId)); + } + + hwloc_topology_destroy(topology); + return result; +} + +std::vector getNumaNodes(hwloc_topology_t topology) +{ + int numa_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE); + uint32_t chunks = hwloc_get_nbobjs_by_depth(topology, numa_depth); + printer::inst()->print_msg(LDEBUG,"%u numa node(s) found", chunks); + std::vector result(chunks); + + for(uint32_t i = 0; i < chunks; i++) + result[i] = hwloc_get_obj_by_depth(topology, numa_depth, i); + + return result; +} + +size_t getNumNumaNodes() +{ + size_t result = 1; + + hwloc_topology_t topology; + hwloc_topology_init(&topology); + if(hwloc_topology_load(topology) < 0) + return result; + + std::vector num_nodes = getNumaNodes(topology); + result = num_nodes.size(); + if(result == 0) + result = 1; + + hwloc_topology_destroy(topology); + return result; +} + +#else + +void hwlocBind(size_t) +{ +} + +size_t getNumNumaNodes() +{ + return 1; +} + +size_t numdaId(size_t puId) +{ + return 0; +} + +#endif diff --git a/xmrstak/backend/cpu/hwlocHelper.hpp b/xmrstak/backend/cpu/hwlocHelper.hpp new file mode 100644 index 000000000..12cd29e65 --- /dev/null +++ b/xmrstak/backend/cpu/hwlocHelper.hpp @@ -0,0 +1,33 @@ +#pragma once + +#include +#include + +#ifndef CONF_NO_HWLOC +# include +#endif + +/** pin memory to NUMA node + * + * Set the default memory policy for the current thread to bind memory to the + * NUMA node. + * + * @param puId core id + */ +void hwlocBind(size_t puId); + +/** get numa node id based on a thread id + * + * @return 0 if no numa node is found, else numa id (zero based) + */ +size_t numdaId(size_t puId); + +/** number of numa nodes + * + * @return if no numa node is found 1 will be returned + */ +size_t getNumNumaNodes(); + +#ifndef CONF_NO_HWLOC + std::vector getNumaNodes(hwloc_topology_t topology); +#endif \ No newline at end of file diff --git a/xmrstak/backend/cpu/hwlocMemory.cpp b/xmrstak/backend/cpu/hwlocMemory.cpp deleted file mode 100644 index 067f27975..000000000 --- a/xmrstak/backend/cpu/hwlocMemory.cpp +++ /dev/null @@ -1,82 +0,0 @@ -#include "xmrstak/backend/cpu/hwlocMemory.hpp" - -#ifndef CONF_NO_HWLOC - -#include "xmrstak/misc/console.hpp" - -#include - -static __hwloc_inline int -xmrstak_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) -{ -#if HWLOC_API_VERSION >= 0x20000 - return hwloc_set_membind( - topology, - nodeset, - policy, - flags| HWLOC_MEMBIND_BYNODESET); -#else - return hwloc_set_membind_nodeset( - topology, - nodeset, - policy, - flags); -#endif -} - -/** pin memory to NUMA node - * - * Set the default memory policy for the current thread to bind memory to the - * NUMA node. - * - * @param puId core id - */ -void bindMemoryToNUMANode(size_t puId) -{ - int depth; - hwloc_topology_t topology; - - hwloc_topology_init(&topology); - hwloc_topology_load(topology); - - if(!hwloc_topology_get_support(topology)->membind->set_thisthread_membind) - { - printer::inst()->print_msg(L0, "hwloc: set_thisthread_membind not supported"); - hwloc_topology_destroy(topology); - return; - } - - depth = hwloc_get_type_depth(topology, HWLOC_OBJ_PU); - - for(uint32_t i = 0; - i < hwloc_get_nbobjs_by_depth(topology, depth); - i++) - { - hwloc_obj_t pu = hwloc_get_obj_by_depth(topology, depth, i); - if(pu->os_index == puId) - { - if(0 > xmrstak_set_membind_nodeset( - topology, - pu->nodeset, - HWLOC_MEMBIND_BIND, - HWLOC_MEMBIND_THREAD)) - { - printer::inst()->print_msg(L0, "hwloc: can't bind memory"); - } - else - { - printer::inst()->print_msg(L0, "hwloc: memory pinned"); - break; - } - } - } - - hwloc_topology_destroy(topology); -} -#else - -void bindMemoryToNUMANode(size_t) -{ -} - -#endif diff --git a/xmrstak/backend/cpu/hwlocMemory.hpp b/xmrstak/backend/cpu/hwlocMemory.hpp deleted file mode 100644 index 42fa3456f..000000000 --- a/xmrstak/backend/cpu/hwlocMemory.hpp +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once - -#include - -/** pin memory to NUMA node - * - * Set the default memory policy for the current thread to bind memory to the - * NUMA node. - * - * @param puId core id - */ -void bindMemoryToNUMANode(size_t puId); diff --git a/xmrstak/backend/cpu/minethd.cpp b/xmrstak/backend/cpu/minethd.cpp index 5773d472d..b1e5a0497 100644 --- a/xmrstak/backend/cpu/minethd.cpp +++ b/xmrstak/backend/cpu/minethd.cpp @@ -35,7 +35,7 @@ #include "xmrstak/jconf.hpp" #include "xmrstak/misc/executor.hpp" -#include "hwlocMemory.hpp" +#include "xmrstak/backend/cpu/hwlocHelper.hpp" #include "xmrstak/backend/miner_work.hpp" #ifndef CONF_NO_HWLOC @@ -147,9 +147,11 @@ minethd::minethd(miner_work& pWork, size_t iNo, int iMultiway, int64_t affinity) order_guard.wait(); +#if defined(CONF_NO_HWLOC) || defined(_WIN32) if(affinity >= 0) //-1 means no affinity if(!thd_setaffinity(oWorkThd.native_handle(), affinity)) printer::inst()->print_msg(L1, "WARNING setting affinity failed."); +#endif } cryptonight_ctx* minethd::minethd_alloc_ctx() @@ -276,7 +278,9 @@ bool minethd::self_test() cryptonight_free_ctx(ctx[j]); return false; } + ctx[i]->numa = 0; } + randomX_global_ctx::inst().init(0); bool bResult = true; @@ -347,7 +351,10 @@ bool minethd::self_test() } for(int i = 0; i < MAX_N; i++) + { cryptonight_free_ctx(ctx[i]); + } + randomX_global_ctx::inst().release(0); return bResult; } @@ -545,8 +552,10 @@ void minethd::prep_multiway_work(uint8_t* bWorkBlob, uint32_t** piNonce) template void minethd::multiway_work_main() { + // keep init phase in some order + std::this_thread::sleep_for(std::chrono::milliseconds(2 * affinity)); if(affinity >= 0) //-1 means no affinity - bindMemoryToNUMANode(affinity); + hwlocBind(affinity); order_fix.set_value(); std::unique_lock lck(thd_aff_set); @@ -573,10 +582,13 @@ void minethd::multiway_work_main() cryptonight_free_ctx(ctx[j]); win_exit(1); } + ctx[i]->numa = affinity < 0 ? 0 : numdaId(affinity); piHashVal[i] = (uint64_t*)(bHashOut + 32 * i + 24); piNonce[i] = (i == 0) ? (uint32_t*)(bWorkBlob + 39) : nullptr; } + randomX_global_ctx::inst().init(ctx[0]->numa); + if(!oWork.bStall) prep_multiway_work(bWorkBlob, piNonce); diff --git a/xmrstak/backend/cryptonight.hpp b/xmrstak/backend/cryptonight.hpp index 0559601aa..02bef121c 100644 --- a/xmrstak/backend/cryptonight.hpp +++ b/xmrstak/backend/cryptonight.hpp @@ -50,30 +50,13 @@ struct xmrstak_algo base_algo(name_id) { } - xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm) : - algo_name(name_id), - base_algo(algorithm) - { - } - xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration) : - algo_name(name_id), - base_algo(algorithm), - iter(iteration) - { - } - xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory) : - algo_name(name_id), - base_algo(algorithm), - iter(iteration), - mem(memory) - { - } - xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, uint32_t iteration, size_t memory, uint32_t mem_mask) : + + xmrstak_algo(xmrstak_algo_id name_id, xmrstak_algo_id algorithm, size_t l3, size_t l2, size_t l1) : algo_name(name_id), base_algo(algorithm), - iter(iteration), - mem(memory), - mask(mem_mask) + m_l3(l3), + m_l2(l2), + m_l1(l1) { } @@ -83,7 +66,7 @@ struct xmrstak_algo */ bool operator==(const xmrstak_algo& other) const { - return other.Id() == Id() && other.Mem() == Mem() && other.Iter() == Iter() && other.Mask() == Mask(); + return other.Id() == Id() && other.L3() == L3() && other.L2() == L2() && other.L1() == L1(); } bool operator==(const xmrstak_algo_id& id) const @@ -106,17 +89,19 @@ struct xmrstak_algo return algo_name; } - size_t Mem() const + size_t L3() const { - if(base_algo == invalid_algo) - return 0; - else - return mem; + return m_l3; } - uint32_t Iter() const + size_t L2() const { - return iter; + return m_l2; + } + + size_t L1() const + { + return m_l1; } /** Name of the algorithm @@ -137,36 +122,30 @@ struct xmrstak_algo return get_algo_name(base_algo); } - uint32_t Mask() const - { - // default is a 16 byte aligne mask - if(mask == 0) - return ((mem - 1u) / 16) * 16; - else - return mask; - } + xmrstak_algo_id algo_name = invalid_algo; xmrstak_algo_id base_algo = invalid_algo; - uint32_t iter = 0u; - size_t mem = 0u; - uint32_t mask = 0u; + size_t m_l3 = 1u; // avoid diffision by zero + size_t m_l2 = 1u; + size_t m_l1 = 1u; }; // default cryptonight -constexpr size_t CN_MEMORY = 2 * 1024 * 1024; -constexpr uint32_t CN_ITER = 0x80000; -constexpr uint32_t CN_MASK = ((CN_MEMORY - 1) / 16) * 16; +constexpr size_t _2MiB = 2 * 1024 * 1024; +constexpr size_t _256KiB = 256 * 1024; +constexpr size_t _16KiB = 16 * 1024; constexpr uint32_t RX_ARQMA_ITER = 0x10000; inline xmrstak_algo POW(xmrstak_algo_id algo_id) { - static std::array pow = {{{invalid_algo, invalid_algo}, - {randomX, randomX, CN_ITER, CN_MEMORY}, - {randomX_loki, randomX_loki, CN_ITER, CN_MEMORY}, - {randomX_wow, randomX_wow, CN_ITER, CN_MEMORY/2}, - {randomX_arqma, randomX_arqma, RX_ARQMA_ITER, CN_MEMORY/8} + static std::array pow = {{ + {invalid_algo}, + {randomX, randomX, _2MiB, _256KiB, _16KiB}, + {randomX_loki, randomX_loki, _2MiB, _256KiB, _16KiB}, + {randomX_wow, randomX_wow, _2MiB/2, _256KiB/2, _16KiB}, + {randomX_arqma, randomX_arqma, _2MiB/8, _256KiB/2, _16KiB} }}; return pow[algo_id]; diff --git a/xmrstak/backend/nvidia/RandomX/randomx.cu b/xmrstak/backend/nvidia/RandomX/randomx.cu index 37a1f5fe9..601314529 100644 --- a/xmrstak/backend/nvidia/RandomX/randomx.cu +++ b/xmrstak/backend/nvidia/RandomX/randomx.cu @@ -33,7 +33,7 @@ void randomx_prepare(nvid_ctx *ctx, const uint8_t* seed_hash, const xmrstak_algo CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_rx_dataset, dataset_size)); } if (!ctx->d_long_state) { - ctx->d_scratchpads_size = batch_size * (miner_algo.Mem() + 64llu); + ctx->d_scratchpads_size = batch_size * (miner_algo.L3() + 64llu); CUDA_CHECK(ctx->device_id, cudaMalloc(&ctx->d_long_state, ctx->d_scratchpads_size)); } if (!ctx->d_rx_hashes) { @@ -52,6 +52,6 @@ void randomx_prepare(nvid_ctx *ctx, const uint8_t* seed_hash, const xmrstak_algo ///we do not allow switching between different randomx algorithms if((memcmp(ctx->rx_dataset_seedhash, seed_hash, sizeof(ctx->rx_dataset_seedhash)) != 0)) { memcpy(ctx->rx_dataset_seedhash, seed_hash, sizeof(ctx->rx_dataset_seedhash)); - CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_rx_dataset, getRandomXDataset(), dataset_size, cudaMemcpyHostToDevice)); + CUDA_CHECK(ctx->device_id, cudaMemcpy(ctx->d_rx_dataset, getRandomXDataset(0), dataset_size, cudaMemcpyHostToDevice)); } } diff --git a/xmrstak/backend/nvidia/minethd.cpp b/xmrstak/backend/nvidia/minethd.cpp index f585c9933..f4a9844c8 100644 --- a/xmrstak/backend/nvidia/minethd.cpp +++ b/xmrstak/backend/nvidia/minethd.cpp @@ -25,7 +25,7 @@ #include "autoAdjust.hpp" #include "xmrstak/backend/cpu/crypto/cryptonight.h" #include "xmrstak/backend/cpu/crypto/cryptonight_aesni.h" -#include "xmrstak/backend/cpu/hwlocMemory.hpp" +#include "xmrstak/backend/cpu/hwlocHelper.hpp" #include "xmrstak/backend/cpu/minethd.hpp" #include "xmrstak/backend/cryptonight.hpp" #include "xmrstak/jconf.hpp" @@ -96,9 +96,11 @@ minethd::minethd(miner_work& pWork, size_t iNo, const jconf::thd_cfg& cfg) void minethd::start_mining() { thread_work_promise.set_value(); +#if defined(CONF_NO_HWLOC) || defined(_WIN32) if(this->affinity >= 0) //-1 means no affinity if(!cpu::minethd::thd_setaffinity(oWorkThd.native_handle(), affinity)) printer::inst()->print_msg(L1, "WARNING setting affinity failed."); +#endif } bool minethd::self_test() @@ -181,7 +183,7 @@ std::vector* minethd::thread_starter(uint32_t threadOffset, miner_wor void minethd::work_main() { if(affinity >= 0) //-1 means no affinity - bindMemoryToNUMANode(affinity); + hwlocBind(affinity); if(cuda_get_deviceinfo(&ctx) != 0 || cryptonight_extra_cpu_init(&ctx) != 1) { @@ -198,6 +200,8 @@ void minethd::work_main() cryptonight_ctx* cpu_ctx; cpu_ctx = cpu::minethd::minethd_alloc_ctx(); + cpu_ctx->numa = affinity < 0 ? 0 : numdaId(affinity); + randomX_global_ctx::inst().init(cpu_ctx->numa); // start with root algorithm and switch later if fork version is reached auto miner_algo = ::jconf::inst()->GetCurrentCoinSelection().GetDescription().GetMiningAlgoRoot(); @@ -272,7 +276,7 @@ void minethd::work_main() if(ctx.d_scratchpads_size) { - const uint32_t num_scratchpads = ctx.d_scratchpads_size / miner_algo.Mem(); + const uint32_t num_scratchpads = ctx.d_scratchpads_size / miner_algo.L3(); if (h_per_round > num_scratchpads) { h_per_round = num_scratchpads; diff --git a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu index b38b39676..b8b10339d 100644 --- a/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu +++ b/xmrstak/backend/nvidia/nvcc_code/cuda_extra.cu @@ -74,7 +74,7 @@ extern "C" int cryptonight_extra_cpu_init(nvid_ctx* ctx) size_t hashMemSize = 0; for(const auto algo : neededAlgorithms) { - hashMemSize = std::max(hashMemSize, algo.Mem()); + hashMemSize = std::max(hashMemSize, algo.L3()); } size_t wsize = ctx->device_blocks * ctx->device_threads; @@ -319,7 +319,7 @@ extern "C" int cuda_get_deviceinfo(nvid_ctx* ctx) size_t hashMemSize = 0; for(const auto algo : neededAlgorithms) { - hashMemSize = std::max(hashMemSize, algo.Mem()); + hashMemSize = std::max(hashMemSize, algo.L3()); } const size_t dataset_size = getRandomXDatasetSize(); diff --git a/xmrstak/misc/win_msr.cpp b/xmrstak/misc/win_msr.cpp new file mode 100644 index 000000000..6a6dd4239 --- /dev/null +++ b/xmrstak/misc/win_msr.cpp @@ -0,0 +1,196 @@ +/* XMRig + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * Copyright 2014 Lucas Jones + * Copyright 2014-2016 Wolf9466 + * Copyright 2016 Jay D Dee + * Copyright 2018 Lee Clagett + * Copyright 2018-2019 tevador + * Copyright 2018-2019 SChernykh + * Copyright 2000 Transmeta Corporation + * Copyright 2004-2008 H. Peter Anvin + * Copyright 2016-2019 XMRig , + * Copyright 2017-2019 XMR-Stak , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifdef _WIN32 + +#include "xmrstak/misc/console.hpp" +#include "win_msr.hpp" + +#include +#include +#include + +#define SERVICE_NAME L"WinRing0_1_2_0" + +static SC_HANDLE hManager; +static SC_HANDLE hService; + +static bool uninstall_driver() +{ + bool result = true; + DWORD err; + SERVICE_STATUS serviceStatus; + if (!ControlService(hService, SERVICE_CONTROL_STOP, &serviceStatus)) { + err = GetLastError(); + printer::inst()->print_msg(L0, "Failed to stop WinRing0 driver, error %u", err); + result = false; + } + if (!DeleteService(hService)) { + err = GetLastError(); + printer::inst()->print_msg(L0, "Failed to remove WinRing0 driver, error %u", err); + result = false; + } + return result; +} + +static HANDLE install_driver() +{ + DWORD err = 0; + + hManager = OpenSCManager(nullptr, nullptr, SC_MANAGER_ALL_ACCESS); + if (!hManager) { + err = GetLastError(); + printer::inst()->print_msg(L0, "Failed to open service control manager, error %u", err); + return 0; + } + + std::vector dir; + dir.resize(MAX_PATH); + do { + dir.resize(dir.size() * 2); + DWORD len = GetModuleFileNameW(NULL, dir.data(), dir.size()); + err = GetLastError(); + } while (err == ERROR_INSUFFICIENT_BUFFER); + + if (err != ERROR_SUCCESS) { + printer::inst()->print_msg(L0, "Failed to get path to driver, error %u", err); + return 0; + } + + for (auto it = dir.end(); it != dir.begin(); --it) { + if ((*it == L'\\') || (*it == L'/')) { + ++it; + *it = L'\0'; + break; + } + } + + std::wstring driverPath = dir.data(); + driverPath += L"WinRing0x64.sys"; + + hService = OpenServiceW(hManager, SERVICE_NAME, SERVICE_ALL_ACCESS); + if (hService) { + if (!uninstall_driver()) { + return 0; + } + CloseServiceHandle(hService); + hService = 0; + } + else { + err = GetLastError(); + if (err != ERROR_SERVICE_DOES_NOT_EXIST) { + printer::inst()->print_msg(L0, "Failed to open WinRing0 driver, error %u", err); + return 0; + } + } + + hService = CreateServiceW(hManager, SERVICE_NAME, SERVICE_NAME, SERVICE_ALL_ACCESS, SERVICE_KERNEL_DRIVER, SERVICE_DEMAND_START, SERVICE_ERROR_NORMAL, driverPath.c_str(), nullptr, nullptr, nullptr, nullptr, nullptr); + if (!hService) { + printer::inst()->print_msg(L0, "Failed to install WinRing0 driver, error %u", err); + } + + if (!StartService(hService, 0, nullptr)) { + err = GetLastError(); + if (err != ERROR_SERVICE_ALREADY_RUNNING) { + printer::inst()->print_msg(L0, "Failed to start WinRing0 driver, error %u", err); + return 0; + } + } + + HANDLE hDriver = CreateFileW(L"\\\\.\\" SERVICE_NAME, GENERIC_READ | GENERIC_WRITE, 0, nullptr, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, nullptr); + if (!hDriver) { + err = GetLastError(); + printer::inst()->print_msg(L0, "Failed to connect to WinRing0 driver, error %u", err); + return 0; + } + + return hDriver; +} + + +#define IOCTL_WRITE_MSR CTL_CODE(40000, 0x822, METHOD_BUFFERED, FILE_ANY_ACCESS) + +static bool wrmsr(HANDLE hDriver, uint32_t reg, uint64_t value) { + struct { + uint32_t reg; + uint32_t value[2]; + } input; + static_assert(sizeof(input) == 12, "Invalid struct size for WinRing0 driver"); + + input.reg = reg; + *((uint64_t*)input.value) = value; + + DWORD output; + DWORD k; + if (!DeviceIoControl(hDriver, IOCTL_WRITE_MSR, &input, sizeof(input), &output, sizeof(output), &k, nullptr)) { + const DWORD err = GetLastError(); + printer::inst()->print_msg(L0, "Setting MSR %x to %llx failed.", reg, int_port(value)); + return false; + } + + return true; +} + +void load_win_msrs(const std::vector& regs) +{ + printer::inst()->print_msg(L0, "MSR mod: loading WinRing0 driver"); + + HANDLE hDriver = install_driver(); + if (!hDriver) { + if (hService) { + uninstall_driver(); + CloseServiceHandle(hService); + } + if (hManager) { + CloseServiceHandle(hManager); + } + return; + } + + printer::inst()->print_msg(L0, "MSR mod: setting MSR register values"); + + std::thread wrmsr_thread([hDriver, ®s]() { + for (uint64_t i = 0, n = std::thread::hardware_concurrency(); i < n; ++i) { + SetThreadAffinityMask(GetCurrentThread(), 1ULL << i); + for(const msr_reg& r : regs) + wrmsr(hDriver, r.addr, r.val); + } + }); + wrmsr_thread.join(); + + CloseHandle(hDriver); + + uninstall_driver(); + + CloseServiceHandle(hService); + CloseServiceHandle(hManager); + + printer::inst()->print_msg(L0, "MSR mod: all done, WinRing0 driver unloaded"); +} + +#endif diff --git a/xmrstak/misc/win_msr.hpp b/xmrstak/misc/win_msr.hpp new file mode 100644 index 000000000..520b7a454 --- /dev/null +++ b/xmrstak/misc/win_msr.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +struct msr_reg +{ + uint32_t addr; + uint64_t val; +}; + +#ifdef _WIN32 +void load_win_msrs(const std::vector& regs); +#else +void load_win_msrs(const std::vector& regs) {} +#endif + diff --git a/xmrstak/net/jpsock.cpp b/xmrstak/net/jpsock.cpp index 9ac49d889..e7267e855 100644 --- a/xmrstak/net/jpsock.cpp +++ b/xmrstak/net/jpsock.cpp @@ -685,9 +685,7 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes /*Extensions*/ char sAlgo[64] = {0}; char sBaseAlgo[64] = {0}; - char sIterations[32] = {0}; char sMemory[32] = {0}; - char sMemAlignBytes[32] = {0}; char sBackend[64] = {0}; char sHashcount[128] = {0}; @@ -702,9 +700,7 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes snprintf(sAlgo, sizeof(sAlgo), ",\"algo\":\"%s\"", algo.Name().c_str()); // the real algorithm with three degrees of freedom snprintf(sBaseAlgo, sizeof(sBaseAlgo), ",\"base_algo\":\"%s\"", algo.BaseName().c_str()); - snprintf(sIterations, sizeof(sIterations), ",\"iterations\":\"0x%08x\"", algo.Iter()); - snprintf(sMemory, sizeof(sMemory), ",\"scratchpad\":\"0x%08x\"", (uint32_t)algo.Mem()); - snprintf(sMemAlignBytes, sizeof(sMemAlignBytes), ",\"mask\":\"0x%08x\"", algo.Mask()); + snprintf(sMemory, sizeof(sMemory), ",\"scratchpad\":\"0x%08x\"", (uint32_t)algo.L3()); } bin2hex((unsigned char*)&iNonce, 4, sNonce); @@ -713,8 +709,8 @@ bool jpsock::cmd_submit(const char* sJobId, uint32_t iNonce, const uint8_t* bRes bin2hex(bResult, 32, sResult); sResult[64] = '\0'; - snprintf(cmd_buffer, sizeof(cmd_buffer), "{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"%s%s%s%s%s%s%s},\"id\":1}\n", - sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo, sBaseAlgo, sIterations, sMemory, sMemAlignBytes); + snprintf(cmd_buffer, sizeof(cmd_buffer), "{\"method\":\"submit\",\"params\":{\"id\":\"%s\",\"job_id\":\"%s\",\"nonce\":\"%s\",\"result\":\"%s\"%s%s%s%s%s},\"id\":1}\n", + sMinerId, sJobId, sNonce, sResult, sBackend, sHashcount, sAlgo, sBaseAlgo, sMemory); uint64_t messageId = 0; opq_json_val oResult(nullptr); diff --git a/xmrstak/version.cpp b/xmrstak/version.cpp index 761ebed58..dec78d1b3 100644 --- a/xmrstak/version.cpp +++ b/xmrstak/version.cpp @@ -20,7 +20,7 @@ #endif #define XMR_STAK_NAME "xmr-stak-rx" -#define XMR_STAK_VERSION "1.0.3-rx" +#define XMR_STAK_VERSION "1.0.4-rx" #if defined(_WIN32) #define OS_TYPE "win"